Browse Source

Optimize language parser loading by only loading once for all files

Saoud Rizwan 1 year ago
parent
commit
c40fae4cfb
2 changed files with 157 additions and 99 deletions
  1. 12 99
      src/AnalyzeProject/index.ts
  2. 145 0
      src/AnalyzeProject/languageParser.ts

+ 12 - 99
src/AnalyzeProject/index.ts

@@ -1,22 +1,7 @@
-import * as path from "path"
-import { globby } from "globby"
 import * as fs from "fs/promises"
-import Parser from "web-tree-sitter"
-
-import {
-	javascriptQuery,
-	typescriptQuery,
-	pythonQuery,
-	rustQuery,
-	goQuery,
-	cppQuery,
-	cQuery,
-	csharpQuery,
-	rubyQuery,
-	javaQuery,
-	phpQuery,
-	swiftQuery,
-} from "./tree-sitter-queries/tags"
+import { globby } from "globby"
+import * as path from "path"
+import { LanguageParser, loadAllLanguages } from "./languageParser"
 
 async function analyzeProject(dirPath: string): Promise<string> {
 	let result = ""
@@ -27,11 +12,14 @@ async function analyzeProject(dirPath: string): Promise<string> {
 	// Separate files to parse and remaining files
 	const { filesToParse, remainingFiles } = separateFiles(allFiles)
 
+	// Load only the necessary language parsers
+	const languageParsers = await loadAllLanguages(filesToParse)
+
 	// Parse specific files and generate result
 	result += "Files parsed with ASTs:\n"
 	for (const file of filesToParse) {
 		result += `File: ${file}\n`
-		const ast = await parseFile(file)
+		const ast = await parseFile(file, languageParsers)
 		result += `AST: ${JSON.stringify(ast, null, 2)}\n\n`
 	}
 
@@ -121,84 +109,13 @@ This approach allows us to focus on the most relevant parts of the code (defined
 - https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/helper.js
 - https://tree-sitter.github.io/tree-sitter/code-navigation-systems
 */
-async function parseFile(filePath: string): Promise<string> {
+async function parseFile(filePath: string, languageParsers: LanguageParser): Promise<string> {
 	const fileContent = await fs.readFile(filePath, "utf8")
 	const ext = path.extname(filePath).toLowerCase().slice(1)
-	await Parser.init()
-	const parser = new Parser()
-	let query: Parser.Query
-
-	switch (ext) {
-		case "js":
-		case "jsx":
-			const JavaScript = await loadLanguage("javascript")
-			parser.setLanguage(JavaScript)
-			query = JavaScript.query(javascriptQuery)
-			break
-		case "ts":
-			const TypeScript = await loadLanguage("typescript")
-			parser.setLanguage(TypeScript)
-			query = TypeScript.query(typescriptQuery)
-			break
-		case "tsx":
-			const Tsx = await loadLanguage("tsx")
-			parser.setLanguage(Tsx)
-			query = Tsx.query(typescriptQuery)
-			break
-		case "py":
-			const Python = await loadLanguage("python")
-			parser.setLanguage(Python)
-			query = Python.query(pythonQuery)
-			break
-		case "rs":
-			const Rust = await loadLanguage("rust")
-			parser.setLanguage(Rust)
-			query = Rust.query(rustQuery)
-			break
-		case "go":
-			const Go = await loadLanguage("go")
-			parser.setLanguage(Go)
-			query = Go.query(goQuery)
-			break
-		case "cpp":
-		case "hpp":
-			const Cpp = await loadLanguage("cpp")
-			parser.setLanguage(Cpp)
-			query = Cpp.query(cppQuery)
-			break
-		case "c":
-		case "h":
-			const C = await loadLanguage("c")
-			parser.setLanguage(C)
-			query = C.query(cQuery)
-			break
-		case "cs":
-			const CSharp = await loadLanguage("c_sharp")
-			parser.setLanguage(CSharp)
-			query = CSharp.query(csharpQuery)
-			break
-		case "rb":
-			const Ruby = await loadLanguage("ruby")
-			parser.setLanguage(Ruby)
-			query = Ruby.query(rubyQuery)
-			break
-		case "java":
-			const Java = await loadLanguage("java")
-			parser.setLanguage(Java)
-			query = Java.query(javaQuery)
-			break
-		case "php":
-			const PHP = await loadLanguage("php")
-			parser.setLanguage(PHP)
-			query = PHP.query(phpQuery)
-			break
-		case "swift":
-			const Swift = await loadLanguage("swift")
-			parser.setLanguage(Swift)
-			query = Swift.query(swiftQuery)
-			break
-		default:
-			return `Unsupported file type: ${filePath}`
+
+	const { parser, query } = languageParsers[ext] || {}
+	if (!parser || !query) {
+		return `Unsupported file type: ${filePath}`
 	}
 
 	let formattedOutput = `${filePath}:\n|----\n`
@@ -247,8 +164,4 @@ async function parseFile(filePath: string): Promise<string> {
 	return formattedOutput
 }
 
-async function loadLanguage(langName: string) {
-	return await Parser.Language.load(path.join(__dirname, `tree-sitter-${langName}.wasm`))
-}
-
 export { analyzeProject }

+ 145 - 0
src/AnalyzeProject/languageParser.ts

@@ -0,0 +1,145 @@
+import * as path from "path"
+import Parser from "web-tree-sitter"
+import {
+	javascriptQuery,
+	typescriptQuery,
+	pythonQuery,
+	rustQuery,
+	goQuery,
+	cppQuery,
+	cQuery,
+	csharpQuery,
+	rubyQuery,
+	javaQuery,
+	phpQuery,
+	swiftQuery,
+} from "./tree-sitter-queries/tags"
+
+/**
+ * Lookup table of ready-to-use tree-sitter objects, keyed by lower-cased file
+ * extension without the leading dot (for example "ts" or "py").
+ */
+export interface LanguageParser {
+	[extension: string]: {
+		/** Parser instance whose language has already been set. */
+		parser: Parser
+		/** Query compiled against the same language as `parser`. */
+		query: Parser.Query
+	}
+}
+
+/**
+ * Loads the tree-sitter grammar named `langName` from the file
+ * `tree-sitter-<langName>.wasm` located in `__dirname`.
+ */
+async function loadLanguage(langName: string) {
+	const wasmFile = `tree-sitter-${langName}.wasm`
+	const wasmPath = path.join(__dirname, wasmFile)
+	const language = await Parser.Language.load(wasmPath)
+	return language
+}
+
+/*
+Using node bindings for tree-sitter is problematic in vscode extensions 
+because of incompatibility with electron. Going the .wasm route has the 
+advantage of not having to build for multiple architectures.
+
+We use web-tree-sitter together with tree-sitter-wasms, which provides auto-updating prebuilt WASM binaries for tree-sitter's language parsers.
+
+This function loads WASM modules for relevant language parsers based on input files:
+1. Extracts unique file extensions
+2. Maps extensions to language names
+3. Loads corresponding WASM files (containing grammar rules)
+4. Uses WASM modules to initialize tree-sitter parsers
+
+This approach optimizes performance by loading only necessary parsers once for all relevant files.
+
+Sources:
+- https://github.com/tree-sitter/node-tree-sitter/issues/169
+- https://github.com/tree-sitter/node-tree-sitter/issues/168
+- https://github.com/Gregoor/tree-sitter-wasms/blob/main/README.md
+*/
+/**
+ * Builds a parser and a compiled tags query for every supported file
+ * extension that occurs in `filesToParse`.
+ *
+ * Extensions are compared lower-cased and without the leading dot. Extensions
+ * that are not in the table below are skipped, so the returned object simply
+ * has no entry for them. Each grammar is loaded at most once, even when
+ * several extensions (e.g. "js" and "jsx") map to it.
+ *
+ * @param filesToParse - File paths whose extensions decide which grammars get loaded.
+ * @returns Object mapping each supported extension that was encountered to its `{ parser, query }` pair.
+ * @throws Propagates any error raised while initialising web-tree-sitter,
+ * loading a grammar, or compiling a query.
+ */
+export async function loadAllLanguages(filesToParse: string[]): Promise<LanguageParser> {
+	await Parser.init()
+
+	// Single source of truth: extension -> grammar name (as used in the .wasm
+	// file name) and the tags query to compile for it. A Map is used so that
+	// only these exact keys match; a plain object checked with `in` would also
+	// match inherited names such as "constructor".
+	const grammarByExtension = new Map<string, { langName: string; tagsQuery: string }>([
+		["js", { langName: "javascript", tagsQuery: javascriptQuery }],
+		["jsx", { langName: "javascript", tagsQuery: javascriptQuery }],
+		["ts", { langName: "typescript", tagsQuery: typescriptQuery }],
+		["tsx", { langName: "tsx", tagsQuery: typescriptQuery }],
+		["py", { langName: "python", tagsQuery: pythonQuery }],
+		["rs", { langName: "rust", tagsQuery: rustQuery }],
+		["go", { langName: "go", tagsQuery: goQuery }],
+		["cpp", { langName: "cpp", tagsQuery: cppQuery }],
+		["hpp", { langName: "cpp", tagsQuery: cppQuery }],
+		["c", { langName: "c", tagsQuery: cQuery }],
+		["h", { langName: "c", tagsQuery: cQuery }],
+		["cs", { langName: "c_sharp", tagsQuery: csharpQuery }],
+		["rb", { langName: "ruby", tagsQuery: rubyQuery }],
+		["java", { langName: "java", tagsQuery: javaQuery }],
+		["php", { langName: "php", tagsQuery: phpQuery }],
+		["swift", { langName: "swift", tagsQuery: swiftQuery }],
+	])
+
+	const extensionsToLoad = new Set(filesToParse.map((file) => path.extname(file).toLowerCase().slice(1)))
+
+	// Grammars already loaded during this call, keyed by grammar name.
+	const loadedLanguages = new Map<string, Parser.Language>()
+	const parsers: LanguageParser = {}
+
+	for (const ext of extensionsToLoad) {
+		const grammar = grammarByExtension.get(ext)
+		if (!grammar) {
+			// Unsupported (or missing) extension: leave it out of the result.
+			continue
+		}
+
+		let lang = loadedLanguages.get(grammar.langName)
+		if (!lang) {
+			lang = await loadLanguage(grammar.langName)
+			loadedLanguages.set(grammar.langName, lang)
+		}
+
+		const parser = new Parser()
+		parser.setLanguage(lang)
+		parsers[ext] = { parser, query: lang.query(grammar.tagsQuery) }
+	}
+
+	return parsers
+}