index.ts 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. import * as fs from "fs/promises"
  2. import { globby } from "globby"
  3. import * as path from "path"
  4. import { LanguageParser, loadRequiredLanguageParsers } from "./languageParser"
  5. // TODO: implement caching behavior to avoid having to keep analyzing project for new tasks.
  /**
   * Builds a plain-text outline of the source-code definitions found in the
   * entries directly inside `dirPath` (the listing is not recursive).
   *
   * The listing is narrowed to parseable source files by `separateFiles`, the
   * parsers needed for those files are loaded once, and every file is then
   * parsed one at a time in listing order. Each file that yields output
   * contributes its path relative to `dirPath` followed by its outline.
   *
   * Entries that are not selected for parsing, and files that yield no output,
   * are not mentioned in the result; an "# Unparsed Files" section that used to
   * list them is currently disabled.
   *
   * @param dirPath Directory whose top-level entries are analysed.
   * @returns The concatenated outlines, or "No source code definitions found."
   *          when nothing was produced.
   */
  export async function parseSourceCodeForDefinitionsTopLevel(dirPath: string): Promise<string> {
    // Non-recursive listing of the directory.
    const topLevelEntries = await listFiles(dirPath, false)

    // Only the parseable subset is needed here.
    const parseable = separateFiles(topLevelEntries).filesToParse
    const parsers = await loadRequiredLanguageParsers(parseable)

    const sections: string[] = []
    for (const sourceFile of parseable) {
      const outline = await parseFile(sourceFile, parsers)
      if (!outline) {
        continue
      }
      sections.push(`${path.relative(dirPath, sourceFile)}\n${outline}\n`)
    }

    const combined = sections.join("")
    return combined || "No source code definitions found."
  }
  /**
   * Lists the entries under `dirPath` using globby and returns absolute paths.
   *
   * - If `dirPath` resolves to the filesystem root, nothing is scanned and the
   *   root itself is returned as the only entry.
   * - With `recursive` set, nested files are globbed ("**"), .gitignore rules
   *   are honoured, a default set of dependency/build/hidden directories is
   *   ignored, and only files are returned.
   * - Without `recursive`, only the directory's immediate children are globbed
   *   ("*"), no ignore rules are applied, and directories are returned too
   *   (marked with a trailing "/").
   *
   * @param dirPath   Directory to list; also used as the glob working directory.
   * @param recursive Whether to descend into nested directories.
   * @returns The matched paths as reported by globby.
   */
  export async function listFiles(dirPath: string, recursive: boolean): Promise<string[]> {
    const resolvedDir = path.resolve(dirPath)
    const fsRoot = process.platform === "win32" ? path.parse(resolvedDir).root : "/"
    if (resolvedDir === fsRoot) {
      // Never glob the whole filesystem.
      return [fsRoot]
    }

    const ignoredDirNames = [
      "node_modules",
      "__pycache__",
      "env",
      "venv",
      "target/dependency",
      "build/dependencies",
      "dist",
      "out",
      "bundle",
      "vendor",
      "tmp",
      "temp",
      "deps",
      "pkg",
      "Pods",
      // '!**/.*' excludes hidden directories, while '!**/.*/**' excludes only their
      // contents. This way we are at least aware of the existence of hidden directories.
      ".*",
    ]
    const ignoreGlobs = ignoredDirNames.map((name) => `**/${name}/**`)

    // "*" globs all files in one dir, "**" globs files in nested directories.
    const pattern = recursive ? "**" : "*"
    const globOptions = {
      cwd: dirPath,
      dot: true, // do not ignore hidden files/directories
      absolute: true,
      markDirectories: true, // append a / on any directories matched
      gitignore: recursive, // globby ignores any files that are gitignored
      ignore: recursive ? ignoreGlobs : undefined, // sensible defaults in case there is no gitignore
      onlyFiles: recursive, // true by default, false means directories are listed on their own too
    }

    const matches = await globby(pattern, globOptions)
    return matches
  }
  /**
   * Splits a directory listing into the source files to parse and everything else.
   *
   * An entry is selected for parsing when its extension (compared
   * case-sensitively) is one of the supported source extensions and it is not a
   * directory. At most 50 entries are selected, in listing order; every other
   * entry ends up in `remainingFiles`.
   *
   * Directory entries are recognised by a trailing path separator, which is how
   * `listFiles` marks them. They must be excluded explicitly because
   * `path.extname` ignores trailing separators: a directory such as "three.js/"
   * would otherwise be treated as a ".js" file and fail when read.
   *
   * @param allFiles Paths to classify.
   * @returns `filesToParse` (max 50 source files) and `remainingFiles` (the rest).
   */
  function separateFiles(allFiles: string[]): { filesToParse: string[]; remainingFiles: string[] } {
    const maxFilesToParse = 50
    const extensions = new Set(
      [
        "js",
        "jsx",
        "ts",
        "tsx",
        "py",
        // Rust
        "rs",
        "go",
        // C
        "c",
        "h",
        // C++
        "cpp",
        "hpp",
        // C#
        "cs",
        // Ruby
        "rb",
        "java",
        "php",
        "swift",
      ].map((e) => `.${e}`),
    )

    // Directories are listed with a trailing separator; they are never parseable.
    const isDirectoryEntry = (entry: string): boolean => /[\\/]$/.test(entry)

    const filesToParse = allFiles
      .filter((file) => !isDirectoryEntry(file) && extensions.has(path.extname(file)))
      .slice(0, maxFilesToParse)

    const selected = new Set(filesToParse)
    const remainingFiles = allFiles.filter((file) => !selected.has(file))

    return { filesToParse, remainingFiles }
  }
  104. /*
  105. Parsing files using tree-sitter
  106. 1. Parse the file content into an AST (Abstract Syntax Tree) using the appropriate language grammar (set of rules that define how the components of a language like keywords, expressions, and statements can be combined to create valid programs).
  107. 2. Create a query using a language-specific query string, and run it against the AST's root node to capture specific syntax elements.
  108. - We use tag queries to identify named entities in a program, and then use a syntax capture to label the entity and its name. A notable example of this is GitHub's search-based code navigation.
  109. - Our custom tag queries are based on tree-sitter's default tag queries, but modified to only capture definitions.
  110. 3. Sort the captures by their position in the file, output the name of each definition, and format the result, e.g. by adding "|----\n" for gaps between captured sections.
  111. This approach allows us to focus on the most relevant parts of the code (defined by our language-specific queries) and provides a concise yet informative view of the file's structure and key elements.
  112. - https://github.com/tree-sitter/node-tree-sitter/blob/master/test/query_test.js
  113. - https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/query-test.js
  114. - https://github.com/tree-sitter/tree-sitter/blob/master/lib/binding_web/test/helper.js
  115. - https://tree-sitter.github.io/tree-sitter/code-navigation-systems
  116. */
  /**
   * Produces a condensed outline of one source file from its query captures.
   *
   * The file is read as UTF-8, parsed with the parser registered for its
   * (lower-cased) extension, and the matching query is run against the tree's
   * root node. Captures are ordered by start row; for every capture whose name
   * contains "name", the first source line of the captured node is emitted
   * prefixed with "│". A "|----" separator line is inserted whenever a capture
   * starts more than one row after the previous capture ended, and the whole
   * outline is wrapped in "|----" lines.
   *
   * Errors raised while parsing or querying are logged and do not propagate;
   * whatever output was built before the error is still returned. Errors from
   * reading the file are not caught here.
   *
   * @param filePath        Path of the file to outline.
   * @param languageParsers Parser/query pairs keyed by lower-case extension without the dot.
   * @returns The outline, an "Unsupported file type: …" message when no
   *          parser/query is registered for the extension, or `undefined`
   *          when nothing was emitted.
   */
  async function parseFile(filePath: string, languageParsers: LanguageParser): Promise<string | undefined> {
    const fileContent = await fs.readFile(filePath, "utf8")
    const ext = path.extname(filePath).toLowerCase().slice(1)

    const { parser, query } = languageParsers[ext] || {}
    if (!parser || !query) {
      return `Unsupported file type: ${filePath}`
    }

    let formattedOutput = ""

    try {
      // Parse the file content into an Abstract Syntax Tree (AST), a tree-like representation of the code.
      const tree = parser.parse(fileContent)

      // Captures are the AST nodes that match our query patterns.
      const captures = query.captures(tree.rootNode)

      // Order captures by the row on which they start.
      captures.sort((a, b) => a.node.startPosition.row - b.node.startPosition.row)

      // Split into lines, accepting both LF and CRLF endings so that no stray "\r"
      // leaks into the output. Rows are still delimited by "\n", so indices are unchanged.
      const lines = fileContent.split(/\r?\n/)

      // End row of the most recently processed capture (-1 before the first one).
      let lastLine = -1

      for (const { node, name } of captures) {
        const startLine = node.startPosition.row
        const endLine = node.endPosition.row

        // Add a separator if there is a gap between this capture and the previous one.
        if (lastLine !== -1 && startLine > lastLine + 1) {
          formattedOutput += "|----\n"
        }

        // Only the first line of a definition is emitted. Query captures include both the
        // definition name and its implementation, but we only want the name. The check is a
        // loose `includes("name")` because of discrepancies in capture naming between
        // languages (e.g. JavaScript uses 'name' where TypeScript uses 'name.definition');
        // a stricter `startsWith("name.definition.")` excluded some relevant definitions.
        if (name.includes("name") && lines[startLine]) {
          formattedOutput += `│${lines[startLine]}\n`
        }

        lastLine = endLine
      }
    } catch (error) {
      console.log(`Error parsing file: ${error}\n`)
    }

    if (formattedOutput.length > 0) {
      return `|----\n${formattedOutput}|----\n`
    }
    return undefined
  }