scanner.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. import { listFiles } from "../../glob/list-files"
  2. import { Ignore } from "ignore"
  3. import { RooIgnoreController } from "../../../core/ignore/RooIgnoreController"
  4. import { stat } from "fs/promises"
  5. import * as path from "path"
  6. import { generateNormalizedAbsolutePath, generateRelativeFilePath } from "../shared/get-relative-path"
  7. import { scannerExtensions } from "../shared/supported-extensions"
  8. import * as vscode from "vscode"
  9. import { CodeBlock, ICodeParser, IEmbedder, IVectorStore, IDirectoryScanner } from "../interfaces"
  10. import { createHash } from "crypto"
  11. import { v5 as uuidv5 } from "uuid"
  12. import pLimit from "p-limit"
  13. import { Mutex } from "async-mutex"
  14. import { CacheManager } from "../cache-manager"
  15. import { t } from "../../../i18n"
  16. import {
  17. QDRANT_CODE_BLOCK_NAMESPACE,
  18. MAX_FILE_SIZE_BYTES,
  19. MAX_LIST_FILES_LIMIT,
  20. BATCH_SEGMENT_THRESHOLD,
  21. MAX_BATCH_RETRIES,
  22. INITIAL_RETRY_DELAY_MS,
  23. PARSING_CONCURRENCY,
  24. BATCH_PROCESSING_CONCURRENCY,
  25. } from "../constants"
  26. import { isPathInIgnoredDirectory } from "../../glob/ignore-utils"
  27. export class DirectoryScanner implements IDirectoryScanner {
  28. constructor(
  29. private readonly embedder: IEmbedder,
  30. private readonly qdrantClient: IVectorStore,
  31. private readonly codeParser: ICodeParser,
  32. private readonly cacheManager: CacheManager,
  33. private readonly ignoreInstance: Ignore,
  34. ) {}
  35. /**
  36. * Recursively scans a directory for code blocks in supported files.
  37. * @param directoryPath The directory to scan
  38. * @param rooIgnoreController Optional RooIgnoreController instance for filtering
  39. * @param context VS Code ExtensionContext for cache storage
  40. * @param onError Optional error handler callback
  41. * @returns Promise<{codeBlocks: CodeBlock[], stats: {processed: number, skipped: number}}> Array of parsed code blocks and processing stats
  42. */
  43. public async scanDirectory(
  44. directory: string,
  45. onError?: (error: Error) => void,
  46. onBlocksIndexed?: (indexedCount: number) => void,
  47. onFileParsed?: (fileBlockCount: number) => void,
  48. ): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> {
  49. const directoryPath = directory
  50. // Get all files recursively (handles .gitignore automatically)
  51. const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT)
  52. // Filter out directories (marked with trailing '/')
  53. const filePaths = allPaths.filter((p) => !p.endsWith("/"))
  54. // Initialize RooIgnoreController if not provided
  55. const ignoreController = new RooIgnoreController(directoryPath)
  56. await ignoreController.initialize()
  57. // Filter paths using .rooignore
  58. const allowedPaths = ignoreController.filterPaths(filePaths)
  59. // Filter by supported extensions, ignore patterns, and excluded directories
  60. const supportedPaths = allowedPaths.filter((filePath) => {
  61. const ext = path.extname(filePath).toLowerCase()
  62. const relativeFilePath = generateRelativeFilePath(filePath)
  63. // Check if file is in an ignored directory using the shared helper
  64. if (isPathInIgnoredDirectory(filePath)) {
  65. return false
  66. }
  67. return scannerExtensions.includes(ext) && !this.ignoreInstance.ignores(relativeFilePath)
  68. })
  69. // Initialize tracking variables
  70. const processedFiles = new Set<string>()
  71. const codeBlocks: CodeBlock[] = []
  72. let processedCount = 0
  73. let skippedCount = 0
  74. // Initialize parallel processing tools
  75. const parseLimiter = pLimit(PARSING_CONCURRENCY) // Concurrency for file parsing
  76. const batchLimiter = pLimit(BATCH_PROCESSING_CONCURRENCY) // Concurrency for batch processing
  77. const mutex = new Mutex()
  78. // Shared batch accumulators (protected by mutex)
  79. let currentBatchBlocks: CodeBlock[] = []
  80. let currentBatchTexts: string[] = []
  81. let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = []
  82. const activeBatchPromises: Promise<void>[] = []
  83. // Initialize block counter
  84. let totalBlockCount = 0
  85. // Process all files in parallel with concurrency control
  86. const parsePromises = supportedPaths.map((filePath) =>
  87. parseLimiter(async () => {
  88. try {
  89. // Check file size
  90. const stats = await stat(filePath)
  91. if (stats.size > MAX_FILE_SIZE_BYTES) {
  92. skippedCount++ // Skip large files
  93. return
  94. }
  95. // Read file content
  96. const content = await vscode.workspace.fs
  97. .readFile(vscode.Uri.file(filePath))
  98. .then((buffer) => Buffer.from(buffer).toString("utf-8"))
  99. // Calculate current hash
  100. const currentFileHash = createHash("sha256").update(content).digest("hex")
  101. processedFiles.add(filePath)
  102. // Check against cache
  103. const cachedFileHash = this.cacheManager.getHash(filePath)
  104. if (cachedFileHash === currentFileHash) {
  105. // File is unchanged
  106. skippedCount++
  107. return
  108. }
  109. // File is new or changed - parse it using the injected parser function
  110. const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash })
  111. const fileBlockCount = blocks.length
  112. onFileParsed?.(fileBlockCount)
  113. codeBlocks.push(...blocks)
  114. processedCount++
  115. // Process embeddings if configured
  116. if (this.embedder && this.qdrantClient && blocks.length > 0) {
  117. // Add to batch accumulators
  118. let addedBlocksFromFile = false
  119. for (const block of blocks) {
  120. const trimmedContent = block.content.trim()
  121. if (trimmedContent) {
  122. const release = await mutex.acquire()
  123. totalBlockCount += fileBlockCount
  124. try {
  125. currentBatchBlocks.push(block)
  126. currentBatchTexts.push(trimmedContent)
  127. addedBlocksFromFile = true
  128. if (addedBlocksFromFile) {
  129. currentBatchFileInfos.push({
  130. filePath,
  131. fileHash: currentFileHash,
  132. isNew: !this.cacheManager.getHash(filePath),
  133. })
  134. }
  135. // Check if batch threshold is met
  136. if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
  137. // Copy current batch data and clear accumulators
  138. const batchBlocks = [...currentBatchBlocks]
  139. const batchTexts = [...currentBatchTexts]
  140. const batchFileInfos = [...currentBatchFileInfos]
  141. currentBatchBlocks = []
  142. currentBatchTexts = []
  143. currentBatchFileInfos = []
  144. // Queue batch processing
  145. const batchPromise = batchLimiter(() =>
  146. this.processBatch(
  147. batchBlocks,
  148. batchTexts,
  149. batchFileInfos,
  150. onError,
  151. onBlocksIndexed,
  152. ),
  153. )
  154. activeBatchPromises.push(batchPromise)
  155. }
  156. } finally {
  157. release()
  158. }
  159. }
  160. }
  161. } else {
  162. // Only update hash if not being processed in a batch
  163. await this.cacheManager.updateHash(filePath, currentFileHash)
  164. }
  165. } catch (error) {
  166. console.error(`Error processing file ${filePath}:`, error)
  167. if (onError) {
  168. onError(
  169. error instanceof Error
  170. ? error
  171. : new Error(t("embeddings:scanner.unknownErrorProcessingFile", { filePath })),
  172. )
  173. }
  174. }
  175. }),
  176. )
  177. // Wait for all parsing to complete
  178. await Promise.all(parsePromises)
  179. // Process any remaining items in batch
  180. if (currentBatchBlocks.length > 0) {
  181. const release = await mutex.acquire()
  182. try {
  183. // Copy current batch data and clear accumulators
  184. const batchBlocks = [...currentBatchBlocks]
  185. const batchTexts = [...currentBatchTexts]
  186. const batchFileInfos = [...currentBatchFileInfos]
  187. currentBatchBlocks = []
  188. currentBatchTexts = []
  189. currentBatchFileInfos = []
  190. // Queue final batch processing
  191. const batchPromise = batchLimiter(() =>
  192. this.processBatch(batchBlocks, batchTexts, batchFileInfos, onError, onBlocksIndexed),
  193. )
  194. activeBatchPromises.push(batchPromise)
  195. } finally {
  196. release()
  197. }
  198. }
  199. // Wait for all batch processing to complete
  200. await Promise.all(activeBatchPromises)
  201. // Handle deleted files
  202. const oldHashes = this.cacheManager.getAllHashes()
  203. for (const cachedFilePath of Object.keys(oldHashes)) {
  204. if (!processedFiles.has(cachedFilePath)) {
  205. // File was deleted or is no longer supported/indexed
  206. if (this.qdrantClient) {
  207. try {
  208. await this.qdrantClient.deletePointsByFilePath(cachedFilePath)
  209. await this.cacheManager.deleteHash(cachedFilePath)
  210. } catch (error) {
  211. console.error(`[DirectoryScanner] Failed to delete points for ${cachedFilePath}:`, error)
  212. if (onError) {
  213. onError(
  214. error instanceof Error
  215. ? error
  216. : new Error(
  217. t("embeddings:scanner.unknownErrorDeletingPoints", {
  218. filePath: cachedFilePath,
  219. }),
  220. ),
  221. )
  222. }
  223. // Decide if we should re-throw or just log
  224. }
  225. }
  226. }
  227. }
  228. return {
  229. codeBlocks,
  230. stats: {
  231. processed: processedCount,
  232. skipped: skippedCount,
  233. },
  234. totalBlockCount,
  235. }
  236. }
  237. private async processBatch(
  238. batchBlocks: CodeBlock[],
  239. batchTexts: string[],
  240. batchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[],
  241. onError?: (error: Error) => void,
  242. onBlocksIndexed?: (indexedCount: number) => void,
  243. ): Promise<void> {
  244. if (batchBlocks.length === 0) return
  245. let attempts = 0
  246. let success = false
  247. let lastError: Error | null = null
  248. while (attempts < MAX_BATCH_RETRIES && !success) {
  249. attempts++
  250. try {
  251. // --- Deletion Step ---
  252. const uniqueFilePaths = [
  253. ...new Set(
  254. batchFileInfos
  255. .filter((info) => !info.isNew) // Only modified files (not new)
  256. .map((info) => info.filePath),
  257. ),
  258. ]
  259. if (uniqueFilePaths.length > 0) {
  260. try {
  261. await this.qdrantClient.deletePointsByMultipleFilePaths(uniqueFilePaths)
  262. } catch (deleteError) {
  263. console.error(
  264. `[DirectoryScanner] Failed to delete points for ${uniqueFilePaths.length} files before upsert:`,
  265. deleteError,
  266. )
  267. // Re-throw the error to stop processing this batch attempt
  268. throw deleteError
  269. }
  270. }
  271. // --- End Deletion Step ---
  272. // Create embeddings for batch
  273. const { embeddings } = await this.embedder.createEmbeddings(batchTexts)
  274. // Prepare points for Qdrant
  275. const points = batchBlocks.map((block, index) => {
  276. const normalizedAbsolutePath = generateNormalizedAbsolutePath(block.file_path)
  277. const stableName = `${normalizedAbsolutePath}:${block.start_line}`
  278. const pointId = uuidv5(stableName, QDRANT_CODE_BLOCK_NAMESPACE)
  279. return {
  280. id: pointId,
  281. vector: embeddings[index],
  282. payload: {
  283. filePath: generateRelativeFilePath(normalizedAbsolutePath),
  284. codeChunk: block.content,
  285. startLine: block.start_line,
  286. endLine: block.end_line,
  287. },
  288. }
  289. })
  290. // Upsert points to Qdrant
  291. await this.qdrantClient.upsertPoints(points)
  292. onBlocksIndexed?.(batchBlocks.length)
  293. // Update hashes for successfully processed files in this batch
  294. for (const fileInfo of batchFileInfos) {
  295. await this.cacheManager.updateHash(fileInfo.filePath, fileInfo.fileHash)
  296. }
  297. success = true
  298. } catch (error) {
  299. lastError = error as Error
  300. console.error(`[DirectoryScanner] Error processing batch (attempt ${attempts}):`, error)
  301. if (attempts < MAX_BATCH_RETRIES) {
  302. const delay = INITIAL_RETRY_DELAY_MS * Math.pow(2, attempts - 1)
  303. await new Promise((resolve) => setTimeout(resolve, delay))
  304. }
  305. }
  306. }
  307. if (!success && lastError) {
  308. console.error(`[DirectoryScanner] Failed to process batch after ${MAX_BATCH_RETRIES} attempts`)
  309. if (onError) {
  310. // Preserve the original error message from embedders which now have detailed i18n messages
  311. const errorMessage = lastError.message || "Unknown error"
  312. // For other errors, provide context
  313. onError(
  314. new Error(
  315. t("embeddings:scanner.failedToProcessBatchWithError", {
  316. maxRetries: MAX_BATCH_RETRIES,
  317. errorMessage,
  318. }),
  319. ),
  320. )
  321. }
  322. }
  323. }
  324. }