Browse Source

fix: resolve DirectoryScanner memory leak and improve file limit handling (#5785)

Daniel 6 months ago
parent
commit
a7a6bcb30f

+ 1 - 1
src/services/code-index/constants/index.ts

@@ -15,7 +15,7 @@ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479
 export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB
 
 /**Directory Scanner */
-export const MAX_LIST_FILES_LIMIT = 3_000
+export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000
 export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts
 export const MAX_BATCH_RETRIES = 3
 export const INITIAL_RETRY_DELAY_MS = 500

+ 0 - 1
src/services/code-index/interfaces/file-processor.ts

@@ -38,7 +38,6 @@ export interface IDirectoryScanner {
 		onBlocksIndexed?: (indexedCount: number) => void,
 		onFileParsed?: (fileBlockCount: number) => void,
 	): Promise<{
-		codeBlocks: CodeBlock[]
 		stats: {
 			processed: number
 			skipped: number

+ 22 - 14
src/services/code-index/processors/__tests__/scanner.spec.ts

@@ -168,7 +168,16 @@ describe("DirectoryScanner", () => {
 			expect(mockCodeParser.parseFile).not.toHaveBeenCalled()
 		})
 
-		it("should parse changed files and return code blocks", async () => {
+		it("should parse changed files and return empty codeBlocks array", async () => {
+			// Create scanner without embedder to test the non-embedding path
+			const scannerNoEmbeddings = new DirectoryScanner(
+				null as any, // No embedder
+				null as any, // No vector store
+				mockCodeParser,
+				mockCacheManager,
+				mockIgnoreInstance,
+			)
+
 			const { listFiles } = await import("../../../glob/list-files")
 			vi.mocked(listFiles).mockResolvedValue([["test/file1.js"], false])
 			const mockBlocks: any[] = [
@@ -185,8 +194,7 @@ describe("DirectoryScanner", () => {
 			]
 			;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks)
 
-			const result = await scanner.scanDirectory("/test")
-			expect(result.codeBlocks).toEqual(mockBlocks)
+			const result = await scannerNoEmbeddings.scanDirectory("/test")
 			expect(result.stats.processed).toBe(1)
 		})
 
@@ -252,6 +260,15 @@ describe("DirectoryScanner", () => {
 		})
 
 		it("should process markdown files alongside code files", async () => {
+			// Create scanner without embedder to test the non-embedding path
+			const scannerNoEmbeddings = new DirectoryScanner(
+				null as any, // No embedder
+				null as any, // No vector store
+				mockCodeParser,
+				mockCacheManager,
+				mockIgnoreInstance,
+			)
+
 			const { listFiles } = await import("../../../glob/list-files")
 			vi.mocked(listFiles).mockResolvedValue([["test/README.md", "test/app.js", "docs/guide.markdown"], false])
 
@@ -306,7 +323,7 @@ describe("DirectoryScanner", () => {
 				return []
 			})
 
-			const result = await scanner.scanDirectory("/test")
+			const result = await scannerNoEmbeddings.scanDirectory("/test")
 
 			// Verify all files were processed
 			expect(mockCodeParser.parseFile).toHaveBeenCalledTimes(3)
@@ -314,16 +331,7 @@ describe("DirectoryScanner", () => {
 			expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object))
 			expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object))
 
-			// Verify code blocks include both markdown and code content
-			expect(result.codeBlocks).toHaveLength(3)
-			expect(result.codeBlocks).toEqual(
-				expect.arrayContaining([
-					expect.objectContaining({ type: "markdown_header_h1" }),
-					expect.objectContaining({ type: "function" }),
-					expect.objectContaining({ type: "markdown_header_h2" }),
-				]),
-			)
-
+			// Verify processing still works without codeBlocks accumulation
 			expect(result.stats.processed).toBe(3)
 		})
 

+ 32 - 18
src/services/code-index/processors/scanner.ts

@@ -17,7 +17,7 @@ import { t } from "../../../i18n"
 import {
 	QDRANT_CODE_BLOCK_NAMESPACE,
 	MAX_FILE_SIZE_BYTES,
-	MAX_LIST_FILES_LIMIT,
+	MAX_LIST_FILES_LIMIT_CODE_INDEX,
 	BATCH_SEGMENT_THRESHOLD,
 	MAX_BATCH_RETRIES,
 	INITIAL_RETRY_DELAY_MS,
@@ -51,13 +51,13 @@ export class DirectoryScanner implements IDirectoryScanner {
 		onError?: (error: Error) => void,
 		onBlocksIndexed?: (indexedCount: number) => void,
 		onFileParsed?: (fileBlockCount: number) => void,
-	): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> {
+	): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> {
 		const directoryPath = directory
 		// Capture workspace context at scan start
 		const scanWorkspace = getWorkspacePathForContext(directoryPath)
 
 		// Get all files recursively (handles .gitignore automatically)
-		const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT)
+		const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX)
 
 		// Filter out directories (marked with trailing '/')
 		const filePaths = allPaths.filter((p) => !p.endsWith("/"))
@@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner {
 
 		// Initialize tracking variables
 		const processedFiles = new Set<string>()
-		const codeBlocks: CodeBlock[] = []
 		let processedCount = 0
 		let skippedCount = 0
 
@@ -98,7 +97,7 @@ export class DirectoryScanner implements IDirectoryScanner {
 		let currentBatchBlocks: CodeBlock[] = []
 		let currentBatchTexts: string[] = []
 		let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = []
-		const activeBatchPromises: Promise<void>[] = []
+		const activeBatchPromises = new Set<Promise<void>>()
 
 		// Initialize block counter
 		let totalBlockCount = 0
@@ -125,6 +124,7 @@ export class DirectoryScanner implements IDirectoryScanner {
 
 					// Check against cache
 					const cachedFileHash = this.cacheManager.getHash(filePath)
+					const isNewFile = !cachedFileHash
 					if (cachedFileHash === currentFileHash) {
 						// File is unchanged
 						skippedCount++
@@ -135,7 +135,6 @@ export class DirectoryScanner implements IDirectoryScanner {
 					const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash })
 					const fileBlockCount = blocks.length
 					onFileParsed?.(fileBlockCount)
-					codeBlocks.push(...blocks)
 					processedCount++
 
 					// Process embeddings if configured
@@ -146,20 +145,11 @@ export class DirectoryScanner implements IDirectoryScanner {
 							const trimmedContent = block.content.trim()
 							if (trimmedContent) {
 								const release = await mutex.acquire()
-								totalBlockCount += fileBlockCount
 								try {
 									currentBatchBlocks.push(block)
 									currentBatchTexts.push(trimmedContent)
 									addedBlocksFromFile = true
 
-									if (addedBlocksFromFile) {
-										currentBatchFileInfos.push({
-											filePath,
-											fileHash: currentFileHash,
-											isNew: !this.cacheManager.getHash(filePath),
-										})
-									}
-
 									// Check if batch threshold is met
 									if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
 										// Copy current batch data and clear accumulators
@@ -181,13 +171,33 @@ export class DirectoryScanner implements IDirectoryScanner {
 												onBlocksIndexed,
 											),
 										)
-										activeBatchPromises.push(batchPromise)
+										activeBatchPromises.add(batchPromise)
+
+										// Clean up completed promises to prevent memory accumulation
+										batchPromise.finally(() => {
+											activeBatchPromises.delete(batchPromise)
+										})
 									}
 								} finally {
 									release()
 								}
 							}
 						}
+
+						// Add file info once per file (outside the block loop)
+						if (addedBlocksFromFile) {
+							const release = await mutex.acquire()
+							try {
+								totalBlockCount += fileBlockCount
+								currentBatchFileInfos.push({
+									filePath,
+									fileHash: currentFileHash,
+									isNew: isNewFile,
+								})
+							} finally {
+								release()
+							}
+						}
 					} else {
 						// Only update hash if not being processed in a batch
 						await this.cacheManager.updateHash(filePath, currentFileHash)
@@ -232,7 +242,12 @@ export class DirectoryScanner implements IDirectoryScanner {
 				const batchPromise = batchLimiter(() =>
 					this.processBatch(batchBlocks, batchTexts, batchFileInfos, scanWorkspace, onError, onBlocksIndexed),
 				)
-				activeBatchPromises.push(batchPromise)
+				activeBatchPromises.add(batchPromise)
+
+				// Clean up completed promises to prevent memory accumulation
+				batchPromise.finally(() => {
+					activeBatchPromises.delete(batchPromise)
+				})
 			} finally {
 				release()
 			}
@@ -280,7 +295,6 @@ export class DirectoryScanner implements IDirectoryScanner {
 		}
 
 		return {
-			codeBlocks,
 			stats: {
 				processed: processedCount,
 				skipped: skippedCount,