|
|
@@ -17,7 +17,7 @@ import { t } from "../../../i18n"
|
|
|
import {
|
|
|
QDRANT_CODE_BLOCK_NAMESPACE,
|
|
|
MAX_FILE_SIZE_BYTES,
|
|
|
- MAX_LIST_FILES_LIMIT,
|
|
|
+ MAX_LIST_FILES_LIMIT_CODE_INDEX,
|
|
|
BATCH_SEGMENT_THRESHOLD,
|
|
|
MAX_BATCH_RETRIES,
|
|
|
INITIAL_RETRY_DELAY_MS,
|
|
|
@@ -51,13 +51,13 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
onError?: (error: Error) => void,
|
|
|
onBlocksIndexed?: (indexedCount: number) => void,
|
|
|
onFileParsed?: (fileBlockCount: number) => void,
|
|
|
- ): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> {
|
|
|
+ ): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> {
|
|
|
const directoryPath = directory
|
|
|
// Capture workspace context at scan start
|
|
|
const scanWorkspace = getWorkspacePathForContext(directoryPath)
|
|
|
|
|
|
// Get all files recursively (handles .gitignore automatically)
|
|
|
- const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT)
|
|
|
+ const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX)
|
|
|
|
|
|
// Filter out directories (marked with trailing '/')
|
|
|
const filePaths = allPaths.filter((p) => !p.endsWith("/"))
|
|
|
@@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
|
|
|
// Initialize tracking variables
|
|
|
const processedFiles = new Set<string>()
|
|
|
- const codeBlocks: CodeBlock[] = []
|
|
|
let processedCount = 0
|
|
|
let skippedCount = 0
|
|
|
|
|
|
@@ -98,7 +97,7 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
let currentBatchBlocks: CodeBlock[] = []
|
|
|
let currentBatchTexts: string[] = []
|
|
|
let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = []
|
|
|
- const activeBatchPromises: Promise<void>[] = []
|
|
|
+ const activeBatchPromises = new Set<Promise<void>>()
|
|
|
|
|
|
// Initialize block counter
|
|
|
let totalBlockCount = 0
|
|
|
@@ -125,6 +124,7 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
|
|
|
// Check against cache
|
|
|
const cachedFileHash = this.cacheManager.getHash(filePath)
|
|
|
+ const isNewFile = !cachedFileHash
|
|
|
if (cachedFileHash === currentFileHash) {
|
|
|
// File is unchanged
|
|
|
skippedCount++
|
|
|
@@ -135,7 +135,6 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash })
|
|
|
const fileBlockCount = blocks.length
|
|
|
onFileParsed?.(fileBlockCount)
|
|
|
- codeBlocks.push(...blocks)
|
|
|
processedCount++
|
|
|
|
|
|
// Process embeddings if configured
|
|
|
@@ -146,20 +145,11 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
const trimmedContent = block.content.trim()
|
|
|
if (trimmedContent) {
|
|
|
const release = await mutex.acquire()
|
|
|
- totalBlockCount += fileBlockCount
|
|
|
try {
|
|
|
currentBatchBlocks.push(block)
|
|
|
currentBatchTexts.push(trimmedContent)
|
|
|
addedBlocksFromFile = true
|
|
|
|
|
|
- if (addedBlocksFromFile) {
|
|
|
- currentBatchFileInfos.push({
|
|
|
- filePath,
|
|
|
- fileHash: currentFileHash,
|
|
|
- isNew: !this.cacheManager.getHash(filePath),
|
|
|
- })
|
|
|
- }
|
|
|
-
|
|
|
// Check if batch threshold is met
|
|
|
if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
|
|
|
// Copy current batch data and clear accumulators
|
|
|
@@ -181,13 +171,33 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
onBlocksIndexed,
|
|
|
),
|
|
|
)
|
|
|
- activeBatchPromises.push(batchPromise)
|
|
|
+ activeBatchPromises.add(batchPromise)
|
|
|
+
|
|
|
+ // Clean up completed promises to prevent memory accumulation
|
|
|
+ batchPromise.finally(() => {
|
|
|
+ activeBatchPromises.delete(batchPromise)
|
|
|
+ })
|
|
|
}
|
|
|
} finally {
|
|
|
release()
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ // Add file info once per file (outside the block loop)
|
|
|
+ if (addedBlocksFromFile) {
|
|
|
+ const release = await mutex.acquire()
|
|
|
+ try {
|
|
|
+ totalBlockCount += fileBlockCount
|
|
|
+ currentBatchFileInfos.push({
|
|
|
+ filePath,
|
|
|
+ fileHash: currentFileHash,
|
|
|
+ isNew: isNewFile,
|
|
|
+ })
|
|
|
+ } finally {
|
|
|
+ release()
|
|
|
+ }
|
|
|
+ }
|
|
|
} else {
|
|
|
// Only update hash if not being processed in a batch
|
|
|
await this.cacheManager.updateHash(filePath, currentFileHash)
|
|
|
@@ -232,7 +242,12 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
const batchPromise = batchLimiter(() =>
|
|
|
this.processBatch(batchBlocks, batchTexts, batchFileInfos, scanWorkspace, onError, onBlocksIndexed),
|
|
|
)
|
|
|
- activeBatchPromises.push(batchPromise)
|
|
|
+ activeBatchPromises.add(batchPromise)
|
|
|
+
|
|
|
+ // Clean up completed promises to prevent memory accumulation
|
|
|
+ batchPromise.finally(() => {
|
|
|
+ activeBatchPromises.delete(batchPromise)
|
|
|
+ })
|
|
|
} finally {
|
|
|
release()
|
|
|
}
|
|
|
@@ -280,7 +295,6 @@ export class DirectoryScanner implements IDirectoryScanner {
|
|
|
}
|
|
|
|
|
|
return {
|
|
|
- codeBlocks,
|
|
|
stats: {
|
|
|
processed: processedCount,
|
|
|
skipped: skippedCount,
|