TestRunner.ts

import { runSingleEvaluation, TestInput, TestResult } from "./ClineWrapper"
import { parseAssistantMessageV2, AssistantMessageContent } from "./parsing/parse-assistant-message-06-06-25"
import { constructNewFileContent as constructNewFileContent_06_06_25 } from "./diff-apply/diff-06-06-25"
import { constructNewFileContent as constructNewFileContent_06_23_25 } from "./diff-apply/diff-06-23-25"
import { constructNewFileContent as constructNewFileContent_06_25_25 } from "./diff-apply/diff-06-25-25"
import { constructNewFileContent as constructNewFileContent_06_26_25 } from "./diff-apply/diff-06-26-25"
import { constructNewFileContent as constructNewFileContentV3 } from "../../src/core/assistant-message/diff"
import { basicSystemPrompt } from "./prompts/basicSystemPrompt-06-06-25"
import { claude4SystemPrompt } from "./prompts/claude4SystemPrompt-06-06-25"
import { formatResponse, log } from "./helpers"
import { Anthropic } from "@anthropic-ai/sdk"
import * as fs from "fs"
import * as path from "path"
import { Command } from "commander"
import { InputMessage, ProcessedTestCase, TestCase, TestConfig, SystemPromptDetails, ConstructSystemPromptFn } from "./types"
import { loadOpenRouterModelData, EvalOpenRouterModelInfo } from "./openRouterModelsHelper"
import {
	getDatabase,
	upsertSystemPrompt,
	upsertProcessingFunctions,
	upsertFile,
	createBenchmarkRun,
	createCase,
	insertResult,
	DatabaseClient,
	CreateResultInput,
	getResultsByRun,
	getCaseById,
	getFileByHash,
	getBenchmarkRun,
} from "./database"

// Load environment variables from .env file
import * as dotenv from "dotenv"
dotenv.config({ path: path.join(__dirname, "../.env") })

// tiktoken for token counting
import { get_encoding } from "tiktoken";
const encoding = get_encoding("cl100k_base");

// Global store for fetched OpenRouter model data
let openRouterModelDataGlobal: Record<string, EvalOpenRouterModelInfo> = {};

const systemPromptGeneratorLookup: Record<string, ConstructSystemPromptFn> = {
	basicSystemPrompt: basicSystemPrompt,
	claude4SystemPrompt: claude4SystemPrompt,
}
type TestResultSet = { [test_id: string]: (TestResult & { test_id?: string })[] }

class NodeTestRunner {
	private apiKey: string | undefined
	private provider: string
	private currentRunId: string | null = null
	private systemPromptHash: string | null = null
	private processingFunctionsHash: string | null = null
	private caseIdMap: Map<string, string> = new Map() // test_id -> case_id mapping

	constructor(isReplay: boolean, provider: string = "openrouter") {
		this.provider = provider
		if (!isReplay) {
			if (provider === "openai") {
				this.apiKey = process.env.OPENAI_API_KEY
				if (!this.apiKey) {
					throw new Error("OPENAI_API_KEY environment variable not set for a non-replay run with OpenAI provider.")
				}
			} else {
				this.apiKey = process.env.OPENROUTER_API_KEY
				if (!this.apiKey) {
					throw new Error("OPENROUTER_API_KEY environment variable not set for a non-replay run with OpenRouter provider.")
				}
			}
		}
	}
	/**
	 * Initialize database run and store system prompt and processing functions
	 */
	async initializeDatabaseRun(testConfig: TestConfig, testCases: ProcessedTestCase[], isVerbose: boolean): Promise<string> {
		try {
			// Generate a sample system prompt to hash (using the first test case)
			const sampleSystemPrompt = testCases.length > 0
				? this.constructSystemPrompt(testCases[0].system_prompt_details, testConfig.system_prompt_name)
				: "default-system-prompt";

			// Store system prompt
			this.systemPromptHash = await upsertSystemPrompt({
				name: testConfig.system_prompt_name,
				content: sampleSystemPrompt
			});

			// Store processing functions
			this.processingFunctionsHash = await upsertProcessingFunctions({
				name: `${testConfig.parsing_function}-${testConfig.diff_edit_function}`,
				parsing_function: testConfig.parsing_function,
				diff_edit_function: testConfig.diff_edit_function
			});

			// Create benchmark run
			const runDescription = `Model: ${testConfig.model_id}, Cases: ${testCases.length}, Runs per case: ${testConfig.number_of_runs}`;
			this.currentRunId = await createBenchmarkRun({
				description: runDescription,
				system_prompt_hash: this.systemPromptHash
			});

			log(isVerbose, `✓ Database run initialized: ${this.currentRunId}`);

			// Create case records
			await this.createDatabaseCases(testCases, isVerbose);

			return this.currentRunId;
		} catch (error) {
			console.error("Failed to initialize database run:", error);
			throw error;
		}
	}
	/**
	 * Initialize multi-model database run (one run for all models)
	 */
	async initializeMultiModelRun(testCases: ProcessedTestCase[], systemPromptName: string, parsingFunction: string, diffEditFunction: string, runDescription: string, isVerbose: boolean): Promise<string> {
		try {
			// Generate a sample system prompt to hash (using the first test case)
			const sampleSystemPrompt = testCases.length > 0
				? this.constructSystemPrompt(testCases[0].system_prompt_details, systemPromptName)
				: "default-system-prompt";

			// Store system prompt
			this.systemPromptHash = await upsertSystemPrompt({
				name: systemPromptName,
				content: sampleSystemPrompt
			});

			// Store processing functions
			this.processingFunctionsHash = await upsertProcessingFunctions({
				name: `${parsingFunction}-${diffEditFunction}`,
				parsing_function: parsingFunction,
				diff_edit_function: diffEditFunction
			});

			// Create benchmark run
			this.currentRunId = await createBenchmarkRun({
				description: runDescription,
				system_prompt_hash: this.systemPromptHash
			});

			log(isVerbose, `✓ Multi-model database run initialized: ${this.currentRunId}`);

			// Create case records
			await this.createDatabaseCases(testCases, isVerbose);

			return this.currentRunId;
		} catch (error) {
			console.error("Failed to initialize multi-model database run:", error);
			throw error;
		}
	}
	/**
	 * Create database case records for all test cases
	 */
	async createDatabaseCases(testCases: ProcessedTestCase[], isVerbose: boolean): Promise<void> {
		if (!this.currentRunId || !this.systemPromptHash) {
			throw new Error("Database run not initialized");
		}
		for (const testCase of testCases) {
			try {
				// Store file content if available
				let fileHash: string | undefined;
				if (testCase.file_contents && testCase.file_path) {
					fileHash = await upsertFile({
						filepath: testCase.file_path,
						content: testCase.file_contents
					});
				}
				// Calculate tokens in context (approximate)
				const tokensInContext = this.estimateTokens(testCase.messages);
				// Create case record
				const caseId = await createCase({
					run_id: this.currentRunId,
					description: testCase.test_id,
					system_prompt_hash: this.systemPromptHash,
					task_id: testCase.test_id,
					tokens_in_context: tokensInContext,
					file_hash: fileHash
				});
				this.caseIdMap.set(testCase.test_id, caseId);
			} catch (error) {
				console.error(`Failed to create database case for ${testCase.test_id}:`, error);
				// Continue with other cases
			}
		}
		log(isVerbose, `✓ Created ${this.caseIdMap.size} database case records`);
	}
	/**
	 * Store replay result in database, copying original data but with new diffing results
	 */
	async storeReplayResultInDatabase(replayResult: TestResult, originalResult: any, testId: string, newCaseId: string): Promise<void> {
		if (!this.currentRunId || !this.processingFunctionsHash) {
			return; // Skip if database not initialized
		}
		try {
			// Map error string to error enum (simple mapping)
			const errorEnum = this.mapErrorToEnum(replayResult.error);

			// Store diff edit content if available
			let fileEditedHash: string | undefined;
			if (replayResult.diffEdit) {
				fileEditedHash = await upsertFile({
					filepath: `diff-edit-${testId}`,
					content: replayResult.diffEdit
				});
			}

			// Calculate basic metrics from the diff edit if available
			let numEdits = 0;
			let numLinesAdded = 0;
			let numLinesDeleted = 0;
			if (replayResult.diffEdit) {
				// Simple parsing to count edits - count SEARCH/REPLACE blocks
				const searchBlocks = (replayResult.diffEdit.match(/------- SEARCH/g) || []).length;
				numEdits = searchBlocks;
				// Count added/deleted lines (rough approximation)
				const lines = replayResult.diffEdit.split('\n');
				for (const line of lines) {
					if (line.startsWith('+') && !line.startsWith('+++')) {
						numLinesAdded++;
					} else if (line.startsWith('-') && !line.startsWith('---')) {
						numLinesDeleted++;
					}
				}
			}

			// Copy original result data but update replay-specific fields
			const resultInput: CreateResultInput = {
				run_id: this.currentRunId, // New run ID
				case_id: newCaseId, // New case ID
				model_id: originalResult.model_id, // Copy from original
				processing_functions_hash: this.processingFunctionsHash, // New processing functions
				succeeded: replayResult.success && (replayResult.diffEditSuccess ?? false), // New result
				error_enum: errorEnum, // New error if any
				num_edits: numEdits || originalResult.num_edits, // New or original
				num_lines_deleted: numLinesDeleted || originalResult.num_lines_deleted, // New or original
				num_lines_added: numLinesAdded || originalResult.num_lines_added, // New or original
				// Copy timing and cost data from the original (since we didn't make API calls)
				time_to_first_token_ms: originalResult.time_to_first_token_ms,
				time_to_first_edit_ms: originalResult.time_to_first_edit_ms,
				time_round_trip_ms: originalResult.time_round_trip_ms,
				cost_usd: originalResult.cost_usd,
				completion_tokens: originalResult.completion_tokens,
				// Use the original model output (since we're replaying)
				raw_model_output: originalResult.raw_model_output,
				file_edited_hash: fileEditedHash || originalResult.file_edited_hash,
				parsed_tool_call_json: replayResult.toolCalls ? JSON.stringify(replayResult.toolCalls) : originalResult.parsed_tool_call_json
			};

			await insertResult(resultInput);
		} catch (error) {
			console.error(`Failed to store replay result in database for ${testId}:`, error);
			// Continue execution - don't fail the test run
		}
	}
	/**
	 * Store test result in database
	 */
	async storeResultInDatabase(result: TestResult, testId: string, modelId: string): Promise<void> {
		if (!this.currentRunId || !this.processingFunctionsHash) {
			return; // Skip if database not initialized
		}
		const caseId = this.caseIdMap.get(testId);
		if (!caseId) {
			return; // Skip if case not found
		}
		try {
			// Map error string to error enum (simple mapping)
			const errorEnum = this.mapErrorToEnum(result.error);

			// Store diff edit content if available
			let fileEditedHash: string | undefined;
			if (result.diffEdit) {
				fileEditedHash = await upsertFile({
					filepath: `diff-edit-${testId}`,
					content: result.diffEdit
				});
			}

			// Calculate basic metrics from the diff edit if available
			let numEdits = 0;
			let numLinesAdded = 0;
			let numLinesDeleted = 0;
			if (result.diffEdit) {
				// Simple parsing to count edits - count SEARCH/REPLACE blocks
				const searchBlocks = (result.diffEdit.match(/------- SEARCH/g) || []).length;
				numEdits = searchBlocks;
				// Count added/deleted lines (rough approximation)
				const lines = result.diffEdit.split('\n');
				for (const line of lines) {
					if (line.startsWith('+') && !line.startsWith('+++')) {
						numLinesAdded++;
					} else if (line.startsWith('-') && !line.startsWith('---')) {
						numLinesDeleted++;
					}
				}
			}

			const resultInput: CreateResultInput = {
				run_id: this.currentRunId,
				case_id: caseId,
				model_id: modelId,
				processing_functions_hash: this.processingFunctionsHash,
				succeeded: result.success && (result.diffEditSuccess ?? false),
				error_enum: errorEnum,
				num_edits: numEdits || undefined,
				num_lines_deleted: numLinesDeleted || undefined,
				num_lines_added: numLinesAdded || undefined,
				time_to_first_token_ms: result.streamResult?.timing?.timeToFirstTokenMs,
				time_to_first_edit_ms: result.streamResult?.timing?.timeToFirstEditMs,
				time_round_trip_ms: result.streamResult?.timing?.totalRoundTripMs,
				cost_usd: result.streamResult?.usage?.totalCost,
				completion_tokens: result.streamResult?.usage?.outputTokens,
				raw_model_output: result.streamResult?.assistantMessage,
				file_edited_hash: fileEditedHash,
				parsed_tool_call_json: result.toolCalls ? JSON.stringify(result.toolCalls) : undefined
			};

			await insertResult(resultInput);
		} catch (error) {
			console.error(`Failed to store result in database for ${testId}:`, error);
			// Continue execution - don't fail the test run
		}
	}
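
	// For reference, the SEARCH/REPLACE blocks counted above look roughly like this
	// (an illustrative sketch of the marker format the regex matches, not an exact spec):
	//
	//   ------- SEARCH
	//   const old = 1
	//   =======
	//   const updated = 1
	//   +++++++ REPLACE
	//
	// numEdits counts "------- SEARCH" markers; the +/- line counting above is only a
	// rough proxy, since these blocks are not unified diffs.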
	/**
	 * Estimate token count for messages (rough approximation)
	 */
	public estimateTokens(messages: Anthropic.Messages.MessageParam[]): number {
		let totalText = "";
		for (const message of messages) {
			if (Array.isArray(message.content)) {
				for (const block of message.content) {
					if (block.type === 'text') {
						totalText += block.text + "\n";
					}
				}
			} else if (typeof message.content === 'string') {
				totalText += message.content + "\n";
			}
		}
		return encoding.encode(totalText).length;
	}
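
	// Illustrative usage (a sketch; the message below is made up):
	//   const runner = new NodeTestRunner(true)
	//   const n = runner.estimateTokens([{ role: "user", content: "hello world" }])
	// n is the cl100k_base token count of the concatenated text blocks; image blocks
	// contribute nothing, so multimodal contexts are undercounted.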
	/**
	 * Map error string to error enum
	 */
	private mapErrorToEnum(error?: string): number | undefined {
		if (!error) return undefined;
		const errorMap: Record<string, number> = {
			'no_tool_calls': 1,
			'parsing_error': 2,
			'diff_edit_error': 3,
			'missing_original_diff_edit_tool_call_message': 4,
			'api_error': 5,
			'wrong_tool_call': 6,
			'wrong_file_edited': 7,
			'multi_tool_calls': 8,
			'tool_call_params_undefined': 9,
			'other_error': 99
		};
		return errorMap[error] || 99; // 99 for unknown errors
	}
	/**
	 * Convert our messages array into a properly formatted Anthropic messages array
	 */
	transformMessages(messages: InputMessage[]): Anthropic.Messages.MessageParam[] {
		return messages.map((msg) => {
			// Use TextBlockParam here for constructing the input message
			const content: (Anthropic.TextBlockParam | Anthropic.ImageBlockParam)[] = []
			if (msg.text) {
				// This object now correctly matches the TextBlockParam type
				content.push({ type: "text", text: msg.text })
			}
			if (msg.images && Array.isArray(msg.images)) {
				const imageBlocks = formatResponse.imageBlocks(msg.images)
				content.push(...imageBlocks)
			}
			return {
				role: msg.role,
				content: content,
			}
		})
	}
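
	// Example shape (illustrative; the values are placeholders): an InputMessage like
	//   { role: "user", text: "fix the bug", images: ["data:image/png;base64,..."] }
	// becomes an Anthropic MessageParam along the lines of
	//   { role: "user", content: [{ type: "text", text: "fix the bug" }, /* image block */] }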
	/**
	 * Generate the system prompt on the fly
	 */
	constructSystemPrompt(systemPromptDetails: SystemPromptDetails, systemPromptName: string) {
		const systemPromptGenerator = systemPromptGeneratorLookup[systemPromptName]
		const { cwd_value, browser_use, width, height, os_value, shell_value, home_value, mcp_string, user_custom_instructions } =
			systemPromptDetails
		const systemPrompt = systemPromptGenerator(
			cwd_value,
			browser_use,
			width,
			height,
			os_value,
			shell_value,
			home_value,
			mcp_string,
			user_custom_instructions,
		)
		return systemPrompt
	}
	/**
	 * Loads our test cases from a directory of JSON files
	 */
	loadTestCases(testDirectoryPath: string, isVerbose: boolean): TestCase[] {
		const testCasesArray: TestCase[] = []
		const dirents = fs.readdirSync(testDirectoryPath, { withFileTypes: true })
		for (const dirent of dirents) {
			if (dirent.isFile() && dirent.name.endsWith(".json")) {
				const testFilePath = path.join(testDirectoryPath, dirent.name)
				const fileContent = fs.readFileSync(testFilePath, "utf8")
				const testCase: TestCase = JSON.parse(fileContent)
				// Use the filename (without extension) as the test_id if not provided
				if (!testCase.test_id) {
					testCase.test_id = path.parse(dirent.name).name
				}
				// Filter out cases with missing file_contents
				if (!testCase.file_contents || testCase.file_contents.trim() === "") {
					log(isVerbose, `Skipping case ${testCase.test_id}: missing or empty file_contents.`);
					continue;
				}
				testCasesArray.push(testCase)
			}
		}
		return testCasesArray
	}
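
	// A test case file is expected to look roughly like this (an illustrative sketch:
	// only the fields referenced by this runner are shown, and all values are placeholders):
	//   {
	//     "test_id": "optional; defaults to the filename without extension",
	//     "file_path": "src/example.ts",
	//     "file_contents": "...original file text...",
	//     "system_prompt_details": { "cwd_value": "...", "os_value": "...", "shell_value": "..." },
	//     "messages": [{ "role": "user", "text": "..." }],
	//     "original_diff_edit_tool_call_message": "only required for --replay runs"
	//   }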
	/**
	 * Saves the test results to the specified output directory.
	 */
	saveTestResults(results: TestResultSet, outputPath: string) {
		// Ensure the output directory exists
		if (!fs.existsSync(outputPath)) {
			fs.mkdirSync(outputPath, { recursive: true })
		}
		// Write each test result to its own file
		for (const testId in results) {
			const outputFilePath = path.join(outputPath, `${testId}.json`)
			const testResult = results[testId]
			fs.writeFileSync(outputFilePath, JSON.stringify(testResult, null, 2))
		}
	}
	async runDatabaseReplay(replayRunId: string, diffApplyFile: string, isVerbose: boolean) {
		log(isVerbose, `Starting database replay for run_id: ${replayRunId}`)
		log(isVerbose, `Using diff apply file: ${diffApplyFile}`)

		// 1. Get the correct diffing function
		const diffEditingFunctions: Record<string, any> = {
			"diff-06-06-25": constructNewFileContent_06_06_25,
			"diff-06-23-25": constructNewFileContent_06_23_25,
			"diff-06-25-25": constructNewFileContent_06_25_25,
			"diff-06-26-25": constructNewFileContent_06_26_25,
			constructNewFileContentV3: constructNewFileContentV3,
		}
		const constructNewFileContent = diffEditingFunctions[diffApplyFile]
		if (!constructNewFileContent) {
			throw new Error(`Could not find diff apply function for: ${diffApplyFile}`)
		}
		log(isVerbose, `Successfully loaded diff apply function: ${diffApplyFile}`)

		// 2. Fetch original run data
		const originalResults = await getResultsByRun(replayRunId)
		if (originalResults.length === 0) {
			throw new Error(`No results found for run_id: ${replayRunId}`)
		}
		log(isVerbose, `Found ${originalResults.length} results to replay.`)
		const originalRun = await getBenchmarkRun(replayRunId)
		if (!originalRun) {
			throw new Error(`Could not find original run with id ${replayRunId}`)
		}

		// 3. Create a new benchmark run for the replay
		const replayRunDescription = `Replay of run ${replayRunId} using ${diffApplyFile}`
		this.currentRunId = await createBenchmarkRun({
			description: replayRunDescription,
			system_prompt_hash: originalRun.system_prompt_hash,
		})
		log(isVerbose, `Created new run for replay: ${this.currentRunId}`)

		// 4. Set up processing functions for the new run
		this.processingFunctionsHash = await upsertProcessingFunctions({
			name: `replay-${diffApplyFile}`,
			parsing_function: "parseAssistantMessageV2",
			diff_edit_function: diffApplyFile,
		})
		// 5. Process each result from the original run
		let replayedCount = 0
		const caseIdMirror: Map<string, string> = new Map()
		for (const originalResult of originalResults) {
			// 5a. Basic validation to ensure we can even process this
			if (!originalResult.case_id) {
				log(isVerbose, `Skipping result ${originalResult.result_id} due to missing case_id.`)
				continue
			}

			// 5b. Mirror the case for the new run, reusing it if already created
			let newCaseId = caseIdMirror.get(originalResult.case_id)
			if (!newCaseId) {
				const originalCase = await getCaseById(originalResult.case_id)
				if (!originalCase) {
					log(isVerbose, `Skipping result ${originalResult.result_id} because the original case could not be found.`)
					continue
				}
				newCaseId = await createCase({
					run_id: this.currentRunId,
					description: `Replay of case ${originalCase.case_id} from run ${replayRunId}`,
					system_prompt_hash: originalCase.system_prompt_hash,
					task_id: originalCase.task_id,
					tokens_in_context: originalCase.tokens_in_context,
					file_hash: originalCase.file_hash,
				})
				caseIdMirror.set(originalResult.case_id, newCaseId)
			}

			// 5c. Determine whether the original attempt was a "valid attempt"
			const isValidOriginalAttempt = originalResult.error_enum === null || originalResult.error_enum === 3 // 3 is diff_edit_error
			const newResultInput: CreateResultInput = {
				...(originalResult as any),
				run_id: this.currentRunId,
				case_id: newCaseId,
				processing_functions_hash: this.processingFunctionsHash,
			}
			delete (newResultInput as any).result_id

			if (isValidOriginalAttempt) {
				// This was a valid attempt. Re-run the diff algorithm.
				const originalCase = await getCaseById(originalResult.case_id)
				if (!originalCase) {
					log(isVerbose, ` [WARN] Replay for result ${originalResult.result_id}: Could not find original case. Copying original result.`)
					newResultInput.succeeded = originalResult.succeeded
					newResultInput.error_enum = originalResult.error_enum
				} else {
					const originalFile = originalCase.file_hash ? await getFileByHash(originalCase.file_hash) : null
					const parsedToolCall = originalResult.parsed_tool_call_json ? JSON.parse(originalResult.parsed_tool_call_json)[0] : null
					const diffContent = parsedToolCall?.input?.diff
					if (originalFile && diffContent) {
						let diffSuccess = false
						try {
							await constructNewFileContent(diffContent, originalFile.content, true)
							diffSuccess = true
							log(isVerbose, ` [OK] Replay for task ${originalCase.task_id}: Diff applied successfully.`)
						} catch (e) {
							diffSuccess = false
							log(isVerbose, ` [FAIL] Replay for task ${originalCase.task_id}: New diff algorithm failed.`)
						}
						newResultInput.succeeded = diffSuccess
						newResultInput.error_enum = diffSuccess ? undefined : 3 // 3 = diff_edit_error
					} else {
						// Something is wrong with the ground truth data, just copy it.
						log(
							isVerbose,
							` [WARN] Replay for task ${originalCase.task_id}: Valid original attempt but missing file or diff content. Copying original result.`,
						)
						newResultInput.succeeded = originalResult.succeeded
						newResultInput.error_enum = originalResult.error_enum
					}
				}
			} else {
				// This was not a valid attempt. Just copy the original result's outcome.
				log(isVerbose, ` [SKIP] Replay for case ${originalResult.case_id}: Invalid original attempt. Copying original result.`)
				newResultInput.succeeded = originalResult.succeeded
				newResultInput.error_enum = originalResult.error_enum
			}

			await insertResult(newResultInput)
			replayedCount++
		}

		log(isVerbose, `\n✓ Database replay completed successfully.`)
		log(isVerbose, ` Total original results: ${originalResults.length}`)
		log(isVerbose, ` Total replayed results: ${replayedCount}`)
		log(isVerbose, ` New run ID: ${this.currentRunId}`)
	}
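
	// Illustrative invocation of this replay path (assumes execution via ts-node; the
	// run id is a placeholder):
	//   npx ts-node TestRunner.ts --replay-run-id "run_abc123" --diff-apply-file "diff-06-26-25" -v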
	/**
	 * Run a single test example
	 */
	async runSingleTest(testCase: ProcessedTestCase, testConfig: TestConfig, isVerbose: boolean = false): Promise<TestResult> {
		if (testConfig.replay && !testCase.original_diff_edit_tool_call_message) {
			return {
				success: false,
				error: "missing_original_diff_edit_tool_call_message",
				errorString: `Test case ${testCase.test_id} is missing 'original_diff_edit_tool_call_message' for replay.`,
			}
		}
		const customSystemPrompt = this.constructSystemPrompt(testCase.system_prompt_details, testConfig.system_prompt_name)
		// The messages exclude the system prompt and contain everything up to the first
		// replace_in_file tool call that resulted in a diff edit error.
		const input: TestInput = {
			apiKey: this.apiKey,
			systemPrompt: customSystemPrompt,
			messages: testCase.messages,
			modelId: testConfig.model_id,
			originalFile: testCase.file_contents,
			originalFilePath: testCase.file_path,
			parsingFunction: testConfig.parsing_function,
			diffEditFunction: testConfig.diff_edit_function,
			thinkingBudgetTokens: testConfig.thinking_tokens_budget,
			originalDiffEditToolCallMessage: testConfig.replay ? testCase.original_diff_edit_tool_call_message : undefined,
			diffApplyFile: testConfig.diff_apply_file,
			provider: this.provider,
			isVerbose: isVerbose,
		}
		if (isVerbose) {
			log(isVerbose, ` Sending request to ${testConfig.model_id} for test case ${testCase.test_id}...`);
		}
		return await runSingleEvaluation(input)
	}
	/**
	 * Runs all the test cases synchronously
	 */
	async runAllTests(testCases: ProcessedTestCase[], testConfig: TestConfig, isVerbose: boolean): Promise<TestResultSet> {
		const results: TestResultSet = {}
		// Initialize database run
		try {
			await this.initializeDatabaseRun(testConfig, testCases, isVerbose);
		} catch (error) {
			log(isVerbose, `Warning: Failed to initialize database: ${error}`);
		}
		for (const testCase of testCases) {
			results[testCase.test_id] = []
			log(isVerbose, `-Running test: ${testCase.test_id}`)
			for (let i = 0; i < testConfig.number_of_runs; i++) {
				log(isVerbose, ` Attempt ${i+1}/${testConfig.number_of_runs} for ${testCase.test_id}...`);
				const result = await this.runSingleTest(testCase, testConfig, isVerbose)
				results[testCase.test_id].push(result)
				// Log result status
				if (isVerbose) {
					if (result.success) {
						log(isVerbose, ` ✓ Attempt ${i+1} completed successfully`);
					} else {
						log(isVerbose, ` ✗ Attempt ${i+1} failed (error: ${result.error || 'unknown'})`);
					}
				}
				// Store result in database
				try {
					await this.storeResultInDatabase(result, testCase.test_id, testConfig.model_id);
				} catch (error) {
					log(isVerbose, `Warning: Failed to store result in database: ${error}`);
				}
			}
		}
		return results
	}
	/**
	 * Runs all of the test cases asynchronously, with a concurrency limit
	 */
	async runAllTestsParallel(
		testCases: ProcessedTestCase[],
		testConfig: TestConfig,
		isVerbose: boolean,
		maxConcurrency: number = 20,
	): Promise<TestResultSet> {
		const results: TestResultSet = {}
		testCases.forEach((tc) => {
			results[tc.test_id] = []
		})
		// Initialize database run
		try {
			await this.initializeDatabaseRun(testConfig, testCases, isVerbose);
		} catch (error) {
			log(isVerbose, `Warning: Failed to initialize database: ${error}`);
		}
		// Create a flat list of all individual runs we need to execute
		const allRuns = testCases.flatMap((testCase) =>
			Array(testConfig.number_of_runs)
				.fill(null)
				.map(() => testCase),
		)
		for (let i = 0; i < allRuns.length; i += maxConcurrency) {
			const batch = allRuns.slice(i, i + maxConcurrency)
			const batchPromises = batch.map((testCase) => {
				log(isVerbose, ` Running test for ${testCase.test_id}...`);
				return this.runSingleTest(testCase, testConfig, isVerbose).then((result) => ({
					...result,
					test_id: testCase.test_id,
				}))
			})
			const batchResults = await Promise.all(batchPromises)
			// Calculate the total cost for this batch
			const batchCost = batchResults.reduce((total, result) => {
				return total + (result.streamResult?.usage?.totalCost || 0)
			}, 0)
			// Populate the results dictionary and store in database
			for (const result of batchResults) {
				if (result.test_id) {
					results[result.test_id].push(result)
					// Store result in database
					try {
						await this.storeResultInDatabase(result, result.test_id, testConfig.model_id);
					} catch (error) {
						log(isVerbose, `Warning: Failed to store result in database: ${error}`);
					}
				}
			}
			const batchNumber = i / maxConcurrency + 1
			const totalBatches = Math.ceil(allRuns.length / maxConcurrency)
			log(isVerbose, `-Completed batch ${batchNumber} of ${totalBatches}... (Batch Cost: $${batchCost.toFixed(6)})`)
		}
		return results
	}
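
	// Batching arithmetic, for intuition (numbers made up): 5 cases with
	// number_of_runs = 10 yield 50 runs total; at maxConcurrency = 20 that is
	// Math.ceil(50 / 20) = 3 batches of 20, 20, and 10 runs.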
	/**
	 * Check if a test result is a valid attempt (no error_enum 1, 6, or 7)
	 */
	isValidAttempt(result: TestResult): boolean {
		// Invalid if the error is one of: no_tool_calls, wrong_tool_call, wrong_file_edited
		const invalidErrors = ['no_tool_calls', 'wrong_tool_call', 'wrong_file_edited'];
		return !invalidErrors.includes(result.error || '');
	}
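
	// These three errors map to error_enum values 1, 6, and 7 in mapErrorToEnum. For
	// example, a "wrong_tool_call" result is retried rather than counted, while a
	// "diff_edit_error" (enum 3) still counts as a valid, if failed, attempt.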
	/**
	 * Runs all tests for a specific model (assumes the database run is already initialized).
	 * Keeps retrying until we get the requested number of valid attempts per case.
	 */
	async runAllTestsForModel(testCases: ProcessedTestCase[], testConfig: TestConfig, isVerbose: boolean): Promise<TestResultSet> {
		const results: TestResultSet = {}
		for (const testCase of testCases) {
			results[testCase.test_id] = []
			let validAttempts = 0;
			let totalAttempts = 0;
			log(isVerbose, `-Running test: ${testCase.test_id}`)
			// Keep trying until we get the requested number of valid attempts
			while (validAttempts < testConfig.number_of_runs) {
				totalAttempts++;
				log(isVerbose, ` Attempt ${totalAttempts} for ${testCase.test_id} (${validAttempts}/${testConfig.number_of_runs} valid so far)...`);
				const result = await this.runSingleTest(testCase, testConfig, isVerbose)
				results[testCase.test_id].push(result)
				// Check if this was a valid attempt
				const isValid = this.isValidAttempt(result);
				if (isValid) {
					validAttempts++;
					log(isVerbose, ` ✓ Valid attempt ${validAttempts}/${testConfig.number_of_runs} completed (${result.success ? 'SUCCESS' : 'FAILED'})`);
				} else {
					log(isVerbose, ` ✗ Invalid attempt (error: ${result.error || 'unknown'})`);
				}
				// Store result in database
				try {
					await this.storeResultInDatabase(result, testCase.test_id, testConfig.model_id);
				} catch (error) {
					log(isVerbose, `Warning: Failed to store result in database: ${error}`);
				}
				// Safety check to prevent infinite loops - use the configurable max attempts limit
				if (totalAttempts >= testConfig.max_attempts_per_case) {
					log(isVerbose, ` ⚠️ Reached maximum attempts (${totalAttempts}) for test case ${testCase.test_id}. Only got ${validAttempts}/${testConfig.number_of_runs} valid attempts.`);
					break;
				}
			}
			log(isVerbose, ` ✓ Completed test case ${testCase.test_id}: ${validAttempts}/${testConfig.number_of_runs} valid attempts (${totalAttempts} total attempts)`);
		}
		return results
	}
	/**
	 * Print a summary of the test results
	 */
	printSummary(results: TestResultSet, isVerbose: boolean) {
		let totalRuns = 0
		let totalPasses = 0
		let totalInputTokens = 0
		let totalOutputTokens = 0
		let totalCost = 0
		let runsWithUsageData = 0
		let totalDiffEditSuccesses = 0
		let totalRunsWithToolCalls = 0
		const testCaseIds = Object.keys(results)
		log(isVerbose, "\n=== TEST SUMMARY ===")
		for (const testId of testCaseIds) {
			const testResults = results[testId]
			const passedCount = testResults.filter((r) => r.success && r.diffEditSuccess).length
			const runCount = testResults.length
			totalRuns += runCount
			totalPasses += passedCount
			const runsWithToolCalls = testResults.filter((r) => r.success === true).length
			const diffEditSuccesses = passedCount
			totalRunsWithToolCalls += runsWithToolCalls
			totalDiffEditSuccesses += diffEditSuccesses
			// Accumulate token and cost data
			for (const result of testResults) {
				if (result.streamResult?.usage) {
					totalInputTokens += result.streamResult.usage.inputTokens
					totalOutputTokens += result.streamResult.usage.outputTokens
					totalCost += result.streamResult.usage.totalCost
					runsWithUsageData++
				}
			}
			log(isVerbose, `\n--- Test Case: ${testId} ---`)
			log(isVerbose, ` Runs: ${runCount}`)
			log(isVerbose, ` Passed: ${passedCount}`)
			log(isVerbose, ` Success Rate: ${runCount > 0 ? ((passedCount / runCount) * 100).toFixed(1) : "N/A"}%`)
		}
		log(isVerbose, "\n\n=== OVERALL SUMMARY ===")
		log(isVerbose, `Total Test Cases: ${testCaseIds.length}`)
		log(isVerbose, `Total Runs Executed: ${totalRuns}`)
		log(isVerbose, `Overall Passed: ${totalPasses}`)
		log(isVerbose, `Overall Failed: ${totalRuns - totalPasses}`)
		log(isVerbose, `Overall Success Rate: ${totalRuns > 0 ? ((totalPasses / totalRuns) * 100).toFixed(1) : "N/A"}%`)
		log(isVerbose, "\n\n=== OVERALL DIFF EDIT SUCCESS RATE ===")
		if (totalRunsWithToolCalls > 0) {
			const diffSuccessRate = (totalDiffEditSuccesses / totalRunsWithToolCalls) * 100
			log(isVerbose, `Total Runs with Successful Tool Calls: ${totalRunsWithToolCalls}`)
			log(isVerbose, `Total Runs with Successful Diff Edits: ${totalDiffEditSuccesses}`)
			log(isVerbose, `Diff Edit Success Rate: ${diffSuccessRate.toFixed(1)}%`)
		} else {
			log(isVerbose, "No successful tool calls to analyze for diff edit success.")
		}
		log(isVerbose, "\n\n=== TOKEN & COST ANALYSIS ===")
		if (runsWithUsageData > 0) {
			log(isVerbose, `Total Input Tokens: ${totalInputTokens.toLocaleString()}`)
			log(isVerbose, `Total Output Tokens: ${totalOutputTokens.toLocaleString()}`)
			log(isVerbose, `Total Cost: $${totalCost.toFixed(6)}`)
			log(isVerbose, "---")
			log(
				isVerbose,
				`Avg Input Tokens / Run: ${(totalInputTokens / runsWithUsageData).toLocaleString(undefined, {
					maximumFractionDigits: 0,
				})}`,
			)
			log(
				isVerbose,
				`Avg Output Tokens / Run: ${(totalOutputTokens / runsWithUsageData).toLocaleString(undefined, {
					maximumFractionDigits: 0,
				})}`,
			)
			log(isVerbose, `Avg Cost / Run: $${(totalCost / runsWithUsageData).toFixed(6)}`)
		} else {
			log(isVerbose, "No usage data available to analyze.")
		}
	}
}
async function main() {
	interface EvaluationTask {
		modelId: string;
		testCase: ProcessedTestCase;
		testConfig: TestConfig;
	}

	const program = new Command()
	const defaultTestPath = path.join(__dirname, "cases")
	const defaultOutputPath = path.join(__dirname, "results")
	program
		.name("TestRunner")
		.description("Run evaluation tests for diff editing")
		.version("1.0.0")
		.option("--test-path <path>", "Path to the directory containing test case JSON files", defaultTestPath)
		.option("--output-path <path>", "Path to the directory to save the test output JSON files", defaultOutputPath)
		.option("--model-ids <model_ids>", "Comma-separated list of model IDs to test")
		.option("--system-prompt-name <name>", "The name of the system prompt to use", "basicSystemPrompt")
		.option("-n, --valid-attempts-per-case <number>", "Number of valid attempts per test case per model (will retry until this many valid attempts are collected)", "1")
		.option("--max-attempts-per-case <number>", "Maximum total attempts per test case (default: 10x valid attempts)")
		.option("--max-cases <number>", "Maximum number of test cases to run (limits total cases loaded)")
		.option("--parsing-function <name>", "The parsing function to use", "parseAssistantMessageV2")
		.option("--diff-edit-function <name>", "The diff editing function to use", "diff-06-26-25")
		.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
		.option("--provider <provider>", "API provider to use (openrouter, openai)", "openrouter")
		.option("--parallel", "Run tests in parallel", false)
		.option("--replay", "Run evaluation from a pre-recorded LLM output, skipping the API call", false)
		.option("--replay-run-id <run_id>", "The ID of the run to replay from the database")
		.option("--diff-apply-file <filename>", "The name of the diff apply file to use for the replay")
		.option("--save-locally", "Save results to local JSON files in addition to the database", false)
		.option("-v, --verbose", "Enable verbose logging", false)
		.option("--max-concurrency <number>", "Maximum number of parallel requests", "80")
	program.parse(process.argv)
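
	// Example invocation (illustrative; assumes execution via ts-node, and the model id
	// is a placeholder):
	//   npx ts-node TestRunner.ts --model-ids "anthropic/claude-sonnet-4" \
	//     --valid-attempts-per-case 3 --max-concurrency 40 -v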
	const options = program.opts()
	const isVerbose = options.verbose
	const testPath = options.testPath
	const outputPath = options.outputPath
	const saveLocally = options.saveLocally
	const maxConcurrency = parseInt(options.maxConcurrency, 10);

	// Parse model IDs from the comma-separated string
	const modelIds = options.modelIds ? options.modelIds.split(',').map((id: string) => id.trim()) : [];
	if (modelIds.length === 0) {
		console.error("Error: --model-ids is required and must contain at least one model ID");
		process.exit(1);
	}

	const validAttemptsPerCase = parseInt(options.validAttemptsPerCase, 10);
	// Compute the dynamic default for max attempts: 10x valid attempts if not specified
	const maxAttemptsPerCase = options.maxAttemptsPerCase
		? parseInt(options.maxAttemptsPerCase, 10)
		: validAttemptsPerCase * 10;

	const runner = new NodeTestRunner(options.replay || !!options.replayRunId, options.provider)
	if (options.replayRunId) {
		if (!options.diffApplyFile) {
			console.error("Error: --diff-apply-file is required when using --replay-run-id")
			process.exit(1)
		}
		await runner.runDatabaseReplay(options.replayRunId, options.diffApplyFile, isVerbose)
		return
	}
	try {
		const startTime = Date.now()

		// Load OpenRouter model data first
		openRouterModelDataGlobal = await loadOpenRouterModelData(isVerbose);
		if (Object.keys(openRouterModelDataGlobal).length === 0 && isVerbose) {
			log(isVerbose, "Warning: Could not load OpenRouter model data. Context window filtering might be affected for OpenRouter models.");
		}

		const runner = new NodeTestRunner(options.replay, options.provider)
		let allLoadedTestCases = runner.loadTestCases(testPath, isVerbose)
		const allProcessedTestCasesGlobal: ProcessedTestCase[] = allLoadedTestCases.map((tc) => ({
			...tc,
			messages: runner.transformMessages(tc.messages),
		}));

		log(isVerbose, `-Loaded ${allLoadedTestCases.length} initial test cases.`)
		log(isVerbose, `-Testing ${modelIds.length} model(s): ${modelIds.join(', ')}`)
		log(isVerbose, `-Target: ${validAttemptsPerCase} valid attempts per test case per model (will retry until this many valid attempts are collected)`)
		if (options.replay) {
			log(isVerbose, `-Running in REPLAY mode. No API calls will be made.`)
		}
		log(isVerbose, "Starting tests...\n")

		// Determine the smallest context window among all specified models
		let smallestContextWindow = Infinity;
		for (const modelId of modelIds) {
			let modelInfo = openRouterModelDataGlobal[modelId];
			if (!modelInfo) {
				const foundKey = Object.keys(openRouterModelDataGlobal).find(
					key => key.includes(modelId) || modelId.includes(key)
				);
				if (foundKey) modelInfo = openRouterModelDataGlobal[foundKey];
			}
			const currentModelContext = modelInfo?.contextWindow;
			if (currentModelContext && currentModelContext > 0) {
				if (currentModelContext < smallestContextWindow) {
					smallestContextWindow = currentModelContext;
				}
			} else {
				log(isVerbose, `Warning: Context window for model ${modelId} is unknown or zero. It will not constrain the test case selection.`);
			}
		}
		if (smallestContextWindow === Infinity) {
			log(isVerbose, "Warning: Could not determine a common smallest context window. Proceeding with all loaded cases; context issues may occur.");
		} else {
			log(isVerbose, `Smallest common context window (with padding consideration) across specified models: ${smallestContextWindow} (target for filtering: ${smallestContextWindow - 20000})`);
		}

		let eligibleCasesForThisRun = [...allLoadedTestCases];
		if (smallestContextWindow !== Infinity && smallestContextWindow > 20000) { // Only filter if a valid smallest window is found
			const originalCaseCount = eligibleCasesForThisRun.length;
			eligibleCasesForThisRun = eligibleCasesForThisRun.filter(tc => {
				const systemPromptText = runner.constructSystemPrompt(tc.system_prompt_details, options.systemPromptName);
				const systemPromptTokens = encoding.encode(systemPromptText).length;
				const messagesTokens = runner.estimateTokens(runner.transformMessages(tc.messages));
				const totalInputTokens = systemPromptTokens + messagesTokens;
				return totalInputTokens + 20000 <= smallestContextWindow; // 20k padding
			});
			log(isVerbose, `Filtered to ${eligibleCasesForThisRun.length} cases (from ${originalCaseCount}) to fit the smallest context window of ${smallestContextWindow} (with padding).`);
		}
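
		// Worked example (numbers made up): with a smallest window of 128,000 tokens, a
		// case whose system prompt + messages total 100,000 tokens is kept
		// (100,000 + 20,000 <= 128,000), while a 115,000-token case is dropped.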
		// Apply the max-cases limit, if specified, to the context-filtered list
		// (commander delivers option values as strings, so parse explicitly)
		const maxCases = options.maxCases ? parseInt(options.maxCases, 10) : 0;
		if (maxCases > 0 && eligibleCasesForThisRun.length > maxCases) {
			log(isVerbose, `Limiting to ${maxCases} test cases (out of ${eligibleCasesForThisRun.length} eligible).`);
			eligibleCasesForThisRun = eligibleCasesForThisRun.slice(0, maxCases);
		}
		if (eligibleCasesForThisRun.length === 0) {
			log(isVerbose, `No eligible test cases found after filtering for all specified models. Exiting.`);
			process.exit(0);
		}

		const processedEligibleCasesForRun: ProcessedTestCase[] = eligibleCasesForThisRun.map((tc) => ({
			...tc,
			messages: runner.transformMessages(tc.messages),
		}));
		// Initialize ONE database run for ALL models using the commonly eligible cases
		const runDescription = `Models: ${modelIds.join(', ')}, Common Cases: ${processedEligibleCasesForRun.length}, Valid attempts per case: ${validAttemptsPerCase}`;
		await runner.initializeMultiModelRun(processedEligibleCasesForRun, options.systemPromptName, options.parsingFunction, options.diffEditFunction, runDescription, isVerbose);

		// Create a global task queue
		const globalTaskQueue: EvaluationTask[] = modelIds.flatMap(modelId =>
			processedEligibleCasesForRun.map(testCase => ({
				modelId,
				testCase,
				testConfig: {
					model_id: modelId,
					system_prompt_name: options.systemPromptName,
					number_of_runs: validAttemptsPerCase,
					max_attempts_per_case: maxAttemptsPerCase,
					parsing_function: options.parsingFunction,
					diff_edit_function: options.diffEditFunction,
					thinking_tokens_budget: parseInt(options.thinkingBudget, 10),
					replay: options.replay,
				}
			}))
		);

		const results: TestResultSet = {};
		const taskStates: Record<string, { valid: number; total: number; pending: number }> = {};
		globalTaskQueue.forEach(({ modelId, testCase }) => {
			const taskId = `${modelId}-${testCase.test_id}`;
			taskStates[taskId] = { valid: 0, total: 0, pending: 0 };
			if (!results[testCase.test_id]) {
				results[testCase.test_id] = [];
			}
		});
		let remainingTasks = [...globalTaskQueue];
		while (remainingTasks.length > 0) {
			// Fill a batch, never exceeding the remaining valid-attempt budget per task
			const batch: EvaluationTask[] = [];
			for (const task of remainingTasks) {
				if (batch.length >= maxConcurrency) break;
				const taskId = `${task.modelId}-${task.testCase.test_id}`;
				if ((taskStates[taskId].valid + taskStates[taskId].pending) < validAttemptsPerCase) {
					batch.push(task);
					taskStates[taskId].pending++;
				}
			}
			if (batch.length === 0) {
				await new Promise(resolve => setTimeout(resolve, 100));
				continue;
			}
			const batchPromises = batch.map(task => {
				const taskId = `${task.modelId}-${task.testCase.test_id}`;
				taskStates[taskId].total++;
				log(isVerbose, ` Attempt ${taskStates[taskId].total} for ${task.testCase.test_id} with ${task.modelId} (${taskStates[taskId].valid} valid, ${taskStates[taskId].pending - 1} pending)...`);
				return runner.runSingleTest(task.testCase, task.testConfig, isVerbose).then(result => ({
					...result,
					test_id: task.testCase.test_id,
					modelId: task.modelId,
				}));
			});
			const batchResults = await Promise.all(batchPromises);
			for (const result of batchResults) {
				const taskId = `${result.modelId}-${result.test_id}`;
				taskStates[taskId].pending--;
				results[result.test_id].push(result);
				if (runner.isValidAttempt(result)) {
					taskStates[taskId].valid++;
					log(isVerbose, ` ✓ Valid attempt ${taskStates[taskId].valid}/${validAttemptsPerCase} for ${result.test_id} with ${result.modelId} completed (${result.success ? 'SUCCESS' : 'FAILED'})`);
				} else {
					log(isVerbose, ` ✗ Invalid attempt for ${result.test_id} with ${result.modelId} (error: ${result.error || 'unknown'})`);
				}
				await runner.storeResultInDatabase(result, result.test_id, result.modelId);
			}
			// Drop tasks that hit their attempt cap or reached their valid-attempt target
			remainingTasks = remainingTasks.filter(task => {
				const taskId = `${task.modelId}-${task.testCase.test_id}`;
				if (taskStates[taskId].total >= task.testConfig.max_attempts_per_case) {
					log(isVerbose, ` ⚠️ Reached maximum attempts for ${task.testCase.test_id} with ${task.modelId}.`);
					return false;
				}
				return taskStates[taskId].valid < validAttemptsPerCase;
			});
			const batchCost = batchResults.reduce((total, result) => total + (result.streamResult?.usage?.totalCost || 0), 0);
			log(isVerbose, `-Completed batch... (Batch Cost: $${batchCost.toFixed(6)}, Remaining tasks: ${remainingTasks.length})`);
		}
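
		// Scheduling note: a task ("model × case") only enters a batch while
		// valid + pending < validAttemptsPerCase, so in-flight attempts never overshoot
		// the target; e.g. with a target of 3 and 1 valid + 2 pending, no new attempt
		// is queued until a pending one resolves.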
		// Print a summary for each model
		for (const modelId of modelIds) {
			const modelResults: TestResultSet = {};
			Object.keys(results).forEach(testId => {
				modelResults[testId] = results[testId].filter(r => (r as any).modelId === modelId);
			});
			log(isVerbose, `\n=== Results for Model: ${modelId} ===`);
			runner.printSummary(modelResults, isVerbose);
		}

		const endTime = Date.now()
		const durationSeconds = ((endTime - startTime) / 1000).toFixed(2)
		log(isVerbose, `\n-Total execution time: ${durationSeconds} seconds`)

		// Save results locally if requested
		if (saveLocally) {
			runner.saveTestResults(results, outputPath);
			log(isVerbose, `✓ Results also saved to JSON files in ${outputPath}`);
		}
		log(isVerbose, `\n✓ All results stored in database. Use the dashboard to view results.`)
	} catch (error) {
		console.error("\nError running tests:", error)
		process.exit(1)
	}
}

if (require.main === module) {
	main()
}