TestRunner.ts

import { runSingleEvaluation, TestInput, TestResult } from "./ClineWrapper"
import { parseAssistantMessageV2, AssistantMessageContent } from "./parsing/parse-assistant-message-06-06-25"
import { constructNewFileContent as constructNewFileContent_06_06_25 } from "./diff-apply/diff-06-06-25"
import { constructNewFileContent as constructNewFileContent_06_23_25 } from "./diff-apply/diff-06-23-25"
import { constructNewFileContent as constructNewFileContent_06_25_25 } from "./diff-apply/diff-06-25-25"
import { constructNewFileContent as constructNewFileContent_06_26_25 } from "./diff-apply/diff-06-26-25"
import { constructNewFileContent as constructNewFileContentV3 } from "../../src/core/assistant-message/diff"
import { basicSystemPrompt } from "./prompts/basicSystemPrompt-06-06-25"
import { claude4SystemPrompt } from "./prompts/claude4SystemPrompt-06-06-25"
import { formatResponse, log } from "./helpers"
import { Anthropic } from "@anthropic-ai/sdk"
import * as fs from "fs"
import * as path from "path"
import { Command } from "commander"
import { InputMessage, ProcessedTestCase, TestCase, TestConfig, SystemPromptDetails, ConstructSystemPromptFn } from "./types"
import { loadOpenRouterModelData, EvalOpenRouterModelInfo } from "./openRouterModelsHelper"
import {
	getDatabase,
	upsertSystemPrompt,
	upsertProcessingFunctions,
	upsertFile,
	createBenchmarkRun,
	createCase,
	insertResult,
	DatabaseClient,
	CreateResultInput,
	getResultsByRun,
	getCaseById,
	getFileByHash,
	getBenchmarkRun,
} from "./database"

// Load environment variables from .env file
import * as dotenv from "dotenv"
dotenv.config({ path: path.join(__dirname, "../.env") })

// tiktoken for token counting
import { get_encoding } from "tiktoken";
const encoding = get_encoding("cl100k_base");

// Global store for fetched OpenRouter model data
let openRouterModelDataGlobal: Record<string, EvalOpenRouterModelInfo> = {};

const systemPromptGeneratorLookup: Record<string, ConstructSystemPromptFn> = {
	basicSystemPrompt: basicSystemPrompt,
	claude4SystemPrompt: claude4SystemPrompt,
}
type TestResultSet = { [test_id: string]: (TestResult & { test_id?: string })[] }

class NodeTestRunner {
	private apiKey: string | undefined
	private provider: string
	private currentRunId: string | null = null
	private systemPromptHash: string | null = null
	private processingFunctionsHash: string | null = null
	private caseIdMap: Map<string, string> = new Map() // test_id -> case_id mapping

	constructor(isReplay: boolean, provider: string = "openrouter") {
		this.provider = provider
		if (!isReplay) {
			if (provider === "openai") {
				this.apiKey = process.env.OPENAI_API_KEY
				if (!this.apiKey) {
					throw new Error("OPENAI_API_KEY environment variable not set for a non-replay run with OpenAI provider.")
				}
			} else {
				this.apiKey = process.env.OPENROUTER_API_KEY
				if (!this.apiKey) {
					throw new Error("OPENROUTER_API_KEY environment variable not set for a non-replay run with OpenRouter provider.")
				}
			}
		}
	}
	/**
	 * Initialize database run and store system prompt and processing functions
	 */
	async initializeDatabaseRun(testConfig: TestConfig, testCases: ProcessedTestCase[], isVerbose: boolean): Promise<string> {
		try {
			// Generate a sample system prompt to hash (using the first test case)
			const sampleSystemPrompt = testCases.length > 0
				? this.constructSystemPrompt(testCases[0].system_prompt_details, testConfig.system_prompt_name)
				: "default-system-prompt";

			// Store system prompt
			this.systemPromptHash = await upsertSystemPrompt({
				name: testConfig.system_prompt_name,
				content: sampleSystemPrompt
			});

			// Store processing functions
			this.processingFunctionsHash = await upsertProcessingFunctions({
				name: `${testConfig.parsing_function}-${testConfig.diff_edit_function}`,
				parsing_function: testConfig.parsing_function,
				diff_edit_function: testConfig.diff_edit_function
			});

			// Create benchmark run
			const runDescription = `Model: ${testConfig.model_id}, Cases: ${testCases.length}, Runs per case: ${testConfig.number_of_runs}`;
			this.currentRunId = await createBenchmarkRun({
				description: runDescription,
				system_prompt_hash: this.systemPromptHash
			});

			log(isVerbose, `✓ Database run initialized: ${this.currentRunId}`);

			// Create case records
			await this.createDatabaseCases(testCases, isVerbose);

			return this.currentRunId;
		} catch (error) {
			console.error("Failed to initialize database run:", error);
			throw error;
		}
	}
	/**
	 * Initialize multi-model database run (one run for all models)
	 */
	async initializeMultiModelRun(testCases: ProcessedTestCase[], systemPromptName: string, parsingFunction: string, diffEditFunction: string, runDescription: string, isVerbose: boolean): Promise<string> {
		try {
			// Generate a sample system prompt to hash (using the first test case)
			const sampleSystemPrompt = testCases.length > 0
				? this.constructSystemPrompt(testCases[0].system_prompt_details, systemPromptName)
				: "default-system-prompt";

			// Store system prompt
			this.systemPromptHash = await upsertSystemPrompt({
				name: systemPromptName,
				content: sampleSystemPrompt
			});

			// Store processing functions
			this.processingFunctionsHash = await upsertProcessingFunctions({
				name: `${parsingFunction}-${diffEditFunction}`,
				parsing_function: parsingFunction,
				diff_edit_function: diffEditFunction
			});

			// Create benchmark run
			this.currentRunId = await createBenchmarkRun({
				description: runDescription,
				system_prompt_hash: this.systemPromptHash
			});

			log(isVerbose, `✓ Multi-model database run initialized: ${this.currentRunId}`);

			// Create case records
			await this.createDatabaseCases(testCases, isVerbose);

			return this.currentRunId;
		} catch (error) {
			console.error("Failed to initialize multi-model database run:", error);
			throw error;
		}
	}
	/**
	 * Create database case records for all test cases
	 */
	async createDatabaseCases(testCases: ProcessedTestCase[], isVerbose: boolean): Promise<void> {
		if (!this.currentRunId || !this.systemPromptHash) {
			throw new Error("Database run not initialized");
		}
		for (const testCase of testCases) {
			try {
				// Store file content if available
				let fileHash: string | undefined;
				if (testCase.file_contents && testCase.file_path) {
					fileHash = await upsertFile({
						filepath: testCase.file_path,
						content: testCase.file_contents
					});
				}
				// Calculate tokens in context (approximate)
				const tokensInContext = this.estimateTokens(testCase.messages);
				// Create case record
				const caseId = await createCase({
					run_id: this.currentRunId,
					description: testCase.test_id,
					system_prompt_hash: this.systemPromptHash,
					task_id: testCase.test_id,
					tokens_in_context: tokensInContext,
					file_hash: fileHash
				});
				this.caseIdMap.set(testCase.test_id, caseId);
			} catch (error) {
				console.error(`Failed to create database case for ${testCase.test_id}:`, error);
				// Continue with other cases
			}
		}
		log(isVerbose, `✓ Created ${this.caseIdMap.size} database case records`);
	}
	/**
	 * Store replay result in database, copying original data but with new diffing results
	 */
	async storeReplayResultInDatabase(replayResult: TestResult, originalResult: any, testId: string, newCaseId: string): Promise<void> {
		if (!this.currentRunId || !this.processingFunctionsHash) {
			return; // Skip if database not initialized
		}
		try {
			// Map error string to error enum (simple mapping)
			const errorEnum = this.mapErrorToEnum(replayResult.error);

			// Store diff edit content if available
			let fileEditedHash: string | undefined;
			if (replayResult.diffEdit) {
				fileEditedHash = await upsertFile({
					filepath: `diff-edit-${testId}`,
					content: replayResult.diffEdit
				});
			}

			// Calculate basic metrics from the diff edit if available
			let numEdits = 0;
			let numLinesAdded = 0;
			let numLinesDeleted = 0;
			if (replayResult.diffEdit) {
				// Simple parsing to count edits - count SEARCH/REPLACE blocks
				const searchBlocks = (replayResult.diffEdit.match(/------- SEARCH/g) || []).length;
				numEdits = searchBlocks;
				// Count added/deleted lines (rough approximation)
				const lines = replayResult.diffEdit.split('\n');
				for (const line of lines) {
					if (line.startsWith('+') && !line.startsWith('+++')) {
						numLinesAdded++;
					} else if (line.startsWith('-') && !line.startsWith('---')) {
						numLinesDeleted++;
					}
				}
			}

			// Copy original result data but update replay-specific fields
			const resultInput: CreateResultInput = {
				run_id: this.currentRunId, // New run ID
				case_id: newCaseId, // New case ID
				model_id: originalResult.model_id, // Copy from original
				processing_functions_hash: this.processingFunctionsHash, // New processing functions
				succeeded: replayResult.success && (replayResult.diffEditSuccess ?? false), // New result
				error_enum: errorEnum, // New error if any
				num_edits: numEdits || originalResult.num_edits, // New or original
				num_lines_deleted: numLinesDeleted || originalResult.num_lines_deleted, // New or original
				num_lines_added: numLinesAdded || originalResult.num_lines_added, // New or original
				// Copy timing and cost data from the original (since we didn't make API calls)
				time_to_first_token_ms: originalResult.time_to_first_token_ms,
				time_to_first_edit_ms: originalResult.time_to_first_edit_ms,
				time_round_trip_ms: originalResult.time_round_trip_ms,
				cost_usd: originalResult.cost_usd,
				completion_tokens: originalResult.completion_tokens,
				// Use the original model output (since we're replaying)
				raw_model_output: originalResult.raw_model_output,
				file_edited_hash: fileEditedHash || originalResult.file_edited_hash,
				parsed_tool_call_json: replayResult.toolCalls ? JSON.stringify(replayResult.toolCalls) : originalResult.parsed_tool_call_json
			};

			await insertResult(resultInput);
		} catch (error) {
			console.error(`Failed to store replay result in database for ${testId}:`, error);
			// Continue execution - don't fail the test run
		}
	}
	/**
	 * Store test result in database
	 */
	async storeResultInDatabase(result: TestResult, testId: string, modelId: string): Promise<void> {
		if (!this.currentRunId || !this.processingFunctionsHash) {
			return; // Skip if database not initialized
		}
		const caseId = this.caseIdMap.get(testId);
		if (!caseId) {
			return; // Skip if case not found
		}
		try {
			// Map error string to error enum (simple mapping)
			const errorEnum = this.mapErrorToEnum(result.error);

			// Store diff edit content if available
			let fileEditedHash: string | undefined;
			if (result.diffEdit) {
				fileEditedHash = await upsertFile({
					filepath: `diff-edit-${testId}`,
					content: result.diffEdit
				});
			}

			// Calculate basic metrics from the diff edit if available
			let numEdits = 0;
			let numLinesAdded = 0;
			let numLinesDeleted = 0;
			if (result.diffEdit) {
				// Simple parsing to count edits - count SEARCH/REPLACE blocks
				const searchBlocks = (result.diffEdit.match(/------- SEARCH/g) || []).length;
				numEdits = searchBlocks;
				// Count added/deleted lines (rough approximation)
				const lines = result.diffEdit.split('\n');
				for (const line of lines) {
					if (line.startsWith('+') && !line.startsWith('+++')) {
						numLinesAdded++;
					} else if (line.startsWith('-') && !line.startsWith('---')) {
						numLinesDeleted++;
					}
				}
			}

			const resultInput: CreateResultInput = {
				run_id: this.currentRunId,
				case_id: caseId,
				model_id: modelId,
				processing_functions_hash: this.processingFunctionsHash,
				succeeded: result.success && (result.diffEditSuccess ?? false),
				error_enum: errorEnum,
				num_edits: numEdits || undefined,
				num_lines_deleted: numLinesDeleted || undefined,
				num_lines_added: numLinesAdded || undefined,
				time_to_first_token_ms: result.streamResult?.timing?.timeToFirstTokenMs,
				time_to_first_edit_ms: result.streamResult?.timing?.timeToFirstEditMs,
				time_round_trip_ms: result.streamResult?.timing?.totalRoundTripMs,
				cost_usd: result.streamResult?.usage?.totalCost,
				completion_tokens: result.streamResult?.usage?.outputTokens,
				raw_model_output: result.streamResult?.assistantMessage,
				file_edited_hash: fileEditedHash,
				parsed_tool_call_json: result.toolCalls ? JSON.stringify(result.toolCalls) : undefined
			};

			await insertResult(resultInput);
		} catch (error) {
			console.error(`Failed to store result in database for ${testId}:`, error);
			// Continue execution - don't fail the test run
		}
	}
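
	// For reference, the SEARCH/REPLACE blocks counted above look roughly like this
	// (an illustrative sketch of the marker format the regex matches, not an exact spec):
	//
	//   ------- SEARCH
	//   const old = 1
	//   =======
	//   const updated = 1
	//   +++++++ REPLACE
	//
	// numEdits counts "------- SEARCH" markers; the +/- line counting above is only a
	// rough proxy, since these blocks are not unified diffs.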
	/**
	 * Estimate token count for messages (rough approximation)
	 */
	public estimateTokens(messages: Anthropic.Messages.MessageParam[]): number {
		let totalText = "";
		for (const message of messages) {
			if (Array.isArray(message.content)) {
				for (const block of message.content) {
					if (block.type === 'text') {
						totalText += block.text + "\n";
					}
				}
			} else if (typeof message.content === 'string') {
				totalText += message.content + "\n";
			}
		}
		return encoding.encode(totalText).length;
	}
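
	// Illustrative usage (a sketch; the message below is made up):
	//   const runner = new NodeTestRunner(true)
	//   const n = runner.estimateTokens([{ role: "user", content: "hello world" }])
	// n is the cl100k_base token count of the concatenated text blocks; image blocks
	// contribute nothing, so multimodal contexts are undercounted.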
	/**
	 * Map error string to error enum
	 */
	private mapErrorToEnum(error?: string): number | undefined {
		if (!error) return undefined;
		const errorMap: Record<string, number> = {
			'no_tool_calls': 1,
			'parsing_error': 2,
			'diff_edit_error': 3,
			'missing_original_diff_edit_tool_call_message': 4,
			'api_error': 5,
			'wrong_tool_call': 6,
			'wrong_file_edited': 7,
			'multi_tool_calls': 8,
			'tool_call_params_undefined': 9,
			'other_error': 99
		};
		return errorMap[error] || 99; // 99 for unknown errors
	}
	/**
	 * Convert our messages array into a properly formatted Anthropic messages array
	 */
	transformMessages(messages: InputMessage[]): Anthropic.Messages.MessageParam[] {
		return messages.map((msg) => {
			// Use TextBlockParam here for constructing the input message
			const content: (Anthropic.TextBlockParam | Anthropic.ImageBlockParam)[] = []
			if (msg.text) {
				// This object now correctly matches the TextBlockParam type
				content.push({ type: "text", text: msg.text })
			}
			if (msg.images && Array.isArray(msg.images)) {
				const imageBlocks = formatResponse.imageBlocks(msg.images)
				content.push(...imageBlocks)
			}
			return {
				role: msg.role,
				content: content,
			}
		})
	}
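
	// Example shape (illustrative; the values are placeholders): an InputMessage like
	//   { role: "user", text: "fix the bug", images: ["data:image/png;base64,..."] }
	// becomes an Anthropic MessageParam along the lines of
	//   { role: "user", content: [{ type: "text", text: "fix the bug" }, /* image block */] }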
	/**
	 * Generate the system prompt on the fly
	 */
	constructSystemPrompt(systemPromptDetails: SystemPromptDetails, systemPromptName: string) {
		const systemPromptGenerator = systemPromptGeneratorLookup[systemPromptName]
		const { cwd_value, browser_use, width, height, os_value, shell_value, home_value, mcp_string, user_custom_instructions } =
			systemPromptDetails
		const systemPrompt = systemPromptGenerator(
			cwd_value,
			browser_use,
			width,
			height,
			os_value,
			shell_value,
			home_value,
			mcp_string,
			user_custom_instructions,
		)
		return systemPrompt
	}
	/**
	 * Loads our test cases from a directory of JSON files
	 */
	loadTestCases(testDirectoryPath: string, isVerbose: boolean): TestCase[] {
		const testCasesArray: TestCase[] = []
		const dirents = fs.readdirSync(testDirectoryPath, { withFileTypes: true })
		for (const dirent of dirents) {
			if (dirent.isFile() && dirent.name.endsWith(".json")) {
				const testFilePath = path.join(testDirectoryPath, dirent.name)
				const fileContent = fs.readFileSync(testFilePath, "utf8")
				const testCase: TestCase = JSON.parse(fileContent)
				// Use the filename (without extension) as the test_id if not provided
				if (!testCase.test_id) {
					testCase.test_id = path.parse(dirent.name).name
				}
				// Filter out cases with missing file_contents
				if (!testCase.file_contents || testCase.file_contents.trim() === "") {
					log(isVerbose, `Skipping case ${testCase.test_id}: missing or empty file_contents.`);
					continue;
				}
				testCasesArray.push(testCase)
			}
		}
		return testCasesArray
	}
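
	// A test case file is expected to look roughly like this (an illustrative sketch:
	// only the fields referenced by this runner are shown, and all values are placeholders):
	//   {
	//     "test_id": "optional; defaults to the filename without extension",
	//     "file_path": "src/example.ts",
	//     "file_contents": "...original file text...",
	//     "system_prompt_details": { "cwd_value": "...", "os_value": "...", "shell_value": "..." },
	//     "messages": [{ "role": "user", "text": "..." }],
	//     "original_diff_edit_tool_call_message": "only required for --replay runs"
	//   }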
	/**
	 * Saves the test results to the specified output directory.
	 */
	saveTestResults(results: TestResultSet, outputPath: string) {
		// Ensure the output directory exists
		if (!fs.existsSync(outputPath)) {
			fs.mkdirSync(outputPath, { recursive: true })
		}
		// Write each test result to its own file
		for (const testId in results) {
			const outputFilePath = path.join(outputPath, `${testId}.json`)
			const testResult = results[testId]
			fs.writeFileSync(outputFilePath, JSON.stringify(testResult, null, 2))
		}
	}
	async runDatabaseReplay(replayRunId: string, diffApplyFile: string, isVerbose: boolean) {
		log(isVerbose, `Starting database replay for run_id: ${replayRunId}`)
		log(isVerbose, `Using diff apply file: ${diffApplyFile}`)

		// 1. Get the correct diffing function
		const diffEditingFunctions: Record<string, any> = {
			"diff-06-06-25": constructNewFileContent_06_06_25,
			"diff-06-23-25": constructNewFileContent_06_23_25,
			"diff-06-25-25": constructNewFileContent_06_25_25,
			"diff-06-26-25": constructNewFileContent_06_26_25,
			constructNewFileContentV3: constructNewFileContentV3,
		}
		const constructNewFileContent = diffEditingFunctions[diffApplyFile]
		if (!constructNewFileContent) {
			throw new Error(`Could not find diff apply function for: ${diffApplyFile}`)
		}
		log(isVerbose, `Successfully loaded diff apply function: ${diffApplyFile}`)

		// 2. Fetch original run data
		const originalResults = await getResultsByRun(replayRunId)
		if (originalResults.length === 0) {
			throw new Error(`No results found for run_id: ${replayRunId}`)
		}
		log(isVerbose, `Found ${originalResults.length} results to replay.`)
		const originalRun = await getBenchmarkRun(replayRunId)
		if (!originalRun) {
			throw new Error(`Could not find original run with id ${replayRunId}`)
		}

		// 3. Create a new benchmark run for the replay
		const replayRunDescription = `Replay of run ${replayRunId} using ${diffApplyFile}`
		this.currentRunId = await createBenchmarkRun({
			description: replayRunDescription,
			system_prompt_hash: originalRun.system_prompt_hash,
		})
		log(isVerbose, `Created new run for replay: ${this.currentRunId}`)

		// 4. Set up processing functions for the new run
		this.processingFunctionsHash = await upsertProcessingFunctions({
			name: `replay-${diffApplyFile}`,
			parsing_function: "parseAssistantMessageV2",
			diff_edit_function: diffApplyFile,
		})
		// 5. Process each result from the original run
		let replayedCount = 0
		const caseIdMirror: Map<string, string> = new Map()
		for (const originalResult of originalResults) {
			// 5a. Basic validation to ensure we can even process this
			if (!originalResult.case_id) {
				log(isVerbose, `Skipping result ${originalResult.result_id} due to missing case_id.`)
				continue
			}

			// 5b. Mirror the case for the new run, reusing it if already created
			let newCaseId = caseIdMirror.get(originalResult.case_id)
			if (!newCaseId) {
				const originalCase = await getCaseById(originalResult.case_id)
				if (!originalCase) {
					log(isVerbose, `Skipping result ${originalResult.result_id} because the original case could not be found.`)
					continue
				}
				newCaseId = await createCase({
					run_id: this.currentRunId,
					description: `Replay of case ${originalCase.case_id} from run ${replayRunId}`,
					system_prompt_hash: originalCase.system_prompt_hash,
					task_id: originalCase.task_id,
					tokens_in_context: originalCase.tokens_in_context,
					file_hash: originalCase.file_hash,
				})
				caseIdMirror.set(originalResult.case_id, newCaseId)
			}

			// 5c. Determine whether the original attempt was a "valid attempt"
			const isValidOriginalAttempt = originalResult.error_enum === null || originalResult.error_enum === 3 // 3 is diff_edit_error
			const newResultInput: CreateResultInput = {
				...(originalResult as any),
				run_id: this.currentRunId,
				case_id: newCaseId,
				processing_functions_hash: this.processingFunctionsHash,
			}
			delete (newResultInput as any).result_id

			if (isValidOriginalAttempt) {
				// This was a valid attempt. Re-run the diff algorithm.
				const originalCase = await getCaseById(originalResult.case_id)
				if (!originalCase) {
					log(isVerbose, ` [WARN] Replay for result ${originalResult.result_id}: Could not find original case. Copying original result.`)
					newResultInput.succeeded = originalResult.succeeded
					newResultInput.error_enum = originalResult.error_enum
				} else {
					const originalFile = originalCase.file_hash ? await getFileByHash(originalCase.file_hash) : null
					const parsedToolCall = originalResult.parsed_tool_call_json ? JSON.parse(originalResult.parsed_tool_call_json)[0] : null
					const diffContent = parsedToolCall?.input?.diff
					if (originalFile && diffContent) {
						let diffSuccess = false
						try {
							await constructNewFileContent(diffContent, originalFile.content, true)
							diffSuccess = true
							log(isVerbose, ` [OK] Replay for task ${originalCase.task_id}: Diff applied successfully.`)
						} catch (e) {
							diffSuccess = false
							log(isVerbose, ` [FAIL] Replay for task ${originalCase.task_id}: New diff algorithm failed.`)
						}
						newResultInput.succeeded = diffSuccess
						newResultInput.error_enum = diffSuccess ? undefined : 3 // 3 = diff_edit_error
					} else {
						// Something is wrong with the ground truth data, just copy it.
						log(
							isVerbose,
							` [WARN] Replay for task ${originalCase.task_id}: Valid original attempt but missing file or diff content. Copying original result.`,
						)
						newResultInput.succeeded = originalResult.succeeded
						newResultInput.error_enum = originalResult.error_enum
					}
				}
			} else {
				// This was not a valid attempt. Just copy the original result's outcome.
				log(isVerbose, ` [SKIP] Replay for case ${originalResult.case_id}: Invalid original attempt. Copying original result.`)
				newResultInput.succeeded = originalResult.succeeded
				newResultInput.error_enum = originalResult.error_enum
			}

			await insertResult(newResultInput)
			replayedCount++
		}

		log(isVerbose, `\n✓ Database replay completed successfully.`)
		log(isVerbose, ` Total original results: ${originalResults.length}`)
		log(isVerbose, ` Total replayed results: ${replayedCount}`)
		log(isVerbose, ` New run ID: ${this.currentRunId}`)
	}
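
	// Illustrative invocation of this replay path (assumes execution via ts-node; the
	// run id is a placeholder):
	//   npx ts-node TestRunner.ts --replay-run-id "run_abc123" --diff-apply-file "diff-06-26-25" -v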
	/**
	 * Run a single test example
	 */
	async runSingleTest(testCase: ProcessedTestCase, testConfig: TestConfig, isVerbose: boolean = false): Promise<TestResult> {
		if (testConfig.replay && !testCase.original_diff_edit_tool_call_message) {
			return {
				success: false,
				error: "missing_original_diff_edit_tool_call_message",
				errorString: `Test case ${testCase.test_id} is missing 'original_diff_edit_tool_call_message' for replay.`,
			}
		}
		const customSystemPrompt = this.constructSystemPrompt(testCase.system_prompt_details, testConfig.system_prompt_name)
		// The messages exclude the system prompt and contain everything up to the first
		// replace_in_file tool call that resulted in a diff edit error.
		const input: TestInput = {
			apiKey: this.apiKey,
			systemPrompt: customSystemPrompt,
			messages: testCase.messages,
			modelId: testConfig.model_id,
			originalFile: testCase.file_contents,
			originalFilePath: testCase.file_path,
			parsingFunction: testConfig.parsing_function,
			diffEditFunction: testConfig.diff_edit_function,
			thinkingBudgetTokens: testConfig.thinking_tokens_budget,
			originalDiffEditToolCallMessage: testConfig.replay ? testCase.original_diff_edit_tool_call_message : undefined,
			diffApplyFile: testConfig.diff_apply_file,
			provider: this.provider,
			isVerbose: isVerbose,
		}
		if (isVerbose) {
			log(isVerbose, ` Sending request to ${testConfig.model_id} for test case ${testCase.test_id}...`);
		}
		return await runSingleEvaluation(input)
	}
	/**
	 * Runs all the test cases synchronously
	 */
	async runAllTests(testCases: ProcessedTestCase[], testConfig: TestConfig, isVerbose: boolean): Promise<TestResultSet> {
		const results: TestResultSet = {}
		// Initialize database run
		try {
			await this.initializeDatabaseRun(testConfig, testCases, isVerbose);
		} catch (error) {
			log(isVerbose, `Warning: Failed to initialize database: ${error}`);
		}
		for (const testCase of testCases) {
			results[testCase.test_id] = []
			log(isVerbose, `-Running test: ${testCase.test_id}`)
			for (let i = 0; i < testConfig.number_of_runs; i++) {
				log(isVerbose, ` Attempt ${i+1}/${testConfig.number_of_runs} for ${testCase.test_id}...`);
				const result = await this.runSingleTest(testCase, testConfig, isVerbose)
				results[testCase.test_id].push(result)
				// Log result status
				if (isVerbose) {
					if (result.success) {
						log(isVerbose, ` ✓ Attempt ${i+1} completed successfully`);
					} else {
						log(isVerbose, ` ✗ Attempt ${i+1} failed (error: ${result.error || 'unknown'})`);
					}
				}
				// Store result in database
				try {
					await this.storeResultInDatabase(result, testCase.test_id, testConfig.model_id);
				} catch (error) {
					log(isVerbose, `Warning: Failed to store result in database: ${error}`);
				}
			}
		}
		return results
	}
	/**
	 * Runs all of the test cases asynchronously, with a concurrency limit
	 */
	async runAllTestsParallel(
		testCases: ProcessedTestCase[],
		testConfig: TestConfig,
		isVerbose: boolean,
		maxConcurrency: number = 20,
	): Promise<TestResultSet> {
		const results: TestResultSet = {}
		testCases.forEach((tc) => {
			results[tc.test_id] = []
		})
		// Initialize database run
		try {
			await this.initializeDatabaseRun(testConfig, testCases, isVerbose);
		} catch (error) {
			log(isVerbose, `Warning: Failed to initialize database: ${error}`);
		}
		// Create a flat list of all individual runs we need to execute
		const allRuns = testCases.flatMap((testCase) =>
			Array(testConfig.number_of_runs)
				.fill(null)
				.map(() => testCase),
		)
		for (let i = 0; i < allRuns.length; i += maxConcurrency) {
			const batch = allRuns.slice(i, i + maxConcurrency)
			const batchPromises = batch.map((testCase) => {
				log(isVerbose, ` Running test for ${testCase.test_id}...`);
				return this.runSingleTest(testCase, testConfig, isVerbose).then((result) => ({
					...result,
					test_id: testCase.test_id,
				}))
			})
			const batchResults = await Promise.all(batchPromises)
			// Calculate the total cost for this batch
			const batchCost = batchResults.reduce((total, result) => {
				return total + (result.streamResult?.usage?.totalCost || 0)
			}, 0)
			// Populate the results dictionary and store in database
			for (const result of batchResults) {
				if (result.test_id) {
					results[result.test_id].push(result)
					// Store result in database
					try {
						await this.storeResultInDatabase(result, result.test_id, testConfig.model_id);
					} catch (error) {
						log(isVerbose, `Warning: Failed to store result in database: ${error}`);
					}
				}
			}
			const batchNumber = i / maxConcurrency + 1
			const totalBatches = Math.ceil(allRuns.length / maxConcurrency)
			log(isVerbose, `-Completed batch ${batchNumber} of ${totalBatches}... (Batch Cost: $${batchCost.toFixed(6)})`)
		}
		return results
	}
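
	// Batching arithmetic, for intuition (numbers made up): 5 cases with
	// number_of_runs = 10 yield 50 runs total; at maxConcurrency = 20 that is
	// Math.ceil(50 / 20) = 3 batches of 20, 20, and 10 runs.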
	/**
	 * Check if a test result is a valid attempt (no error_enum 1, 6, or 7)
	 */
	isValidAttempt(result: TestResult): boolean {
		// Invalid if the error is one of: no_tool_calls, wrong_tool_call, wrong_file_edited
		const invalidErrors = ['no_tool_calls', 'wrong_tool_call', 'wrong_file_edited'];
		return !invalidErrors.includes(result.error || '');
	}
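
	// These three errors map to error_enum values 1, 6, and 7 in mapErrorToEnum. For
	// example, a "wrong_tool_call" result is retried rather than counted, while a
	// "diff_edit_error" (enum 3) still counts as a valid, if failed, attempt.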
	/**
	 * Runs all tests for a specific model (assumes the database run is already initialized).
	 * Keeps retrying until we get the requested number of valid attempts per case.
	 */
	async runAllTestsForModel(testCases: ProcessedTestCase[], testConfig: TestConfig, isVerbose: boolean): Promise<TestResultSet> {
		const results: TestResultSet = {}
		for (const testCase of testCases) {
			results[testCase.test_id] = []
			let validAttempts = 0;
			let totalAttempts = 0;
			log(isVerbose, `-Running test: ${testCase.test_id}`)
			// Keep trying until we get the requested number of valid attempts
			while (validAttempts < testConfig.number_of_runs) {
				totalAttempts++;
				log(isVerbose, ` Attempt ${totalAttempts} for ${testCase.test_id} (${validAttempts}/${testConfig.number_of_runs} valid so far)...`);
				const result = await this.runSingleTest(testCase, testConfig, isVerbose)
				results[testCase.test_id].push(result)
				// Check if this was a valid attempt
				const isValid = this.isValidAttempt(result);
				if (isValid) {
					validAttempts++;
					log(isVerbose, ` ✓ Valid attempt ${validAttempts}/${testConfig.number_of_runs} completed (${result.success ? 'SUCCESS' : 'FAILED'})`);
				} else {
					log(isVerbose, ` ✗ Invalid attempt (error: ${result.error || 'unknown'})`);
				}
				// Store result in database
				try {
					await this.storeResultInDatabase(result, testCase.test_id, testConfig.model_id);
				} catch (error) {
					log(isVerbose, `Warning: Failed to store result in database: ${error}`);
				}
				// Safety check to prevent infinite loops - use the configurable max attempts limit
				if (totalAttempts >= testConfig.max_attempts_per_case) {
					log(isVerbose, ` ⚠️ Reached maximum attempts (${totalAttempts}) for test case ${testCase.test_id}. Only got ${validAttempts}/${testConfig.number_of_runs} valid attempts.`);
					break;
				}
			}
			log(isVerbose, ` ✓ Completed test case ${testCase.test_id}: ${validAttempts}/${testConfig.number_of_runs} valid attempts (${totalAttempts} total attempts)`);
		}
		return results
	}
	/**
	 * Print a summary of the test results
	 */
	printSummary(results: TestResultSet, isVerbose: boolean) {
		let totalRuns = 0
		let totalPasses = 0
		let totalInputTokens = 0
		let totalOutputTokens = 0
		let totalCost = 0
		let runsWithUsageData = 0
		let totalDiffEditSuccesses = 0
		let totalRunsWithToolCalls = 0
		const testCaseIds = Object.keys(results)
		log(isVerbose, "\n=== TEST SUMMARY ===")
		for (const testId of testCaseIds) {
			const testResults = results[testId]
			const passedCount = testResults.filter((r) => r.success && r.diffEditSuccess).length
			const runCount = testResults.length
			totalRuns += runCount
			totalPasses += passedCount
			const runsWithToolCalls = testResults.filter((r) => r.success === true).length
			const diffEditSuccesses = passedCount
			totalRunsWithToolCalls += runsWithToolCalls
			totalDiffEditSuccesses += diffEditSuccesses
			// Accumulate token and cost data
			for (const result of testResults) {
				if (result.streamResult?.usage) {
					totalInputTokens += result.streamResult.usage.inputTokens
					totalOutputTokens += result.streamResult.usage.outputTokens
					totalCost += result.streamResult.usage.totalCost
					runsWithUsageData++
				}
			}
			log(isVerbose, `\n--- Test Case: ${testId} ---`)
			log(isVerbose, ` Runs: ${runCount}`)
			log(isVerbose, ` Passed: ${passedCount}`)
			log(isVerbose, ` Success Rate: ${runCount > 0 ? ((passedCount / runCount) * 100).toFixed(1) : "N/A"}%`)
		}
		log(isVerbose, "\n\n=== OVERALL SUMMARY ===")
		log(isVerbose, `Total Test Cases: ${testCaseIds.length}`)
		log(isVerbose, `Total Runs Executed: ${totalRuns}`)
		log(isVerbose, `Overall Passed: ${totalPasses}`)
		log(isVerbose, `Overall Failed: ${totalRuns - totalPasses}`)
		log(isVerbose, `Overall Success Rate: ${totalRuns > 0 ? ((totalPasses / totalRuns) * 100).toFixed(1) : "N/A"}%`)
		log(isVerbose, "\n\n=== OVERALL DIFF EDIT SUCCESS RATE ===")
		if (totalRunsWithToolCalls > 0) {
			const diffSuccessRate = (totalDiffEditSuccesses / totalRunsWithToolCalls) * 100
			log(isVerbose, `Total Runs with Successful Tool Calls: ${totalRunsWithToolCalls}`)
			log(isVerbose, `Total Runs with Successful Diff Edits: ${totalDiffEditSuccesses}`)
			log(isVerbose, `Diff Edit Success Rate: ${diffSuccessRate.toFixed(1)}%`)
		} else {
			log(isVerbose, "No successful tool calls to analyze for diff edit success.")
		}
		log(isVerbose, "\n\n=== TOKEN & COST ANALYSIS ===")
		if (runsWithUsageData > 0) {
			log(isVerbose, `Total Input Tokens: ${totalInputTokens.toLocaleString()}`)
			log(isVerbose, `Total Output Tokens: ${totalOutputTokens.toLocaleString()}`)
			log(isVerbose, `Total Cost: $${totalCost.toFixed(6)}`)
			log(isVerbose, "---")
			log(
				isVerbose,
				`Avg Input Tokens / Run: ${(totalInputTokens / runsWithUsageData).toLocaleString(undefined, {
					maximumFractionDigits: 0,
				})}`,
			)
			log(
				isVerbose,
				`Avg Output Tokens / Run: ${(totalOutputTokens / runsWithUsageData).toLocaleString(undefined, {
					maximumFractionDigits: 0,
				})}`,
			)
			log(isVerbose, `Avg Cost / Run: $${(totalCost / runsWithUsageData).toFixed(6)}`)
		} else {
			log(isVerbose, "No usage data available to analyze.")
		}
	}
}
async function main() {
	interface EvaluationTask {
		modelId: string;
		testCase: ProcessedTestCase;
		testConfig: TestConfig;
	}

	const program = new Command()
	const defaultTestPath = path.join(__dirname, "cases")
	const defaultOutputPath = path.join(__dirname, "results")
	program
		.name("TestRunner")
		.description("Run evaluation tests for diff editing")
		.version("1.0.0")
		.option("--test-path <path>", "Path to the directory containing test case JSON files", defaultTestPath)
		.option("--output-path <path>", "Path to the directory to save the test output JSON files", defaultOutputPath)
		.option("--model-ids <model_ids>", "Comma-separated list of model IDs to test")
		.option("--system-prompt-name <name>", "The name of the system prompt to use", "basicSystemPrompt")
		.option("-n, --valid-attempts-per-case <number>", "Number of valid attempts per test case per model (will retry until this many valid attempts are collected)", "1")
		.option("--max-attempts-per-case <number>", "Maximum total attempts per test case (default: 10x valid attempts)")
		.option("--max-cases <number>", "Maximum number of test cases to run (limits total cases loaded)")
		.option("--parsing-function <name>", "The parsing function to use", "parseAssistantMessageV2")
		.option("--diff-edit-function <name>", "The diff editing function to use", "diff-06-26-25")
		.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
		.option("--provider <provider>", "API provider to use (openrouter, openai)", "openrouter")
		.option("--parallel", "Run tests in parallel", false)
		.option("--replay", "Run evaluation from a pre-recorded LLM output, skipping the API call", false)
		.option("--replay-run-id <run_id>", "The ID of the run to replay from the database")
		.option("--diff-apply-file <filename>", "The name of the diff apply file to use for the replay")
		.option("--save-locally", "Save results to local JSON files in addition to the database", false)
		.option("-v, --verbose", "Enable verbose logging", false)
		.option("--max-concurrency <number>", "Maximum number of parallel requests", "80")
	program.parse(process.argv)
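
	// Example invocation (illustrative; assumes execution via ts-node, and the model id
	// is a placeholder):
	//   npx ts-node TestRunner.ts --model-ids "anthropic/claude-sonnet-4" \
	//     --valid-attempts-per-case 3 --max-concurrency 40 -v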
	const options = program.opts()
	const isVerbose = options.verbose
	const testPath = options.testPath
	const outputPath = options.outputPath
	const saveLocally = options.saveLocally
	const maxConcurrency = parseInt(options.maxConcurrency, 10);

	// Parse model IDs from the comma-separated string
	const modelIds = options.modelIds ? options.modelIds.split(',').map((id: string) => id.trim()) : [];
	if (modelIds.length === 0) {
		console.error("Error: --model-ids is required and must contain at least one model ID");
		process.exit(1);
	}

	const validAttemptsPerCase = parseInt(options.validAttemptsPerCase, 10);
	// Compute the dynamic default for max attempts: 10x valid attempts if not specified
	const maxAttemptsPerCase = options.maxAttemptsPerCase
		? parseInt(options.maxAttemptsPerCase, 10)
		: validAttemptsPerCase * 10;

	const runner = new NodeTestRunner(options.replay || !!options.replayRunId, options.provider)
	if (options.replayRunId) {
		if (!options.diffApplyFile) {
			console.error("Error: --diff-apply-file is required when using --replay-run-id")
			process.exit(1)
		}
		await runner.runDatabaseReplay(options.replayRunId, options.diffApplyFile, isVerbose)
		return
	}
	try {
		const startTime = Date.now()

		// Load OpenRouter model data first
		openRouterModelDataGlobal = await loadOpenRouterModelData(isVerbose);
		if (Object.keys(openRouterModelDataGlobal).length === 0 && isVerbose) {
			log(isVerbose, "Warning: Could not load OpenRouter model data. Context window filtering might be affected for OpenRouter models.");
		}

		const runner = new NodeTestRunner(options.replay, options.provider)
		let allLoadedTestCases = runner.loadTestCases(testPath, isVerbose)
		const allProcessedTestCasesGlobal: ProcessedTestCase[] = allLoadedTestCases.map((tc) => ({
			...tc,
			messages: runner.transformMessages(tc.messages),
		}));

		log(isVerbose, `-Loaded ${allLoadedTestCases.length} initial test cases.`)
		log(isVerbose, `-Testing ${modelIds.length} model(s): ${modelIds.join(', ')}`)
		log(isVerbose, `-Target: ${validAttemptsPerCase} valid attempts per test case per model (will retry until this many valid attempts are collected)`)
		if (options.replay) {
			log(isVerbose, `-Running in REPLAY mode. No API calls will be made.`)
		}
		log(isVerbose, "Starting tests...\n")

		// Determine the smallest context window among all specified models
		let smallestContextWindow = Infinity;
		for (const modelId of modelIds) {
			let modelInfo = openRouterModelDataGlobal[modelId];
			if (!modelInfo) {
				const foundKey = Object.keys(openRouterModelDataGlobal).find(
					key => key.includes(modelId) || modelId.includes(key)
				);
				if (foundKey) modelInfo = openRouterModelDataGlobal[foundKey];
			}
			const currentModelContext = modelInfo?.contextWindow;
			if (currentModelContext && currentModelContext > 0) {
				if (currentModelContext < smallestContextWindow) {
					smallestContextWindow = currentModelContext;
				}
			} else {
				log(isVerbose, `Warning: Context window for model ${modelId} is unknown or zero. It will not constrain the test case selection.`);
			}
		}
		if (smallestContextWindow === Infinity) {
			log(isVerbose, "Warning: Could not determine a common smallest context window. Proceeding with all loaded cases; context issues may occur.");
		} else {
			log(isVerbose, `Smallest common context window (with padding consideration) across specified models: ${smallestContextWindow} (target for filtering: ${smallestContextWindow - 20000})`);
		}

		let eligibleCasesForThisRun = [...allLoadedTestCases];
		if (smallestContextWindow !== Infinity && smallestContextWindow > 20000) { // Only filter if a valid smallest window is found
			const originalCaseCount = eligibleCasesForThisRun.length;
			eligibleCasesForThisRun = eligibleCasesForThisRun.filter(tc => {
				const systemPromptText = runner.constructSystemPrompt(tc.system_prompt_details, options.systemPromptName);
				const systemPromptTokens = encoding.encode(systemPromptText).length;
				const messagesTokens = runner.estimateTokens(runner.transformMessages(tc.messages));
				const totalInputTokens = systemPromptTokens + messagesTokens;
				return totalInputTokens + 20000 <= smallestContextWindow; // 20k padding
			});
			log(isVerbose, `Filtered to ${eligibleCasesForThisRun.length} cases (from ${originalCaseCount}) to fit the smallest context window of ${smallestContextWindow} (with padding).`);
		}
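
		// Worked example (numbers made up): with a smallest window of 128,000 tokens, a
		// case whose system prompt + messages total 100,000 tokens is kept
		// (100,000 + 20,000 <= 128,000), while a 115,000-token case is dropped.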
		// Apply the max-cases limit, if specified, to the context-filtered list
		// (commander delivers option values as strings, so parse explicitly)
		const maxCases = options.maxCases ? parseInt(options.maxCases, 10) : 0;
		if (maxCases > 0 && eligibleCasesForThisRun.length > maxCases) {
			log(isVerbose, `Limiting to ${maxCases} test cases (out of ${eligibleCasesForThisRun.length} eligible).`);
			eligibleCasesForThisRun = eligibleCasesForThisRun.slice(0, maxCases);
		}
		if (eligibleCasesForThisRun.length === 0) {
			log(isVerbose, `No eligible test cases found after filtering for all specified models. Exiting.`);
			process.exit(0);
		}

		const processedEligibleCasesForRun: ProcessedTestCase[] = eligibleCasesForThisRun.map((tc) => ({
			...tc,
			messages: runner.transformMessages(tc.messages),
		}));
		// Initialize ONE database run for ALL models using the commonly eligible cases
		const runDescription = `Models: ${modelIds.join(', ')}, Common Cases: ${processedEligibleCasesForRun.length}, Valid attempts per case: ${validAttemptsPerCase}`;
		await runner.initializeMultiModelRun(processedEligibleCasesForRun, options.systemPromptName, options.parsingFunction, options.diffEditFunction, runDescription, isVerbose);

		// Create a global task queue
		const globalTaskQueue: EvaluationTask[] = modelIds.flatMap(modelId =>
			processedEligibleCasesForRun.map(testCase => ({
				modelId,
				testCase,
				testConfig: {
					model_id: modelId,
					system_prompt_name: options.systemPromptName,
					number_of_runs: validAttemptsPerCase,
					max_attempts_per_case: maxAttemptsPerCase,
					parsing_function: options.parsingFunction,
					diff_edit_function: options.diffEditFunction,
					thinking_tokens_budget: parseInt(options.thinkingBudget, 10),
					replay: options.replay,
				}
			}))
		);

		const results: TestResultSet = {};
		const taskStates: Record<string, { valid: number; total: number; pending: number }> = {};
		globalTaskQueue.forEach(({ modelId, testCase }) => {
			const taskId = `${modelId}-${testCase.test_id}`;
			taskStates[taskId] = { valid: 0, total: 0, pending: 0 };
			if (!results[testCase.test_id]) {
				results[testCase.test_id] = [];
			}
		});
		let remainingTasks = [...globalTaskQueue];
		while (remainingTasks.length > 0) {
			// Fill a batch, never exceeding the remaining valid-attempt budget per task
			const batch: EvaluationTask[] = [];
			for (const task of remainingTasks) {
				if (batch.length >= maxConcurrency) break;
				const taskId = `${task.modelId}-${task.testCase.test_id}`;
				if ((taskStates[taskId].valid + taskStates[taskId].pending) < validAttemptsPerCase) {
					batch.push(task);
					taskStates[taskId].pending++;
				}
			}
			if (batch.length === 0) {
				await new Promise(resolve => setTimeout(resolve, 100));
				continue;
			}
			const batchPromises = batch.map(task => {
				const taskId = `${task.modelId}-${task.testCase.test_id}`;
				taskStates[taskId].total++;
				log(isVerbose, ` Attempt ${taskStates[taskId].total} for ${task.testCase.test_id} with ${task.modelId} (${taskStates[taskId].valid} valid, ${taskStates[taskId].pending - 1} pending)...`);
				return runner.runSingleTest(task.testCase, task.testConfig, isVerbose).then(result => ({
					...result,
					test_id: task.testCase.test_id,
					modelId: task.modelId,
				}));
			});
			const batchResults = await Promise.all(batchPromises);
			for (const result of batchResults) {
				const taskId = `${result.modelId}-${result.test_id}`;
				taskStates[taskId].pending--;
				results[result.test_id].push(result);
				if (runner.isValidAttempt(result)) {
					taskStates[taskId].valid++;
					log(isVerbose, ` ✓ Valid attempt ${taskStates[taskId].valid}/${validAttemptsPerCase} for ${result.test_id} with ${result.modelId} completed (${result.success ? 'SUCCESS' : 'FAILED'})`);
				} else {
					log(isVerbose, ` ✗ Invalid attempt for ${result.test_id} with ${result.modelId} (error: ${result.error || 'unknown'})`);
				}
				await runner.storeResultInDatabase(result, result.test_id, result.modelId);
			}
			// Drop tasks that hit their attempt cap or reached their valid-attempt target
			remainingTasks = remainingTasks.filter(task => {
				const taskId = `${task.modelId}-${task.testCase.test_id}`;
				if (taskStates[taskId].total >= task.testConfig.max_attempts_per_case) {
					log(isVerbose, ` ⚠️ Reached maximum attempts for ${task.testCase.test_id} with ${task.modelId}.`);
					return false;
				}
				return taskStates[taskId].valid < validAttemptsPerCase;
			});
			const batchCost = batchResults.reduce((total, result) => total + (result.streamResult?.usage?.totalCost || 0), 0);
			log(isVerbose, `-Completed batch... (Batch Cost: $${batchCost.toFixed(6)}, Remaining tasks: ${remainingTasks.length})`);
		}
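
		// Scheduling note: a task ("model × case") only enters a batch while
		// valid + pending < validAttemptsPerCase, so in-flight attempts never overshoot
		// the target; e.g. with a target of 3 and 1 valid + 2 pending, no new attempt
		// is queued until a pending one resolves.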
		// Print a summary for each model
		for (const modelId of modelIds) {
			const modelResults: TestResultSet = {};
			Object.keys(results).forEach(testId => {
				modelResults[testId] = results[testId].filter(r => (r as any).modelId === modelId);
			});
			log(isVerbose, `\n=== Results for Model: ${modelId} ===`);
			runner.printSummary(modelResults, isVerbose);
		}

		const endTime = Date.now()
		const durationSeconds = ((endTime - startTime) / 1000).toFixed(2)
		log(isVerbose, `\n-Total execution time: ${durationSeconds} seconds`)

		// Save results locally if requested
		if (saveLocally) {
			runner.saveTestResults(results, outputPath);
			log(isVerbose, `✓ Results also saved to JSON files in ${outputPath}`);
		}
		log(isVerbose, `\n✓ All results stored in database. Use the dashboard to view results.`)
	} catch (error) {
		console.error("\nError running tests:", error)
		process.exit(1)
	}
}

if (require.main === module) {
	main()
}