| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351 |
- import { OpenRouterHandler } from "../../src/api/providers/openrouter"
- import { OpenAiNativeHandler } from "../../src/api/providers/openai-native"
- import { Anthropic } from "@anthropic-ai/sdk"
- import {
- parseAssistantMessageV2,
- AssistantMessageContent,
- } from "./parsing/parse-assistant-message-06-06-25" // "../../src/core/assistant-message"
- import { constructNewFileContent as constructNewFileContent_06_06_25 } from "./diff-apply/diff-06-06-25"
- import { constructNewFileContent as constructNewFileContent_06_23_25 } from "./diff-apply/diff-06-23-25"
- import { constructNewFileContent as constructNewFileContent_06_25_25 } from "./diff-apply/diff-06-25-25"
- import { constructNewFileContent as constructNewFileContent_06_26_25 } from "./diff-apply/diff-06-26-25"
/** Signature shared by every assistant-message parser selectable by name. */
type ParseAssistantMessageFn = (message: string) => AssistantMessageContent[]

/**
 * Signature shared by every diff-apply implementation selectable by name.
 * Resolves to either a plain string or an arbitrary object, depending on the implementation.
 */
type ConstructNewFileContentFn = (diff: string, original: string, strict: boolean) => Promise<string | any>

/** Parsers addressable through the `parsingFunction` field of a test input. */
const parsingFunctions: Record<string, ParseAssistantMessageFn> = {
	parseAssistantMessageV2,
}

/**
 * Diff-apply implementations addressable through the `diffApplyFile` / `diffEditFunction`
 * fields of a test input. Keys match the dated module each implementation is imported from.
 */
const diffEditingFunctions: Record<string, ConstructNewFileContentFn> = {
	"diff-06-06-25": constructNewFileContent_06_06_25,
	"diff-06-23-25": constructNewFileContent_06_23_25,
	"diff-06-25-25": constructNewFileContent_06_25_25,
	"diff-06-26-25": constructNewFileContent_06_26_25,
}
- import { TestInput, TestResult, ExtractedToolCall } from "./types"
- import { log } from "./helpers"
- export { TestInput, TestResult, ExtractedToolCall }
/** Token and cost accounting for one model response. */
interface StreamUsage {
	/** Sum of `inputTokens` over every "usage" chunk. */
	inputTokens: number
	/** Sum of `outputTokens` over every "usage" chunk. */
	outputTokens: number
	/** Sum of `cacheWriteTokens` over every "usage" chunk (absent values count as 0). */
	cacheWriteTokens: number
	/** Sum of `cacheReadTokens` over every "usage" chunk (absent values count as 0). */
	cacheReadTokens: number
	/** Most recent truthy `totalCost` reported by a "usage" chunk; 0 if none was reported. */
	totalCost: number
}

/** Wall-clock measurements, in milliseconds, relative to the moment the request was started. */
interface StreamTiming {
	/** Elapsed time until the first non-empty chunk of any type arrived. */
	timeToFirstTokenMs: number
	/** Elapsed time until the accumulated text first parsed to a tool_use block, if it ever did. */
	timeToFirstEditMs?: number
	/** Elapsed time until the stream was fully consumed. */
	totalRoundTripMs: number
}

/** Everything collected from one model response: text, reasoning, usage and optional timing. */
interface StreamResult {
	/** Concatenation of all "text" chunks. */
	assistantMessage: string
	/** Concatenation of all "reasoning" chunks. */
	reasoningMessage: string
	usage: StreamUsage
	/** Omitted when the result was not produced by a live stream (e.g. a replayed message). */
	timing?: StreamTiming
}
/**
 * Consume a provider's streaming response and return the full response with timing data.
 *
 * Text and reasoning chunks are concatenated, token counts from "usage" chunks are summed,
 * and `totalCost` is overwritten by each usage chunk that reports a truthy cost.
 *
 * @param handler - Provider handler whose `createMessage` yields the chunk stream.
 * @param systemPrompt - System prompt forwarded to the handler.
 * @param messages - Conversation forwarded to the handler.
 * @returns The accumulated message, reasoning, usage and timing. All timings are in
 *   milliseconds since this function was entered; `timeToFirstEditMs` is left undefined
 *   when no tool call was ever detected.
 */
async function processStream(
	handler: OpenRouterHandler | OpenAiNativeHandler,
	systemPrompt: string,
	messages: Anthropic.Messages.MessageParam[],
): Promise<StreamResult> {
	const startTime = Date.now()
	const stream = handler.createMessage(systemPrompt, messages)

	let assistantMessage = ""
	let reasoningMessage = ""
	let inputTokens = 0
	let outputTokens = 0
	let cacheWriteTokens = 0
	let cacheReadTokens = 0
	let totalCost = 0

	// Timing tracking: null means "not observed yet", which is distinct from a measured 0 ms.
	let timeToFirstTokenMs: number | null = null
	let timeToFirstEditMs: number | null = null

	for await (const chunk of stream) {
		if (!chunk) {
			continue
		}

		// Capture time to first token (any chunk type, including usage and reasoning)
		if (timeToFirstTokenMs === null) {
			timeToFirstTokenMs = Date.now() - startTime
		}

		switch (chunk.type) {
			case "usage":
				inputTokens += chunk.inputTokens
				outputTokens += chunk.outputTokens
				cacheWriteTokens += chunk.cacheWriteTokens ?? 0
				cacheReadTokens += chunk.cacheReadTokens ?? 0
				if (chunk.totalCost) {
					totalCost = chunk.totalCost
				}
				break
			case "reasoning":
				reasoningMessage += chunk.reasoning
				break
			case "text":
				assistantMessage += chunk.text

				// Until a tool call has been seen, re-parse the accumulated message after every
				// text chunk so the moment the first tool_use block appears can be timestamped.
				if (timeToFirstEditMs === null) {
					try {
						const parsed = parseAssistantMessageV2(assistantMessage)
						const hasToolCall = parsed.some((block) => block.type === "tool_use")
						if (hasToolCall) {
							timeToFirstEditMs = Date.now() - startTime
						}
					} catch {
						// Deliberate best-effort: a parse failure only affects the timing probe,
						// so keep accumulating and try again on the next chunk.
					}
				}
				break
		}
	}

	const totalRoundTripMs = Date.now() - startTime

	return {
		assistantMessage,
		reasoningMessage,
		usage: {
			inputTokens,
			outputTokens,
			cacheWriteTokens,
			cacheReadTokens,
			totalCost,
		},
		timing: {
			// `??` rather than `||`: a measured 0 ms must be reported as 0, not treated as missing.
			timeToFirstTokenMs: timeToFirstTokenMs ?? 0,
			timeToFirstEditMs: timeToFirstEditMs ?? undefined,
			totalRoundTripMs,
		},
	}
}
/**
 * Return the entry stored under `key` in `registry`, considering own properties only.
 * A bare `registry[key]` would also resolve members inherited from Object.prototype
 * (e.g. "toString"), which must never be accepted as a registered function.
 */
function lookupRegisteredFunction<T>(registry: Record<string, T>, key: string): T | undefined {
	return Object.prototype.hasOwnProperty.call(registry, key) ? registry[key] : undefined
}

/**
 * Describe a thrown value as a string: its truthy `message` if it has one, otherwise its
 * string conversion. Unlike `error.toString()`, this does not throw for null/undefined.
 */
function describeError(error: any): string {
	return error?.message || String(error)
}

/**
 * Main evaluation function:
 * 1. validate the input and resolve the parsing / diff-apply functions by name
 * 2. obtain the model output, either by replaying `originalDiffEditToolCallMessage`
 *    or by streaming a live response from the selected provider
 * 3. extract the tool calls and require exactly one `replace_in_file` call that targets
 *    `originalFilePath` and carries both `path` and `diff`
 * 4. attempt to apply the diff to `originalFile`
 *
 * This function never rejects: every failure is reported through the returned result.
 *
 * @param input - Test case description. `systemPrompt`, `messages`, `modelId`, `originalFile`,
 *   `originalFilePath`, `parsingFunction` and `diffEditFunction` must all be truthy.
 * @returns `success: false` plus an `error` code when the run could not be evaluated.
 *   Otherwise `success: true`, with `diffEditSuccess` telling whether the diff applied
 *   (a diff that fails to apply is still a successful evaluation run).
 */
export async function runSingleEvaluation(input: TestInput): Promise<TestResult> {
	try {
		// Extract parameters
		const {
			apiKey,
			systemPrompt,
			messages,
			modelId,
			originalFile,
			originalFilePath,
			parsingFunction,
			diffEditFunction,
			thinkingBudgetTokens,
			originalDiffEditToolCallMessage,
			diffApplyFile,
		} = input

		// Any falsy value here (including an empty string) counts as missing.
		const requiredParams = {
			systemPrompt,
			messages,
			modelId,
			originalFile,
			originalFilePath,
			parsingFunction,
			diffEditFunction,
		}
		const missingParams = Object.entries(requiredParams)
			.filter(([, value]) => !value)
			.map(([key]) => key)
		if (missingParams.length > 0) {
			return {
				success: false,
				error: "missing_required_parameters",
				errorString: `Missing required parameters: ${missingParams.join(", ")}`,
			}
		}

		// `diffApplyFile`, when provided, overrides `diffEditFunction` as the implementation name.
		const diffFunctionName = diffApplyFile || diffEditFunction
		const parseAssistantMessage = lookupRegisteredFunction(parsingFunctions, parsingFunction)
		const constructNewFileContent = lookupRegisteredFunction(diffEditingFunctions, diffFunctionName)
		if (!parseAssistantMessage || !constructNewFileContent) {
			const unknownNames: string[] = []
			if (!parseAssistantMessage) {
				unknownNames.push(`parsing function "${parsingFunction}"`)
			}
			if (!constructNewFileContent) {
				unknownNames.push(`diff edit function "${diffFunctionName}"`)
			}
			return {
				success: false,
				error: "invalid_functions",
				errorString: `Unknown ${unknownNames.join(" and ")}`,
			}
		}

		// Get the output of this llm call
		let streamResult: StreamResult
		if (originalDiffEditToolCallMessage !== undefined) {
			// Replay mode: mock the stream result (no API call, so no usage and no timing)
			streamResult = {
				assistantMessage: originalDiffEditToolCallMessage,
				reasoningMessage: "",
				usage: { inputTokens: 0, outputTokens: 0, cacheWriteTokens: 0, cacheReadTokens: 0, totalCost: 0 },
			}
		} else {
			// Live mode: provider-specific API call logic
			const provider = input.provider || "openrouter"
			try {
				let handler: OpenRouterHandler | OpenAiNativeHandler

				if (provider === "openai") {
					handler = new OpenAiNativeHandler({
						openAiNativeApiKey: apiKey,
						apiModelId: modelId,
					})
				} else {
					// Every provider other than "openai" is routed through OpenRouter.
					handler = new OpenRouterHandler({
						openRouterApiKey: apiKey,
						openRouterModelId: modelId,
						thinkingBudgetTokens: thinkingBudgetTokens,
						openRouterModelInfo: {
							maxTokens: 10_000,
							contextWindow: 1_000_000,
							supportsImages: true,
							supportsPromptCache: true,
							inputPrice: 0,
							outputPrice: 0,
						},
					})
				}

				streamResult = await processStream(handler, systemPrompt, messages)
			} catch (error: any) {
				return {
					success: false,
					error: "llm_stream_error",
					errorString: describeError(error),
				}
			}
		}

		// process the assistant message into its constituent tool calls & text blocks
		const assistantContentBlocks: AssistantMessageContent[] = parseAssistantMessage(streamResult.assistantMessage)
		const detectedToolCalls: ExtractedToolCall[] = []
		for (const block of assistantContentBlocks) {
			if (block.type === "tool_use") {
				detectedToolCalls.push({
					name: block.name,
					input: block.params,
				})
			}
		}

		// Every failure from here on reports the model output and the tool calls found in it.
		const failWith = (error: TestResult["error"]): TestResult => ({
			success: false,
			streamResult: streamResult,
			toolCalls: detectedToolCalls,
			error,
		})

		// check if there are any tool calls, if there are none then its a clear error
		if (detectedToolCalls.length === 0) {
			return failWith("no_tool_calls")
		}

		// check that there is exactly one tool call, otherwise an error
		if (detectedToolCalls.length > 1) {
			return failWith("multi_tool_calls")
		}

		// check that the tool call is diff edit tool call
		const toolCall = detectedToolCalls[0]
		if (toolCall.name !== "replace_in_file") {
			return failWith("wrong_tool_call")
		}

		const diffToolPath = toolCall.input.path
		const diffToolContent = toolCall.input.diff
		if (!diffToolPath || !diffToolContent) {
			return failWith("tool_call_params_undefined")
		}

		// check that we are editing the correct file path
		log(input.isVerbose, `Expected file path: "${originalFilePath}"`)
		log(input.isVerbose, `Actual file path used: "${diffToolPath}"`)
		if (diffToolPath !== originalFilePath) {
			log(input.isVerbose, `❌ File path mismatch detected!`)
			// Enhanced logging:
			if (streamResult.assistantMessage) {
				log(input.isVerbose, ` Full model output (assistantMessage):`)
				log(input.isVerbose, ` -----------------------------------------`)
				log(input.isVerbose, ` ${streamResult.assistantMessage}`)
				log(input.isVerbose, ` -----------------------------------------`)
			}
			log(input.isVerbose, ` Parsed tool call that caused mismatch:`)
			log(input.isVerbose, ` ${JSON.stringify(toolCall, null, 2)}`)
			log(input.isVerbose, ` -----------------------------------------`)
			return failWith("wrong_file_edited")
		}

		// checking if the diff edit succeeds, if it failed it will throw an error
		let diffSuccess = true
		let replacementData: any = undefined
		try {
			const result = await constructNewFileContent(diffToolContent, originalFile, true)

			// Newer implementations resolve to an object carrying `replacements`;
			// older ones resolve to a plain string, in which case there is nothing to record.
			if (typeof result === "object" && result !== null && "replacements" in result) {
				replacementData = result.replacements
			}
		} catch (error: any) {
			// A diff that does not apply is an evaluation outcome, not a harness failure.
			diffSuccess = false
			log(input.isVerbose, `ERROR: ${error}`)
		}

		return {
			success: true,
			streamResult: streamResult,
			toolCalls: detectedToolCalls,
			diffEdit: diffToolContent,
			diffEditSuccess: diffSuccess,
			replacementData: replacementData,
		}
	} catch (error: any) {
		return {
			success: false,
			error: "other_error",
			errorString: describeError(error),
		}
	}
}
|