// ClineWrapper.ts
  1. import { OpenRouterHandler } from "../../src/api/providers/openrouter"
  2. import { OpenAiNativeHandler } from "../../src/api/providers/openai-native"
  3. import { Anthropic } from "@anthropic-ai/sdk"
  4. import {
  5. parseAssistantMessageV2,
  6. AssistantMessageContent,
  7. } from "./parsing/parse-assistant-message-06-06-25" // "../../src/core/assistant-message"
  8. import { constructNewFileContent as constructNewFileContent_06_06_25 } from "./diff-apply/diff-06-06-25"
  9. import { constructNewFileContent as constructNewFileContent_06_23_25 } from "./diff-apply/diff-06-23-25"
  10. import { constructNewFileContent as constructNewFileContent_06_25_25 } from "./diff-apply/diff-06-25-25"
  11. import { constructNewFileContent as constructNewFileContent_06_26_25 } from "./diff-apply/diff-06-26-25"
// Signature shared by every assistant-message parser implementation.
type ParseAssistantMessageFn = (message: string) => AssistantMessageContent[]
// Signature shared by every diff-apply implementation. Newer versions may
// resolve to an object carrying replacement metadata instead of a plain string,
// hence the `string | any` result. -- TODO(review): tighten once all versions agree.
type ConstructNewFileContentFn = (diff: string, original: string, strict: boolean) => Promise<string | any>
// Registry of parser implementations, selected by name at runtime
// (see `parsingFunction` handling in runSingleEvaluation).
const parsingFunctions: Record<string, ParseAssistantMessageFn> = {
	parseAssistantMessageV2: parseAssistantMessageV2,
}
// Registry of diff-apply implementations, keyed by dated version identifier
// (see `diffApplyFile` / `diffEditFunction` handling in runSingleEvaluation).
const diffEditingFunctions: Record<string, ConstructNewFileContentFn> = {
	"diff-06-06-25": constructNewFileContent_06_06_25,
	"diff-06-23-25": constructNewFileContent_06_23_25,
	"diff-06-25-25": constructNewFileContent_06_25_25,
	"diff-06-26-25": constructNewFileContent_06_26_25,
}
  23. import { TestInput, TestResult, ExtractedToolCall } from "./types"
  24. import { log } from "./helpers"
  25. export { TestInput, TestResult, ExtractedToolCall }
// Accumulated output of a single streamed LLM call.
interface StreamResult {
	// Full concatenated assistant text (may contain tool-call markup).
	assistantMessage: string
	// Concatenated reasoning/"thinking" text; empty string if none was emitted.
	reasoningMessage: string
	// Token and cost accounting summed over all "usage" chunks of the stream.
	usage: {
		inputTokens: number
		outputTokens: number
		cacheWriteTokens: number
		cacheReadTokens: number
		totalCost: number
	}
	// Latency measurements; absent in replay mode, where the stream is mocked.
	timing?: {
		timeToFirstTokenMs: number
		// Time until the first tool call was detected in the accumulated text;
		// undefined when no tool call ever appeared.
		timeToFirstEditMs?: number
		totalRoundTripMs: number
	}
}
  42. /**
  43. * Process the stream and return full response with timing data
  44. */
  45. async function processStream(
  46. handler: OpenRouterHandler | OpenAiNativeHandler,
  47. systemPrompt: string,
  48. messages: Anthropic.Messages.MessageParam[],
  49. ): Promise<StreamResult> {
  50. const startTime = Date.now()
  51. const stream = handler.createMessage(systemPrompt, messages)
  52. let assistantMessage = ""
  53. let reasoningMessage = ""
  54. let inputTokens = 0
  55. let outputTokens = 0
  56. let cacheWriteTokens = 0
  57. let cacheReadTokens = 0
  58. let totalCost = 0
  59. // Timing tracking
  60. let timeToFirstTokenMs: number | null = null
  61. let timeToFirstEditMs: number | null = null
  62. for await (const chunk of stream) {
  63. if (!chunk) {
  64. continue
  65. }
  66. // Capture time to first token (any chunk type)
  67. if (timeToFirstTokenMs === null) {
  68. timeToFirstTokenMs = Date.now() - startTime
  69. }
  70. switch (chunk.type) {
  71. case "usage":
  72. inputTokens += chunk.inputTokens
  73. outputTokens += chunk.outputTokens
  74. cacheWriteTokens += chunk.cacheWriteTokens ?? 0
  75. cacheReadTokens += chunk.cacheReadTokens ?? 0
  76. if (chunk.totalCost) {
  77. totalCost = chunk.totalCost
  78. }
  79. break
  80. case "reasoning":
  81. reasoningMessage += chunk.reasoning
  82. break
  83. case "text":
  84. assistantMessage += chunk.text
  85. // Try to detect first tool call by parsing accumulated message
  86. if (timeToFirstEditMs === null) {
  87. try {
  88. const parsed = parseAssistantMessageV2(assistantMessage)
  89. const hasToolCall = parsed.some(block => block.type === "tool_use")
  90. if (hasToolCall) {
  91. timeToFirstEditMs = Date.now() - startTime
  92. }
  93. } catch {
  94. // Parsing failed, continue accumulating
  95. }
  96. }
  97. break
  98. }
  99. }
  100. const totalRoundTripMs = Date.now() - startTime
  101. return {
  102. assistantMessage,
  103. reasoningMessage,
  104. usage: {
  105. inputTokens,
  106. outputTokens,
  107. cacheWriteTokens,
  108. cacheReadTokens,
  109. totalCost,
  110. },
  111. timing: {
  112. timeToFirstTokenMs: timeToFirstTokenMs || 0,
  113. timeToFirstEditMs: timeToFirstEditMs || undefined,
  114. totalRoundTripMs,
  115. },
  116. }
  117. }
/**
 * Main evaluation function:
 * 1. create and process stream
 * 2. extract any tool calls from the stream
 * 3. if no diff edit, considered a failure (or rerun) - otherwise attempt to apply the diff edit
 *
 * Never throws: every failure mode is returned as a `TestResult` with
 * `success: false` and a machine-readable `error` code.
 */
export async function runSingleEvaluation(input: TestInput): Promise<TestResult> {
	try {
		// Extract parameters
		const {
			apiKey,
			systemPrompt,
			messages,
			modelId,
			originalFile,
			originalFilePath,
			parsingFunction,
			diffEditFunction,
			thinkingBudgetTokens,
			originalDiffEditToolCallMessage,
			diffApplyFile,
		} = input
		// NOTE: apiKey is deliberately not in requiredParams — replay mode
		// (originalDiffEditToolCallMessage set) never calls the API.
		const requiredParams = {
			systemPrompt,
			messages,
			modelId,
			originalFile,
			originalFilePath,
			parsingFunction,
			diffEditFunction,
		}
		// Falsy check: treats undefined/null/empty-string parameters as missing.
		const missingParams = Object.entries(requiredParams)
			.filter(([, value]) => !value)
			.map(([key]) => key)
		if (missingParams.length > 0) {
			return {
				success: false,
				error: "missing_required_parameters",
				errorString: `Missing required parameters: ${missingParams.join(", ")}`,
			}
		}
		// Resolve the named implementations from the registries above;
		// diffApplyFile, when provided, takes precedence over diffEditFunction.
		const parseAssistantMessage = parsingFunctions[parsingFunction]
		const constructNewFileContent = diffEditingFunctions[diffApplyFile || diffEditFunction]
		if (!parseAssistantMessage || !constructNewFileContent) {
			return {
				success: false,
				error: "invalid_functions",
			}
		}
		const provider = input.provider || "openrouter"
		// Get the output of streaming output of this llm call
		let streamResult: StreamResult
		if (originalDiffEditToolCallMessage !== undefined) {
			// Replay mode: mock the stream result (zeroed usage, no timing).
			streamResult = {
				assistantMessage: originalDiffEditToolCallMessage,
				reasoningMessage: "",
				usage: { inputTokens: 0, outputTokens: 0, cacheWriteTokens: 0, cacheReadTokens: 0, totalCost: 0 },
			}
		} else {
			// Live mode: provider-specific API call logic
			try {
				let handler: OpenRouterHandler | OpenAiNativeHandler
				if (provider === "openai") {
					const openAiOptions = {
						openAiNativeApiKey: apiKey,
						apiModelId: modelId,
					}
					handler = new OpenAiNativeHandler(openAiOptions)
				} else {
					// Default: OpenRouter. Model info is a static stub with
					// zero pricing — cost comes from usage chunks instead.
					const openRouterOptions = {
						openRouterApiKey: apiKey,
						openRouterModelId: modelId,
						thinkingBudgetTokens: thinkingBudgetTokens,
						openRouterModelInfo: {
							maxTokens: 10_000,
							contextWindow: 1_000_000,
							supportsImages: true,
							supportsPromptCache: true,
							inputPrice: 0,
							outputPrice: 0,
						},
					}
					handler = new OpenRouterHandler(openRouterOptions)
				}
				streamResult = await processStream(handler, systemPrompt, messages)
			} catch (error: any) {
				// Stream/API failures are reported, not thrown.
				return {
					success: false,
					error: "llm_stream_error",
					errorString: error.message || error.toString(),
				}
			}
		}
		// process the assistant message into its constituent tool calls & text blocks
		const assistantContentBlocks: AssistantMessageContent[] = parseAssistantMessage(streamResult.assistantMessage)
		const detectedToolCalls: ExtractedToolCall[] = []
		for (const block of assistantContentBlocks) {
			if (block.type === "tool_use") {
				detectedToolCalls.push({
					name: block.name,
					input: block.params,
				})
			}
		}
		// check if there are any tool calls, if there are none then its a clear error
		if (detectedToolCalls.length === 0) {
			return {
				success: false,
				streamResult: streamResult,
				toolCalls: detectedToolCalls,
				error: "no_tool_calls",
			}
		}
		// check that there is exactly one tool call, otherwise an error
		if (detectedToolCalls.length > 1) {
			return {
				success: false,
				streamResult: streamResult,
				toolCalls: detectedToolCalls,
				error: "multi_tool_calls",
			}
		}
		// check that the tool call is diff edit tool call
		if (detectedToolCalls[0].name !== "replace_in_file") {
			return {
				success: false,
				streamResult: streamResult,
				toolCalls: detectedToolCalls,
				error: "wrong_tool_call",
			}
		}
		// Exactly one replace_in_file call from here on; both of its
		// parameters must be present.
		const toolCall = detectedToolCalls[0]
		const diffToolPath = toolCall.input.path
		const diffToolContent = toolCall.input.diff
		if (!diffToolPath || !diffToolContent) {
			return {
				success: false,
				streamResult: streamResult,
				toolCalls: detectedToolCalls,
				error: "tool_call_params_undefined",
			}
		}
		// check that we are editing the correct file path
		log(input.isVerbose, `Expected file path: "${originalFilePath}"`)
		log(input.isVerbose, `Actual file path used: "${diffToolPath}"`)
		if (diffToolPath !== originalFilePath) {
			log(input.isVerbose, `❌ File path mismatch detected!`)
			// Enhanced logging:
			if (streamResult?.assistantMessage) {
				log(input.isVerbose, `  Full model output (assistantMessage):`)
				log(input.isVerbose, `  -----------------------------------------`)
				log(input.isVerbose, `  ${streamResult.assistantMessage}`)
				log(input.isVerbose, `  -----------------------------------------`)
			}
			if (toolCall) {
				log(input.isVerbose, `  Parsed tool call that caused mismatch:`)
				log(input.isVerbose, `  ${JSON.stringify(toolCall, null, 2)}`)
				log(input.isVerbose, `  -----------------------------------------`)
			}
			return {
				success: false,
				streamResult: streamResult,
				toolCalls: detectedToolCalls,
				error: "wrong_file_edited",
			}
		}
		// checking if the diff edit succeeds, if it failed it will throw an error
		let diffSuccess = true
		let replacementData: any = undefined
		try {
			// strict=true: the selected diff-apply implementation throws on
			// any hunk it cannot match.
			const result = await constructNewFileContent(diffToolContent, originalFile, true)
			// Check if result is an object with replacements (new format)
			if (typeof result === 'object' && result !== null && 'replacements' in result) {
				replacementData = result.replacements
			}
			// If it's just a string, diffSuccess stays true and replacementData stays undefined
		} catch (error: any) {
			diffSuccess = false
			log(input.isVerbose, `ERROR: ${error}`)
		}
		// Note: success=true means "evaluation ran to completion"; whether the
		// diff actually applied is reported separately via diffEditSuccess.
		return {
			success: true,
			streamResult: streamResult,
			toolCalls: detectedToolCalls,
			diffEdit: diffToolContent,
			diffEditSuccess: diffSuccess,
			replacementData: replacementData,
		}
	} catch (error: any) {
		// Catch-all so callers always get a TestResult, never an exception.
		return {
			success: false,
			error: "other_error",
			errorString: error.message || error.toString(),
		}
	}
}