Selaa lähdekoodia

Ironing out some kinks when running evals with high parallelism (#2280)

Chris Estreich 9 kuukautta sitten
vanhempi
sitoutus
8f356711f1

+ 94 - 129
evals/apps/cli/src/index.ts

@@ -33,13 +33,17 @@ import { IpcServer, IpcClient } from "@evals/ipc"
 import { __dirname, extensionDevelopmentPath, exercisesPath } from "./paths.js"
 import { getExercises } from "./exercises.js"
 
-const maxConcurrency = 2
-const taskTimeLimit = 5 * 60 * 1_000
+type TaskResult = { success: boolean; retry: boolean }
+type TaskPromise = Promise<TaskResult>
+
+const MAX_CONCURRENCY = 20
+const TASK_TIMEOUT = 10 * 60 * 1_000
+const UNIT_TEST_TIMEOUT = 60 * 1_000
 
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
 	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
 	java: { commands: ["./gradlew test"] }, // timeout --foreground 15s bash -c "cd '$dir' && ./gradlew test > /dev/null 2>&1"
-	javascript: { commands: ["pnpm install", "pnpm test"], timeout: 30_000 }, // timeout 30s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
+	javascript: { commands: ["pnpm install", "pnpm test"] }, // timeout 15s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
 	python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, // timeout 15s bash -c "cd '$dir' && uv run python3 -m pytest -o markers=task *_test.py"
 	rust: { commands: ["cargo test"] }, // timeout 15s bash -c "cd '$dir' && cargo test > /dev/null 2>&1"
 }
@@ -107,40 +111,42 @@ const run = async (toolbox: GluegunToolbox) => {
 	const server = new IpcServer(run.socketPath, () => {})
 	server.listen()
 
-	// server.on(IpcMessageType.Connect, (clientId) => {
-	// 	server.send(clientId, {
-	// 		type: IpcMessageType.TaskEvent,
-	// 		origin: IpcOrigin.Server,
-	// 		data: { eventName: RooCodeEventName.Connect, taskId: -1 },
-	// 	})
-	// })
-
-	const runningPromises: Promise<void>[] = []
+	const runningPromises: TaskPromise[] = []
 
+	// Retries aren't implemented yet, but the return values are set up to
+	// support them.
 	const processTask = async (task: Task) => {
 		if (task.finishedAt === null) {
-			await runExercise({ run, task, server })
+			const { retry } = await runExercise({ run, task, server })
+
+			if (retry) {
+				return { success: false, retry: true }
+			}
 		}
 
 		if (task.passed === null) {
 			const passed = await runUnitTest({ task })
 			await updateTask(task.id, { passed })
+			return { success: passed, retry: false }
+		} else {
+			return { success: task.passed, retry: false }
 		}
 	}
 
-	for (const task of tasks) {
-		const taskPromise = processTask(task)
-		runningPromises.push(taskPromise)
+	const processTaskResult = async (task: Task, promise: TaskPromise) => {
+		const index = runningPromises.indexOf(promise)
 
-		taskPromise.finally(() => {
-			const index = runningPromises.indexOf(taskPromise)
+		if (index > -1) {
+			runningPromises.splice(index, 1)
+		}
+	}
 
-			if (index > -1) {
-				runningPromises.splice(index, 1)
-			}
-		})
+	for (const task of tasks) {
+		const promise = processTask(task)
+		runningPromises.push(promise)
+		promise.then(() => processTaskResult(task, promise))
 
-		if (runningPromises.length >= maxConcurrency) {
+		if (runningPromises.length > MAX_CONCURRENCY) {
 			await Promise.race(runningPromises)
 		}
 	}
@@ -148,89 +154,61 @@ const run = async (toolbox: GluegunToolbox) => {
 	await Promise.all(runningPromises)
 
 	const result = await finishRun(run.id)
-	try {
-		console.log("[cli#run]", result)
-		// eslint-disable-next-line @typescript-eslint/no-unused-vars
-	} catch (error) {
-		// console.error(error)
-	}
+	console.log("[cli#run]", result)
 
 	console.log(await execa({ cwd: exercisesPath })`git add .`)
 	console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
 }
 
-const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }) => {
+const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
 	const { language, exercise } = task
 	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
 	const dirname = path.dirname(run.socketPath)
+	const workspacePath = path.resolve(exercisesPath, language, exercise)
 	const taskSocketPath = path.resolve(dirname, `${dirname}/task-${task.id}.sock`)
 
-	const controller = new AbortController()
-	const cancelSignal = controller.signal
-
 	// If debugging:
 	// Use --wait --log trace or --verbose.
-	const codeCommand = `code --disable-workspace-trust`
+	// Don't await execa and store result as subprocess.
+	// subprocess.stdout.pipe(process.stdout)
+
+	// Sleep for a random amount of time before opening a new VSCode window.
+	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * MAX_CONCURRENCY * 1_000))
+	console.log(`Opening new VS Code window at ${workspacePath}`)
 
 	await execa({
 		env: {
 			ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
 		},
 		shell: "/bin/bash",
-		cancelSignal,
-	})`${codeCommand} -n ${path.resolve(exercisesPath, language, exercise)}`
+	})`code --disable-workspace-trust -n ${workspacePath}`
 
-	// If debugging:
-	// Don't await execa and store result as subprocess.
-	// subprocess.stdout.pipe(process.stdout)
-
-	// Give VSCode some time to spawn before connectint to its unix socket.
-	await new Promise((resolve) => setTimeout(resolve, 1_000))
+	// Give VSCode some time to spawn before connecting to its unix socket.
+	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
 	console.log(`Connecting to ${taskSocketPath}`)
+	const client = new IpcClient(taskSocketPath)
 
-	const createClient = (taskSocketPath: string) => {
-		const ipcClient = new IpcClient(taskSocketPath)
-
-		ipcClient.on(IpcMessageType.Ack, (ack) => {
-			console.log(`[cli#runExercise | ${language} / ${exercise}] ack`, ack)
-		})
-
-		return ipcClient
-	}
-
-	let tries = 0
-	let client = createClient(taskSocketPath)
-
-	while (++tries < 5) {
-		try {
-			await pWaitFor(() => client.isReady, { interval: 100, timeout: 5_000 })
-			break
-		} catch (error) {
-			console.error(error)
-			client.disconnect()
-			client = createClient(taskSocketPath)
-		}
+	try {
+		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
+		// eslint-disable-next-line @typescript-eslint/no-unused-vars
+	} catch (error) {
+		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
+		client.disconnect()
+		return { success: false, retry: false }
 	}
 
-	let isTaskFinished = false
+	let taskStartedAt = Date.now()
+	let taskFinishedAt: number | undefined
+	let taskMetricsId: number | undefined
+	let rooTaskId: string | undefined
 	let isClientDisconnected = false
 
-	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
-		isTaskFinished = true
-		isClientDisconnected = true
-	})
-
 	const ignoreEvents: RooCodeEventName[] = [
-		// RooCodeEventName.Message,
+		RooCodeEventName.Message,
 		RooCodeEventName.TaskTokenUsageUpdated,
 		RooCodeEventName.TaskAskResponded,
 	]
 
-	let taskStartedAt = Date.now()
-	let taskMetricsId: number | undefined
-	let rooTaskId: string | undefined
-
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 		const { eventName, payload } = taskEvent
 
@@ -287,44 +265,43 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 		}
 
 		if (eventName === RooCodeEventName.TaskCompleted || eventName === RooCodeEventName.TaskAborted) {
+			taskFinishedAt = Date.now()
 			await updateTask(task.id, { finishedAt: new Date() })
-			isTaskFinished = true
 		}
 	})
 
-	if (client.isReady) {
-		client.sendMessage({
-			type: IpcMessageType.TaskCommand,
-			origin: IpcOrigin.Client,
-			clientId: client.clientId!,
+	client.on(IpcMessageType.Disconnect, async () => {
+		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
+		isClientDisconnected = true
+	})
+
+	console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
+
+	client.sendMessage({
+		type: IpcMessageType.TaskCommand,
+		origin: IpcOrigin.Client,
+		clientId: client.clientId!,
+		data: {
+			commandName: TaskCommandName.StartNewTask,
 			data: {
-				commandName: TaskCommandName.StartNewTask,
-				data: {
-					configuration: {
-						...rooCodeDefaults,
-						openRouterApiKey: process.env.OPENROUTER_API_KEY!,
-						...run.settings,
-					},
-					text: prompt,
-					newTab: true,
+				configuration: {
+					...rooCodeDefaults,
+					openRouterApiKey: process.env.OPENROUTER_API_KEY!,
+					...run.settings,
 				},
+				text: prompt,
+				newTab: true,
 			},
-		})
-
-		console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
-	} else {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
-		client.disconnect()
-		isTaskFinished = true
-		isClientDisconnected = true
-	}
+		},
+	})
 
 	try {
-		await pWaitFor(() => isTaskFinished, { interval: 1_000, timeout: taskTimeLimit })
+		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
 		console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
 
+		// Cancel the task.
 		if (rooTaskId && !isClientDisconnected) {
 			client.sendMessage({
 				type: IpcMessageType.TaskCommand,
@@ -333,35 +310,28 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 				data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
 			})
 
-			await new Promise((resolve) => setTimeout(resolve, 2_000))
+			// Give the server some time to cancel the task.
+			await new Promise((resolve) => setTimeout(resolve, 5_000))
 		}
 
+		// TODO: Notify clients that the task timed out.
 		await updateTask(task.id, { finishedAt: new Date() })
 	}
 
 	if (!isClientDisconnected) {
-		try {
-			if (rooTaskId) {
-				client.sendMessage({
-					type: IpcMessageType.TaskCommand,
-					origin: IpcOrigin.Client,
-					clientId: client.clientId!,
-					data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
-				})
-			}
-
-			client.disconnect()
-		} catch (error) {
-			console.error(error)
+		if (rooTaskId) {
+			client.sendMessage({
+				type: IpcMessageType.TaskCommand,
+				origin: IpcOrigin.Client,
+				clientId: client.clientId!,
+				data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
+			})
 		}
+
+		client.disconnect()
 	}
 
-	// try {
-	// 	console.log(`[cli#runExercise | ${language} / ${exercise}] aborting subprocess`)
-	// 	controller.abort()
-	// 	await subprocess
-	// } catch (error) {
-	// }
+	return { success: !!taskFinishedAt, retry: false }
 }
 
 const runUnitTest = async ({ task }: { task: Task }) => {
@@ -373,22 +343,17 @@ const runUnitTest = async ({ task }: { task: Task }) => {
 	let passed = true
 
 	for (const command of commands) {
-		// const controller = new AbortController()
-		// const cancelSignal = controller.signal
-		// const timeout = setTimeout(() => controller.abort(), cmd.timeout ?? 15_000)
+		const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT
 
 		try {
-			const result = await execa({ cwd, shell: true, reject: false /* , cancelSignal */ })`${command}`
-			// console.log('[cli#run] execa result =', { ...result, cwd, command })
-
-			// clearTimeout(timeout)
+			const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
 
 			if (result.failed) {
 				passed = false
 				break
 			}
 		} catch (error) {
-			console.log("[cli#run] execa error =", error)
+			console.log("[cli#runUnitTest]", error)
 			passed = false
 			break
 		}

+ 19 - 12
evals/apps/web/src/app/home.tsx

@@ -1,14 +1,14 @@
 "use client"
 
+import { useMemo } from "react"
 import { useRouter } from "next/navigation"
-import { Rocket } from "lucide-react"
+import Link from "next/link"
+import { ChevronRight, Rocket } from "lucide-react"
 
 import type { Run, TaskMetrics } from "@evals/db"
 
-import { formatCurrency, formatDuration } from "@/lib"
+import { formatCurrency, formatDuration, formatTokens } from "@/lib"
 import { Button, Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
-import { useMemo } from "react"
-import Link from "next/link"
 
 export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null })[] }) {
 	const router = useRouter()
@@ -20,32 +20,39 @@ export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null
 			<Table className="border border-t-0">
 				<TableHeader>
 					<TableRow>
-						<TableHead>ID</TableHead>
 						<TableHead>Model</TableHead>
-						<TableHead>Timestamp</TableHead>
 						<TableHead>Passed</TableHead>
 						<TableHead>Failed</TableHead>
 						<TableHead>% Correct</TableHead>
+						<TableHead className="text-center">Tokens In / Out</TableHead>
 						<TableHead>Cost</TableHead>
 						<TableHead>Duration</TableHead>
+						<TableHead />
 					</TableRow>
 				</TableHeader>
 				<TableBody>
 					{visibleRuns.length ? (
 						visibleRuns.map(({ taskMetrics, ...run }) => (
 							<TableRow key={run.id}>
-								<TableCell>
-									<Button variant="link" asChild>
-										<Link href={`/runs/${run.id}`}>{run.id}</Link>
-									</Button>
-								</TableCell>
 								<TableCell>{run.model}</TableCell>
-								<TableCell>{new Date(run.createdAt).toLocaleString()}</TableCell>
 								<TableCell>{run.passed}</TableCell>
 								<TableCell>{run.failed}</TableCell>
 								<TableCell>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</TableCell>
+								<TableCell>
+									<div className="flex items-center justify-evenly">
+										<div>{formatTokens(taskMetrics!.tokensIn)}</div>/
+										<div>{formatTokens(taskMetrics!.tokensOut)}</div>
+									</div>
+								</TableCell>
 								<TableCell>{formatCurrency(taskMetrics!.cost)}</TableCell>
 								<TableCell>{formatDuration(taskMetrics!.duration)}</TableCell>
+								<TableCell>
+									<Button variant="ghost" size="icon" asChild>
+										<Link href={`/runs/${run.id}`}>
+											<ChevronRight />
+										</Link>
+									</Button>
+								</TableCell>
 							</TableRow>
 						))
 					) : (

+ 1 - 1
evals/apps/web/src/app/layout.tsx

@@ -11,7 +11,7 @@ const fontSans = Geist({ variable: "--font-sans", subsets: ["latin"] })
 const fontMono = Geist_Mono({ variable: "--font-mono", subsets: ["latin"] })
 
 export const metadata: Metadata = {
-	title: "Roo Code Benchmarks",
+	title: "Roo Code Evals",
 }
 
 export default function RootLayout({

+ 2 - 16
evals/apps/web/src/app/runs/[id]/run.tsx

@@ -1,6 +1,6 @@
 "use client"
 
-import { useState, useRef, useEffect } from "react"
+import { useState, useRef } from "react"
 import { LoaderCircle, SquareTerminal } from "lucide-react"
 
 import * as db from "@evals/db"
@@ -13,7 +13,6 @@ import {
 	DrawerHeader,
 	DrawerTitle,
 	ScrollArea,
-	Separator,
 	Table,
 	TableBody,
 	TableCell,
@@ -30,19 +29,6 @@ export function Run({ run }: { run: db.Run }) {
 	const scrollAreaRef = useRef<HTMLDivElement>(null)
 	const [selectedTask, setSelectedTask] = useState<db.Task>()
 
-	useEffect(() => {
-		if (selectedTask) {
-			const scrollArea = scrollAreaRef.current
-
-			if (scrollArea) {
-				scrollArea.scrollTo({
-					top: scrollArea.scrollHeight,
-					behavior: "smooth",
-				})
-			}
-		}
-	}, [selectedTask, outputCounts])
-
 	return (
 		<>
 			<div>
@@ -51,7 +37,7 @@ export function Run({ run }: { run: db.Run }) {
 						<div>{run.model}</div>
 						{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
 					</div>
-					<ConnectionStatus status={status} pid={run.pid} />
+					{!run.taskMetricsId && <ConnectionStatus status={status} pid={run.pid} />}
 				</div>
 				{!tasks ? (
 					<LoaderCircle className="size-4 animate-spin" />

+ 17 - 3
evals/apps/web/src/app/runs/new/new-run.tsx

@@ -82,15 +82,29 @@ export function NewRun() {
 	const [model, suite, settings] = watch(["model", "suite", "settings"])
 
 	const onSubmit = useCallback(
-		async (data: FormValues) => {
+		async ({ settings, ...data }: FormValues) => {
 			try {
-				const { id } = await createRun(data)
+				const openRouterModel = models.data?.find(({ id }) => id === data.model)
+
+				if (!openRouterModel) {
+					throw new Error(`Model not found: ${data.model}`)
+				}
+
+				const { id } = await createRun({
+					...data,
+					settings: {
+						...settings,
+						openRouterModelId: openRouterModel.id,
+						openRouterModelInfo: openRouterModel.modelInfo,
+					},
+				})
+
 				router.push(`/runs/${id}`)
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[router],
+		[router, models.data],
 	)
 
 	const onFilterModels = useCallback(

+ 42 - 3
evals/apps/web/src/hooks/use-open-router-models.ts

@@ -1,17 +1,41 @@
 import { z } from "zod"
 import { useQuery } from "@tanstack/react-query"
 
+import { type ModelInfo } from "@evals/types"
+
+const supportsPromptCache = ["anthropic/claude-3.7-sonnet", "anthropic/claude-3.5-sonnet", "anthropic/claude-3-5-haiku"]
+
+const supportsComputerUse = ["anthropic/claude-3.7-sonnet", "anthropic/claude-3.5-sonnet"]
+
+const supportsThinking = ["anthropic/claude-3.7-sonnet:thinking"]
+
+const parsePrice = (price?: string) => (price ? parseFloat(price) * 1_000_000 : undefined)
+
 export const openRouterModelSchema = z.object({
 	id: z.string(),
 	name: z.string(),
 	description: z.string(),
 	created: z.number(),
 	context_length: z.number(),
+	pricing: z.object({
+		prompt: z.string().optional(),
+		completion: z.string().optional(),
+	}),
+	top_provider: z
+		.object({
+			max_completion_tokens: z.number().nullish(),
+		})
+		.optional(),
+	architecture: z
+		.object({
+			modality: z.string(),
+		})
+		.optional(),
 })
 
-export type OpenRouterModel = z.infer<typeof openRouterModelSchema>
+export type OpenRouterModel = z.infer<typeof openRouterModelSchema> & { modelInfo: ModelInfo }
 
-export const getOpenRouterModels = async () => {
+export const getOpenRouterModels = async (): Promise<OpenRouterModel[]> => {
 	const response = await fetch("https://openrouter.ai/api/v1/models")
 
 	if (!response.ok) {
@@ -26,7 +50,22 @@ export const getOpenRouterModels = async () => {
 		return []
 	}
 
-	return result.data.data.sort((a, b) => a.name.localeCompare(b.name))
+	return result.data.data
+		.sort((a, b) => a.name.localeCompare(b.name))
+		.map((rawModel) => ({
+			...rawModel,
+			modelInfo: {
+				maxTokens: rawModel.top_provider?.max_completion_tokens ?? undefined,
+				contextWindow: rawModel.context_length,
+				supportsImages: rawModel.architecture?.modality?.includes("image"),
+				supportsPromptCache: supportsPromptCache.some((model) => rawModel.id.startsWith(model)),
+				supportsComputerUse: supportsComputerUse.some((model) => rawModel.id.startsWith(model)),
+				inputPrice: parsePrice(rawModel.pricing?.prompt),
+				outputPrice: parsePrice(rawModel.pricing?.completion),
+				description: rawModel.description,
+				thinking: supportsThinking.some((model) => rawModel.id.startsWith(model)),
+			},
+		}))
 }
 
 export const useOpenRouterModels = () =>

+ 9 - 1
evals/apps/web/src/lib/format-tokens.ts

@@ -3,5 +3,13 @@ export const formatTokens = (tokens: number) => {
 		return tokens.toString()
 	}
 
-	return `${(tokens / 1000).toFixed(1)}k`
+	if (tokens < 1000000) {
+		return `${(tokens / 1000).toFixed(1)}k`
+	}
+
+	if (tokens < 1000000000) {
+		return `${(tokens / 1000000).toFixed(1)}M`
+	}
+
+	return `${(tokens / 1000000000).toFixed(1)}B`
 }

+ 2 - 2
evals/apps/web/src/lib/schemas.ts

@@ -1,6 +1,6 @@
 import { z } from "zod"
 
-import { globalSettingsSchema } from "@evals/types"
+import { rooCodeSettingsSchema } from "@evals/types"
 
 /**
  * CreateRun
@@ -12,7 +12,7 @@ export const createRunSchema = z
 		description: z.string().optional(),
 		suite: z.enum(["full", "partial"]),
 		exercises: z.array(z.string()).optional(),
-		settings: globalSettingsSchema.optional(),
+		settings: rooCodeSettingsSchema.optional(),
 	})
 	.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
 		message: "Exercises are required when running a partial suite.",

+ 2 - 1
evals/packages/db/package.json

@@ -14,7 +14,8 @@
 		"db:pull": "pnpm drizzle-kit pull",
 		"db:check": "pnpm drizzle-kit check",
 		"db:up": "pnpm drizzle-kit up",
-		"db:studio": "pnpm drizzle-kit studio"
+		"db:studio": "pnpm drizzle-kit studio",
+		"db:enable-wal": "dotenvx run -f ../../.env -- tsx scripts/enable-wal.mts"
 	},
 	"dependencies": {
 		"@evals/types": "workspace:^",

+ 23 - 0
evals/packages/db/scripts/enable-wal.mts

@@ -0,0 +1,23 @@
+import { db } from "../src/db.js"
+
+const main = async () => {
+	// Enable WAL mode for better performance and concurrency.
+	// https://til.simonwillison.net/sqlite/enabling-wal-mode
+	try {
+		const { rows } = await db.$client.execute("PRAGMA journal_mode = WAL;")
+		const row = rows[0]
+
+		if (row) {
+			console.log(`SQLite journal mode set to: ${row[0]}`)
+			process.exit(0)
+		} else {
+			console.error("Failed to enable WAL mode: no rows returned")
+			process.exit(1)
+		}
+	} catch (error) {
+		console.error(error)
+		process.exit(1)
+	}
+}
+
+main()

+ 6 - 1
evals/packages/db/src/db.ts

@@ -2,4 +2,9 @@ import { drizzle } from "drizzle-orm/libsql"
 
 import { schema } from "./schema.js"
 
-export const db = drizzle({ schema, connection: { url: process.env.BENCHMARKS_DB_PATH! } })
+const connection = {
+	url: process.env.BENCHMARKS_DB_PATH!,
+	concurrency: 50,
+}
+
+export const db = drizzle({ schema, connection })

+ 2 - 2
evals/packages/db/src/schema.ts

@@ -2,7 +2,7 @@ import { sqliteTable, text, real, integer, blob, uniqueIndex } from "drizzle-orm
 import { relations } from "drizzle-orm"
 import { createInsertSchema } from "drizzle-zod"
 
-import { GlobalSettings, exerciseLanguages, globalSettingsSchema } from "@evals/types"
+import { GlobalSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
 
 /**
  * runs
@@ -28,7 +28,7 @@ export const runsRelations = relations(runs, ({ one }) => ({
 export type Run = typeof runs.$inferSelect
 
 export const insertRunSchema = createInsertSchema(runs).omit({ id: true, createdAt: true }).extend({
-	settings: globalSettingsSchema.optional(),
+	settings: rooCodeSettingsSchema.optional(),
 })
 
 export type InsertRun = Omit<typeof runs.$inferInsert, "id" | "createdAt">

+ 16 - 1
evals/packages/types/src/roo-code-defaults.ts

@@ -4,6 +4,22 @@ export const rooCodeDefaults: RooCodeSettings = {
 	apiProvider: "openrouter",
 	openRouterModelId: "google/gemini-2.0-flash-001", // "anthropic/claude-3.7-sonnet",
 
+	// apiProvider: "openai",
+	// openAiBaseUrl: "http://hrudolph.duckdns.org:4269/api/v1",
+	// openAiApiKey: process.env.OPENAI_API_KEY,
+	// openAiModelId: "models/gemini-2.5-pro-exp-03-25",
+	// openAiCustomModelInfo: {
+	// 	maxTokens: 65536,
+	// 	contextWindow: 1000000,
+	// 	supportsImages: true,
+	// 	supportsPromptCache: false,
+	// 	inputPrice: 0,
+	// 	outputPrice: 0,
+	// 	description:
+	// 		"Gemini 2.5 Pro is Google’s state-of-the-art AI model designed for advanced reasoning, coding, mathematics, and scientific tasks. It employs “thinking” capabilities, enabling it to reason through responses with enhanced accuracy and nuanced context handling. Gemini 2.5 Pro achieves top-tier performance on multiple benchmarks, including first-place positioning on the LMArena leaderboard, reflecting superior human-preference alignment and complex problem-solving abilities.",
+	// 	thinking: false,
+	// },
+
 	pinnedApiConfigs: {},
 	lastShownAnnouncementId: "mar-20-2025-3-10",
 
@@ -47,7 +63,6 @@ export const rooCodeDefaults: RooCodeSettings = {
 	diffEnabled: true,
 	fuzzyMatchThreshold: 1.0,
 	experiments: {
-		multi_search_and_replace: false,
 		search_and_replace: true,
 		insert_content: false,
 		powerSteering: false,

+ 1 - 7
evals/packages/types/src/roo-code.ts

@@ -270,12 +270,7 @@ export type CustomSupportPrompts = z.infer<typeof customSupportPromptsSchema>
  * ExperimentId
  */
 
-export const experimentIds = [
-	"search_and_replace",
-	"insert_content",
-	"powerSteering",
-	"multi_search_and_replace",
-] as const
+export const experimentIds = ["search_and_replace", "insert_content", "powerSteering"] as const
 
 export const experimentIdsSchema = z.enum(experimentIds)
 
@@ -289,7 +284,6 @@ const experimentsSchema = z.object({
 	search_and_replace: z.boolean(),
 	insert_content: z.boolean(),
 	powerSteering: z.boolean(),
-	multi_search_and_replace: z.boolean(),
 })
 
 export type Experiments = z.infer<typeof experimentsSchema>

+ 1 - 0
evals/scripts/setup.sh

@@ -296,6 +296,7 @@ fi
 if [[ ! -s /tmp/evals.db ]]; then
   echo "🗄️ Creating database..."
   pnpm --filter @evals/db db:push || exit 1
+  pnpm --filter @evals/db db:enable-wal || exit 1
 fi
 
 if ! grep -q "OPENROUTER_API_KEY" .env; then

+ 14 - 10
src/core/Cline.ts

@@ -105,6 +105,7 @@ export type ClineOptions = {
 	enableCheckpoints?: boolean
 	checkpointStorage?: CheckpointStorage
 	fuzzyMatchThreshold?: number
+	consecutiveMistakeLimit?: number
 	task?: string
 	images?: string[]
 	historyItem?: HistoryItem
@@ -135,7 +136,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 	customInstructions?: string
 	diffStrategy?: DiffStrategy
 	diffEnabled: boolean = false
-	fuzzyMatchThreshold: number = 1.0
+	fuzzyMatchThreshold: number
 
 	apiConversationHistory: (Anthropic.MessageParam & { ts?: number })[] = []
 	clineMessages: ClineMessage[] = []
@@ -144,10 +145,11 @@ export class Cline extends EventEmitter<ClineEvents> {
 	private askResponseText?: string
 	private askResponseImages?: string[]
 	private lastMessageTs?: number
-	// Not private since it needs to be accessible by tools
+	// Not private since it needs to be accessible by tools.
 	consecutiveMistakeCount: number = 0
+	consecutiveMistakeLimit: number
 	consecutiveMistakeCountForApplyDiff: Map<string, number> = new Map()
-	// Not private since it needs to be accessible by tools
+	// Not private since it needs to be accessible by tools.
 	providerRef: WeakRef<ClineProvider>
 	private abort: boolean = false
 	didFinishAbortingStream = false
@@ -178,10 +180,11 @@ export class Cline extends EventEmitter<ClineEvents> {
 		provider,
 		apiConfiguration,
 		customInstructions,
-		enableDiff,
+		enableDiff = false,
 		enableCheckpoints = true,
 		checkpointStorage = "task",
-		fuzzyMatchThreshold,
+		fuzzyMatchThreshold = 1.0,
+		consecutiveMistakeLimit = 3,
 		task,
 		images,
 		historyItem,
@@ -189,7 +192,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 		startTask = true,
 		rootTask,
 		parentTask,
-		taskNumber,
+		taskNumber = -1,
 		onCreated,
 	}: ClineOptions) {
 		super()
@@ -211,8 +214,9 @@ export class Cline extends EventEmitter<ClineEvents> {
 		this.urlContentFetcher = new UrlContentFetcher(provider.context)
 		this.browserSession = new BrowserSession(provider.context)
 		this.customInstructions = customInstructions
-		this.diffEnabled = enableDiff ?? false
-		this.fuzzyMatchThreshold = fuzzyMatchThreshold ?? 1.0
+		this.diffEnabled = enableDiff
+		this.fuzzyMatchThreshold = fuzzyMatchThreshold
+		this.consecutiveMistakeLimit = consecutiveMistakeLimit
 		this.providerRef = new WeakRef(provider)
 		this.diffViewProvider = new DiffViewProvider(this.cwd)
 		this.enableCheckpoints = enableCheckpoints
@@ -220,7 +224,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 
 		this.rootTask = rootTask
 		this.parentTask = parentTask
-		this.taskNumber = taskNumber ?? -1
+		this.taskNumber = taskNumber
 
 		if (historyItem) {
 			telemetryService.captureTaskRestarted(this.taskId)
@@ -1718,7 +1722,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 			throw new Error(`[Cline#recursivelyMakeClineRequests] task ${this.taskId}.${this.instanceId} aborted`)
 		}
 
-		if (this.consecutiveMistakeCount >= 3) {
+		if (this.consecutiveMistakeCount >= this.consecutiveMistakeLimit) {
 			const { response, text, images } = await this.ask(
 				"mistake_limit_reached",
 				this.api.getModel().id.includes("claude")

+ 26 - 4
src/core/webview/ClineProvider.ts

@@ -447,10 +447,29 @@ export class ClineProvider extends EventEmitter<ClineProviderEvents> implements
 		return this.initClineWithTask(task, images, parent)
 	}
 
-	// when initializing a new task, (not from history but from a tool command new_task) there is no need to remove the previouse task
-	// since the new task is a sub task of the previous one, and when it finishes it is removed from the stack and the caller is resumed
-	// in this way we can have a chain of tasks, each one being a sub task of the previous one until the main task is finished
-	public async initClineWithTask(task?: string, images?: string[], parentTask?: Cline) {
+	// When initializing a new task (not from history but from a tool command
+	// new_task) there is no need to remove the previous task, since the new
+	// task is a subtask of the previous one; when it finishes it is removed
+	// from the stack and the caller is resumed. In this way we can have a chain
+	// of tasks, each one being a subtask of the previous one, until the main
+	// task is finished.
+	public async initClineWithTask(
+		task?: string,
+		images?: string[],
+		parentTask?: Cline,
+		options: Partial<
+			Pick<
+				ClineOptions,
+				| "customInstructions"
+				| "enableDiff"
+				| "enableCheckpoints"
+				| "checkpointStorage"
+				| "fuzzyMatchThreshold"
+				| "consecutiveMistakeLimit"
+				| "experiments"
+			>
+		> = {},
+	) {
 		const {
 			apiConfiguration,
 			customModePrompts,
@@ -481,12 +500,15 @@ export class ClineProvider extends EventEmitter<ClineProviderEvents> implements
 			parentTask,
 			taskNumber: this.clineStack.length + 1,
 			onCreated: (cline) => this.emit("clineCreated", cline),
+			...options,
 		})
 
 		await this.addClineToStack(cline)
+
 		this.log(
 			`[subtasks] ${cline.parentTask ? "child" : "parent"} task ${cline.taskId}.${cline.instanceId} instantiated`,
 		)
+
 		return cline
 	}
 

+ 65 - 28
src/exports/api.ts

@@ -1,11 +1,14 @@
 import { EventEmitter } from "events"
 import * as vscode from "vscode"
+import fs from "fs/promises"
+import * as path from "path"
 
+import { getWorkspacePath } from "../utils/path"
 import { ClineProvider } from "../core/webview/ClineProvider"
 import { openClineInNewTab } from "../activate/registerCommands"
-
 import { RooCodeSettings, RooCodeEvents, RooCodeEventName, ClineMessage } from "../schemas"
 import { IpcOrigin, IpcMessageType, TaskCommandName, TaskEvent } from "../schemas/ipc"
+
 import { RooCodeAPI } from "./interface"
 import { IpcServer } from "./ipc"
 import { outputChannelLog } from "./log"
@@ -18,6 +21,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 	private readonly ipc?: IpcServer
 	private readonly taskMap = new Map<string, ClineProvider>()
 	private readonly log: (...args: unknown[]) => void
+	private logfile?: string
 
 	constructor(
 		outputChannel: vscode.OutputChannel,
@@ -31,12 +35,16 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		this.sidebarProvider = provider
 		this.context = provider.context
 
-		this.log = enableLogging
-			? (...args: unknown[]) => {
-					outputChannelLog(this.outputChannel, ...args)
-					console.log(args)
-				}
-			: () => {}
+		if (enableLogging) {
+			this.log = (...args: unknown[]) => {
+				outputChannelLog(this.outputChannel, ...args)
+				console.log(args)
+			}
+
+			this.logfile = path.join(getWorkspacePath(), "roo-code-messages.log")
+		} else {
+			this.log = () => {}
+		}
 
 		this.registerListeners(this.sidebarProvider)
 
@@ -89,6 +97,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		let provider: ClineProvider
 
 		if (newTab) {
+			await vscode.commands.executeCommand("workbench.action.files.revert")
 			await vscode.commands.executeCommand("workbench.action.closeAllEditors")
 
 			if (!this.tabProvider) {
@@ -116,7 +125,10 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		await provider.postMessageToWebview({ type: "action", action: "chatButtonClicked" })
 		await provider.postMessageToWebview({ type: "invoke", invoke: "newChat", text, images })
 
-		const { taskId } = await provider.initClineWithTask(text, images)
+		const { taskId } = await provider.initClineWithTask(text, images, undefined, {
+			consecutiveMistakeLimit: Number.MAX_SAFE_INTEGER,
+		})
+
 		return taskId
 	}
 
@@ -163,8 +175,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		await this.sidebarProvider.postStateToWebview()
 	}
 
-	public async createProfile(name: string): Promise<string> {
-		// Input validation
+	public async createProfile(name: string) {
 		if (!name || !name.trim()) {
 			throw new Error("Profile name cannot be empty")
 		}
@@ -176,32 +187,33 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 			throw new Error(`A profile with the name "${name}" already exists`)
 		}
 
-		// Generate unique ID and create profile
 		const id = this.sidebarProvider.providerSettingsManager.generateId()
-		const newProfile = {
-			id,
-			name: name.trim(),
-			apiProvider: "openai" as const, // Type assertion for better type safety
-		}
 
-		// Update configuration with new profile
 		await this.setConfiguration({
 			...currentSettings,
-			listApiConfigMeta: [...profiles, newProfile],
+			listApiConfigMeta: [
+				...profiles,
+				{
+					id,
+					name: name.trim(),
+					apiProvider: "openai" as const,
+				},
+			],
 		})
+
 		return id
 	}
 
-	public getProfiles(): string[] {
-		const profiles = this.getConfiguration().listApiConfigMeta || []
-		return profiles.map((profile) => profile.name)
+	public getProfiles() {
+		return (this.getConfiguration().listApiConfigMeta || []).map((profile) => profile.name)
 	}
 
-	public async setActiveProfile(name: string): Promise<void> {
+	public async setActiveProfile(name: string) {
 		const currentSettings = this.getConfiguration()
 		const profiles = currentSettings.listApiConfigMeta || []
 
 		const profile = profiles.find((p) => p.name === name)
+
 		if (!profile) {
 			throw new Error(`Profile with name "${name}" does not exist`)
 		}
@@ -212,14 +224,15 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		})
 	}
 
-	public getActiveProfile(): string | undefined {
+	public getActiveProfile() {
 		return this.getConfiguration().currentApiConfigName
 	}
 
-	public async deleteProfile(name: string): Promise<void> {
+	public async deleteProfile(name: string) {
 		const currentSettings = this.getConfiguration()
 		const profiles = currentSettings.listApiConfigMeta || []
 		const targetIndex = profiles.findIndex((p) => p.name === name)
+
 		if (targetIndex === -1) {
 			throw new Error(`Profile with name "${name}" does not exist`)
 		}
@@ -227,7 +240,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		const profileToDelete = profiles[targetIndex]
 		profiles.splice(targetIndex, 1)
 
-		// If we're deleting the active profile, clear the currentApiConfigName
+		// If we're deleting the active profile, clear the currentApiConfigName.
 		const newSettings: RooCodeSettings = {
 			...currentSettings,
 			listApiConfigMeta: profiles,
@@ -236,6 +249,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 					? undefined
 					: currentSettings.currentApiConfigName,
 		}
+
 		await this.setConfiguration(newSettings)
 	}
 
@@ -245,12 +259,19 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 
 	private registerListeners(provider: ClineProvider) {
 		provider.on("clineCreated", (cline) => {
-			cline.on("taskStarted", () => {
+			cline.on("taskStarted", async () => {
 				this.emit(RooCodeEventName.TaskStarted, cline.taskId)
 				this.taskMap.set(cline.taskId, provider)
+				await this.fileLog(`[${new Date().toISOString()}] taskStarted -> ${cline.taskId}\n`)
 			})
 
-			cline.on("message", (message) => this.emit(RooCodeEventName.Message, { taskId: cline.taskId, ...message }))
+			cline.on("message", async (message) => {
+				this.emit(RooCodeEventName.Message, { taskId: cline.taskId, ...message })
+
+				if (message.message.partial !== true) {
+					await this.fileLog(`[${new Date().toISOString()}] ${JSON.stringify(message.message, null, 2)}\n`)
+				}
+			})
 
 			cline.on("taskModeSwitched", (taskId, mode) => this.emit(RooCodeEventName.TaskModeSwitched, taskId, mode))
 
@@ -265,9 +286,13 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 				this.taskMap.delete(cline.taskId)
 			})
 
-			cline.on("taskCompleted", (_, usage) => {
+			cline.on("taskCompleted", async (_, usage) => {
 				this.emit(RooCodeEventName.TaskCompleted, cline.taskId, usage)
 				this.taskMap.delete(cline.taskId)
+
+				await this.fileLog(
+					`[${new Date().toISOString()}] taskCompleted -> ${cline.taskId} | ${JSON.stringify(usage, null, 2)}\n`,
+				)
 			})
 
 			cline.on("taskSpawned", (childTaskId) => this.emit(RooCodeEventName.TaskSpawned, cline.taskId, childTaskId))
@@ -277,4 +302,16 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 			this.emit(RooCodeEventName.TaskCreated, cline.taskId)
 		})
 	}
+
+	private async fileLog(message: string) { // Best-effort append of `message` to the log file.
+		if (!this.logfile) { // No log file set (logging disabled, or a previous write failed).
+			return
+		}
+
+		try {
+			await fs.appendFile(this.logfile, message, "utf8")
+		} catch (_) { // NOTE(review): the error itself is discarded without being reported.
+			this.logfile = undefined // Stop file logging: later calls return early at the guard above.
+		}
+	}
 }