Browse Source

Evals improvements (#2555)

* Evals improvements

* Remove debugging
Chris Estreich 10 months ago
parent
commit
ef9b339027

+ 4 - 2
evals/apps/cli/package.json

@@ -16,10 +16,12 @@
 		"execa": "^9.5.2",
 		"execa": "^9.5.2",
 		"gluegun": "^5.1.2",
 		"gluegun": "^5.1.2",
 		"p-map": "^7.0.3",
 		"p-map": "^7.0.3",
-		"p-wait-for": "^5.0.2"
+		"p-wait-for": "^5.0.2",
+		"ps-tree": "^1.2.0"
 	},
 	},
 	"devDependencies": {
 	"devDependencies": {
 		"@evals/eslint-config": "workspace:^",
 		"@evals/eslint-config": "workspace:^",
-		"@evals/typescript-config": "workspace:^"
+		"@evals/typescript-config": "workspace:^",
+		"@types/ps-tree": "^1.1.6"
 	}
 	}
 }
 }

+ 76 - 35
evals/apps/cli/src/index.ts

@@ -6,6 +6,7 @@ import pMap from "p-map"
 import pWaitFor from "p-wait-for"
 import pWaitFor from "p-wait-for"
 import { execa, parseCommandString } from "execa"
 import { execa, parseCommandString } from "execa"
 import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
 import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
+import psTree from "ps-tree"
 
 
 import {
 import {
 	type ExerciseLanguage,
 	type ExerciseLanguage,
@@ -36,8 +37,9 @@ import { getExercises } from "./exercises.js"
 type TaskResult = { success: boolean; retry: boolean }
 type TaskResult = { success: boolean; retry: boolean }
 type TaskPromise = Promise<TaskResult>
 type TaskPromise = Promise<TaskResult>
 
 
-const TASK_TIMEOUT = 10 * 60 * 1_000
-const UNIT_TEST_TIMEOUT = 60 * 1_000
+const TASK_START_DELAY = 10 * 1_000
+const TASK_TIMEOUT = 5 * 60 * 1_000
+const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
 
 
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
 	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
 	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
@@ -98,13 +100,11 @@ const run = async (toolbox: GluegunToolbox) => {
 		throw new Error("No tasks found.")
 		throw new Error("No tasks found.")
 	}
 	}
 
 
-	console.log(await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`)
-	console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
-	console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
-	console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
-	console.log(
-		await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
-	)
+	await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
+	await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`
+	await execa({ cwd: exercisesPath })`git checkout -f`
+	await execa({ cwd: exercisesPath })`git clean -fd`
+	await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
 
 
 	fs.writeFileSync(
 	fs.writeFileSync(
 		path.resolve(exercisesPath, "settings.json"),
 		path.resolve(exercisesPath, "settings.json"),
@@ -145,11 +145,11 @@ const run = async (toolbox: GluegunToolbox) => {
 		}
 		}
 	}
 	}
 
 
-	let delay = 0
+	let delay = TASK_START_DELAY
 
 
 	for (const task of tasks) {
 	for (const task of tasks) {
 		const promise = processTask(task, delay)
 		const promise = processTask(task, delay)
-		delay = delay + 5_000
+		delay = delay + TASK_START_DELAY
 		runningPromises.push(promise)
 		runningPromises.push(promise)
 		promise.then(() => processTaskResult(task, promise))
 		promise.then(() => processTaskResult(task, promise))
 
 
@@ -162,10 +162,10 @@ const run = async (toolbox: GluegunToolbox) => {
 	await Promise.all(runningPromises)
 	await Promise.all(runningPromises)
 
 
 	const result = await finishRun(run.id)
 	const result = await finishRun(run.id)
-	console.log("[cli#run]", result)
+	console.log(`${Date.now()} [cli#run]`, result)
 
 
-	console.log(await execa({ cwd: exercisesPath })`git add .`)
-	console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
+	await execa({ cwd: exercisesPath })`git add .`
+	await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
 }
 }
 
 
 const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
 const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
@@ -180,9 +180,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	// Don't await execa and store result as subprocess.
 	// Don't await execa and store result as subprocess.
 	// subprocess.stdout.pipe(process.stdout)
 	// subprocess.stdout.pipe(process.stdout)
 
 
-	// Sleep for a random amount of time before opening a new VSCode window.
-	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 5_000))
-	console.log(`Opening new VS Code window at ${workspacePath}`)
+	console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)
 
 
 	await execa({
 	await execa({
 		env: {
 		env: {
@@ -192,15 +190,15 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	})`code --disable-workspace-trust -n ${workspacePath}`
 	})`code --disable-workspace-trust -n ${workspacePath}`
 
 
 	// Give VSCode some time to spawn before connecting to its unix socket.
 	// Give VSCode some time to spawn before connecting to its unix socket.
-	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
-	console.log(`Connecting to ${taskSocketPath}`)
+	await new Promise((resolve) => setTimeout(resolve, 3_000))
+	console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
 	const client = new IpcClient(taskSocketPath)
 	const client = new IpcClient(taskSocketPath)
 
 
 	try {
 	try {
 		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
 		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
 	} catch (error) {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
 		client.disconnect()
 		client.disconnect()
 		return { success: false, retry: false }
 		return { success: false, retry: false }
 	}
 	}
@@ -220,16 +218,20 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 		const { eventName, payload } = taskEvent
 		const { eventName, payload } = taskEvent
 
 
-		server.broadcast({
-			type: IpcMessageType.TaskEvent,
-			origin: IpcOrigin.Server,
-			relayClientId: client.clientId!,
-			data: { ...taskEvent, taskId: task.id },
-		})
+		if (taskEvent.eventName !== RooCodeEventName.Message) {
+			server.broadcast({
+				type: IpcMessageType.TaskEvent,
+				origin: IpcOrigin.Server,
+				relayClientId: client.clientId!,
+				data: { ...taskEvent, taskId: task.id },
+			})
+		}
 
 
 		if (!ignoreEvents.includes(eventName)) {
 		if (!ignoreEvents.includes(eventName)) {
-			console.log(`[cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`)
-			console.log(payload)
+			console.log(
+				`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
+				payload,
+			)
 		}
 		}
 
 
 		if (eventName === RooCodeEventName.TaskStarted) {
 		if (eventName === RooCodeEventName.TaskStarted) {
@@ -279,11 +281,11 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	})
 	})
 
 
 	client.on(IpcMessageType.Disconnect, async () => {
 	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
 		isClientDisconnected = true
 		isClientDisconnected = true
 	})
 	})
 
 
-	console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
+	console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)
 
 
 	client.sendMessage({
 	client.sendMessage({
 		type: IpcMessageType.TaskCommand,
 		type: IpcMessageType.TaskCommand,
@@ -307,7 +309,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
 		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
 	} catch (error) {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)
 
 
 		// Cancel the task.
 		// Cancel the task.
 		if (rooTaskId && !isClientDisconnected) {
 		if (rooTaskId && !isClientDisconnected) {
@@ -351,17 +353,56 @@ const runUnitTest = async ({ task }: { task: Task }) => {
 	let passed = true
 	let passed = true
 
 
 	for (const command of commands) {
 	for (const command of commands) {
-		const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT
-
 		try {
 		try {
-			const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
+			console.log(
+				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
+			)
+			const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
+
+			const timeout = setTimeout(async () => {
+				const descendants = await new Promise<number[]>((resolve, reject) => {
+					psTree(subprocess.pid!, (err, children) => {
+						if (err) {
+							reject(err)
+						}
+
+						resolve(children.map((p) => parseInt(p.PID)))
+					})
+				})
+
+				if (descendants.length > 0) {
+					try {
+						console.log(
+							`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${descendants.join(" ")}`,
+						)
+
+						await execa`kill -9 ${descendants.join(" ")}`
+					} catch (error) {
+						console.error("Error killing descendant processes:", error)
+					}
+				}
+
+				console.log(
+					`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${subprocess.pid}`,
+				)
+
+				await execa`kill -9 ${subprocess.pid!}`
+			}, UNIT_TEST_TIMEOUT)
+
+			const result = await subprocess
+
+			console.log(
+				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
+			)
+
+			clearTimeout(timeout)
 
 
 			if (result.failed) {
 			if (result.failed) {
 				passed = false
 				passed = false
 				break
 				break
 			}
 			}
 		} catch (error) {
 		} catch (error) {
-			console.log("[cli#runUnitTest]", error)
+			console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
 			passed = false
 			passed = false
 			break
 			break
 		}
 		}

+ 41 - 56
evals/apps/web/src/app/runs/[id]/run.tsx

@@ -1,33 +1,44 @@
 "use client"
 "use client"
 
 
-import { useState, useRef } from "react"
-import { LoaderCircle, SquareTerminal } from "lucide-react"
+import { useMemo } from "react"
+import { LoaderCircle } from "lucide-react"
 
 
 import * as db from "@evals/db"
 import * as db from "@evals/db"
 
 
 import { formatCurrency, formatDuration, formatTokens } from "@/lib"
 import { formatCurrency, formatDuration, formatTokens } from "@/lib"
 import { useRunStatus } from "@/hooks/use-run-status"
 import { useRunStatus } from "@/hooks/use-run-status"
-import {
-	Drawer,
-	DrawerContent,
-	DrawerHeader,
-	DrawerTitle,
-	ScrollArea,
-	Table,
-	TableBody,
-	TableCell,
-	TableHead,
-	TableHeader,
-	TableRow,
-} from "@/components/ui"
+import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
 
 
 import { TaskStatus } from "./task-status"
 import { TaskStatus } from "./task-status"
 import { ConnectionStatus } from "./connection-status"
 import { ConnectionStatus } from "./connection-status"
 
 
+type TaskMetrics = Pick<db.TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">
+
 export function Run({ run }: { run: db.Run }) {
 export function Run({ run }: { run: db.Run }) {
-	const { tasks, status, output, outputCounts } = useRunStatus(run)
-	const scrollAreaRef = useRef<HTMLDivElement>(null)
-	const [selectedTask, setSelectedTask] = useState<db.Task>()
+	const { tasks, status, tokenUsage, usageUpdatedAt } = useRunStatus(run)
+
+	const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
+		const metrics: Record<number, TaskMetrics> = {}
+
+		tasks?.forEach((task) => {
+			const usage = tokenUsage.get(task.id)
+
+			if (task.finishedAt && task.taskMetrics) {
+				metrics[task.id] = task.taskMetrics
+			} else if (usage) {
+				metrics[task.id] = {
+					tokensIn: usage.totalTokensIn,
+					tokensOut: usage.totalTokensOut,
+					tokensContext: usage.contextTokens,
+					duration: usage.duration ?? 0,
+					cost: usage.totalCost,
+				}
+			}
+		})
+
+		return metrics
+		// eslint-disable-next-line react-hooks/exhaustive-deps
+	}, [tasks, tokenUsage, usageUpdatedAt])
 
 
 	return (
 	return (
 		<>
 		<>
@@ -57,38 +68,33 @@ export function Run({ run }: { run: db.Run }) {
 								<TableRow key={task.id}>
 								<TableRow key={task.id}>
 									<TableCell>
 									<TableCell>
 										<div className="flex items-center gap-2">
 										<div className="flex items-center gap-2">
-											<TaskStatus task={task} />
+											<TaskStatus
+												task={task}
+												running={!!task.startedAt || !!tokenUsage.get(task.id)}
+											/>
 											<div>
 											<div>
 												{task.language}/{task.exercise}
 												{task.language}/{task.exercise}
 											</div>
 											</div>
-											{(outputCounts[task.id] ?? 0) > 0 && (
-												<div
-													className="flex items-center gap-1 cursor-pointer"
-													onClick={() => setSelectedTask(task)}>
-													<SquareTerminal className="size-4" />
-													<div className="font-mono text-xs text-foreground/50">
-														{outputCounts[task.id]}
-													</div>
-												</div>
-											)}
 										</div>
 										</div>
 									</TableCell>
 									</TableCell>
-									{task.taskMetrics ? (
+									{taskMetrics[task.id] ? (
 										<>
 										<>
 											<TableCell className="font-mono text-xs">
 											<TableCell className="font-mono text-xs">
 												<div className="flex items-center justify-evenly">
 												<div className="flex items-center justify-evenly">
-													<div>{formatTokens(task.taskMetrics.tokensIn)}</div>/
-													<div>{formatTokens(task.taskMetrics.tokensOut)}</div>
+													<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
+													<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
 												</div>
 												</div>
 											</TableCell>
 											</TableCell>
 											<TableCell className="font-mono text-xs">
 											<TableCell className="font-mono text-xs">
-												{formatTokens(task.taskMetrics.tokensContext)}
+												{formatTokens(taskMetrics[task.id]!.tokensContext)}
 											</TableCell>
 											</TableCell>
 											<TableCell className="font-mono text-xs">
 											<TableCell className="font-mono text-xs">
-												{formatDuration(task.taskMetrics.duration)}
+												{taskMetrics[task.id]!.duration
+													? formatDuration(taskMetrics[task.id]!.duration)
+													: "-"}
 											</TableCell>
 											</TableCell>
 											<TableCell className="font-mono text-xs">
 											<TableCell className="font-mono text-xs">
-												{formatCurrency(task.taskMetrics.cost)}
+												{formatCurrency(taskMetrics[task.id]!.cost)}
 											</TableCell>
 											</TableCell>
 										</>
 										</>
 									) : (
 									) : (
@@ -100,27 +106,6 @@ export function Run({ run }: { run: db.Run }) {
 					</Table>
 					</Table>
 				)}
 				)}
 			</div>
 			</div>
-			<Drawer open={!!selectedTask} onOpenChange={() => setSelectedTask(undefined)}>
-				<DrawerContent>
-					<div className="mx-auto w-full max-w-2xl">
-						<DrawerHeader>
-							<DrawerTitle>
-								{selectedTask?.language}/{selectedTask?.exercise}
-							</DrawerTitle>
-						</DrawerHeader>
-						<div className="font-mono text-xs pb-12">
-							{selectedTask && (
-								<ScrollArea viewportRef={scrollAreaRef} className="h-96 rounded-sm border">
-									<div className="p-4">
-										<h4 className="mb-4 text-sm font-medium leading-none">Tags</h4>
-										{output.get(selectedTask.id)?.map((line, i) => <div key={i}>{line}</div>)}
-									</div>
-								</ScrollArea>
-							)}
-						</div>
-					</div>
-				</DrawerContent>
-			</Drawer>
 		</>
 		</>
 	)
 	)
 }
 }

+ 3 - 4
evals/apps/web/src/app/runs/[id]/task-status.tsx

@@ -4,16 +4,15 @@ import { type Task } from "@evals/db"
 
 
 type TaskStatusProps = {
 type TaskStatusProps = {
 	task: Task
 	task: Task
+	running: boolean
 }
 }
 
 
-export const TaskStatus = ({ task }: TaskStatusProps) => {
+export const TaskStatus = ({ task, running }: TaskStatusProps) => {
 	return task.passed === false ? (
 	return task.passed === false ? (
 		<CircleSlash className="size-4 text-destructive" />
 		<CircleSlash className="size-4 text-destructive" />
 	) : task.passed === true ? (
 	) : task.passed === true ? (
 		<CircleCheck className="size-4 text-green-500" />
 		<CircleCheck className="size-4 text-green-500" />
-	) : task.startedAt ? (
-		<LoaderCircle className="size-4 animate-spin" />
-	) : task.finishedAt ? (
+	) : running ? (
 		<LoaderCircle className="size-4 animate-spin" />
 		<LoaderCircle className="size-4 animate-spin" />
 	) : (
 	) : (
 		<CircleDashed className="size-4" />
 		<CircleDashed className="size-4" />

+ 13 - 1
evals/apps/web/src/app/runs/new/new-run.tsx

@@ -86,13 +86,25 @@ export function NewRun() {
 	const onSubmit = useCallback(
 	const onSubmit = useCallback(
 		async (values: FormValues) => {
 		async (values: FormValues) => {
 			try {
 			try {
+				if (mode === "openrouter") {
+					const openRouterModel = models.data?.find(({ id }) => id === model)
+
+					if (!openRouterModel) {
+						throw new Error("Model not found.")
+					}
+
+					const openRouterModelId = openRouterModel.id
+					const openRouterModelInfo = openRouterModel.modelInfo
+					values.settings = { ...(values.settings || {}), openRouterModelId, openRouterModelInfo }
+				}
+
 				const { id } = await createRun(values)
 				const { id } = await createRun(values)
 				router.push(`/runs/${id}`)
 				router.push(`/runs/${id}`)
 			} catch (e) {
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 			}
 		},
 		},
-		[router],
+		[mode, model, models.data, router],
 	)
 	)
 
 
 	const onFilterModels = useCallback(
 	const onFilterModels = useCallback(

+ 1 - 0
evals/apps/web/src/hooks/use-process-tree.ts

@@ -7,4 +7,5 @@ export const useProcessList = (pid: number | null) =>
 		queryKey: ["process-tree", pid],
 		queryKey: ["process-tree", pid],
 		queryFn: () => (pid ? getProcessList(pid) : []),
 		queryFn: () => (pid ? getProcessList(pid) : []),
 		enabled: !!pid,
 		enabled: !!pid,
+		refetchInterval: 30_000,
 	})
 	})

+ 19 - 23
evals/apps/web/src/hooks/use-run-status.ts

@@ -1,7 +1,7 @@
 import { useState, useCallback, useRef } from "react"
 import { useState, useCallback, useRef } from "react"
 import { useQuery, keepPreviousData } from "@tanstack/react-query"
 import { useQuery, keepPreviousData } from "@tanstack/react-query"
 
 
-import { RooCodeEventName, taskEventSchema } from "@evals/types"
+import { RooCodeEventName, taskEventSchema, TokenUsage } from "@evals/types"
 import { Run } from "@evals/db"
 import { Run } from "@evals/db"
 
 
 import { getTasks } from "@/lib/server/tasks"
 import { getTasks } from "@/lib/server/tasks"
@@ -9,14 +9,16 @@ import { useEventSource } from "@/hooks/use-event-source"
 
 
 export const useRunStatus = (run: Run) => {
 export const useRunStatus = (run: Run) => {
 	const [tasksUpdatedAt, setTasksUpdatedAt] = useState<number>()
 	const [tasksUpdatedAt, setTasksUpdatedAt] = useState<number>()
-	const outputRef = useRef<Map<number, string[]>>(new Map())
-	const [outputCounts, setOutputCounts] = useState<Record<number, number>>({})
+	const [usageUpdatedAt, setUsageUpdatedAt] = useState<number>()
+
+	const tokenUsage = useRef<Map<number, TokenUsage & { duration?: number }>>(new Map())
+	const startTimes = useRef<Map<number, number>>(new Map())
 
 
 	const { data: tasks } = useQuery({
 	const { data: tasks } = useQuery({
 		queryKey: ["run", run.id, tasksUpdatedAt],
 		queryKey: ["run", run.id, tasksUpdatedAt],
 		queryFn: async () => getTasks(run.id),
 		queryFn: async () => getTasks(run.id),
 		placeholderData: keepPreviousData,
 		placeholderData: keepPreviousData,
-		refetchInterval: 10_000,
+		refetchInterval: 30_000,
 	})
 	})
 
 
 	const url = `/api/runs/${run.id}/stream`
 	const url = `/api/runs/${run.id}/stream`
@@ -47,28 +49,17 @@ export const useRunStatus = (run: Run) => {
 
 
 		switch (eventName) {
 		switch (eventName) {
 			case RooCodeEventName.TaskStarted:
 			case RooCodeEventName.TaskStarted:
+				startTimes.current.set(taskId, Date.now())
+				break
 			case RooCodeEventName.TaskCompleted:
 			case RooCodeEventName.TaskCompleted:
 			case RooCodeEventName.TaskAborted:
 			case RooCodeEventName.TaskAborted:
 				setTasksUpdatedAt(Date.now())
 				setTasksUpdatedAt(Date.now())
 				break
 				break
-			case RooCodeEventName.Message: {
-				const [
-					{
-						message: { text },
-					},
-				] = payload
-
-				if (text) {
-					outputRef.current.set(taskId, [...(outputRef.current.get(taskId) || []), text])
-					const outputCounts: Record<number, number> = {}
-
-					for (const [taskId, messages] of outputRef.current.entries()) {
-						outputCounts[taskId] = messages.length
-					}
-
-					setOutputCounts(outputCounts)
-				}
-
+			case RooCodeEventName.TaskTokenUsageUpdated: {
+				const startTime = startTimes.current.get(taskId)
+				const duration = startTime ? Date.now() - startTime : undefined
+				tokenUsage.current.set(taskId, { ...payload[1], duration })
+				setUsageUpdatedAt(Date.now())
 				break
 				break
 			}
 			}
 		}
 		}
@@ -76,5 +67,10 @@ export const useRunStatus = (run: Run) => {
 
 
 	const status = useEventSource({ url, onMessage })
 	const status = useEventSource({ url, onMessage })
 
 
-	return { tasks, status, output: outputRef.current, outputCounts }
+	return {
+		status,
+		tasks,
+		tokenUsage: tokenUsage.current,
+		usageUpdatedAt,
+	}
 }
 }

+ 2 - 2
evals/packages/db/src/schema.ts

@@ -2,7 +2,7 @@ import { sqliteTable, text, real, integer, blob, uniqueIndex } from "drizzle-orm
 import { relations } from "drizzle-orm"
 import { relations } from "drizzle-orm"
 import { createInsertSchema } from "drizzle-zod"
 import { createInsertSchema } from "drizzle-zod"
 
 
-import { GlobalSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
+import { GlobalSettings, RooCodeSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
 
 
 /**
 /**
  * runs
  * runs
@@ -13,7 +13,7 @@ export const runs = sqliteTable("runs", {
 	taskMetricsId: integer({ mode: "number" }).references(() => taskMetrics.id),
 	taskMetricsId: integer({ mode: "number" }).references(() => taskMetrics.id),
 	model: text().notNull(),
 	model: text().notNull(),
 	description: text(),
 	description: text(),
-	settings: blob({ mode: "json" }).$type<GlobalSettings>(),
+	settings: blob({ mode: "json" }).$type<RooCodeSettings>(),
 	pid: integer({ mode: "number" }),
 	pid: integer({ mode: "number" }),
 	socketPath: text().notNull(),
 	socketPath: text().notNull(),
 	concurrency: integer({ mode: "number" }).default(2).notNull(),
 	concurrency: integer({ mode: "number" }).default(2).notNull(),

+ 1 - 1
evals/packages/ipc/src/client.ts

@@ -65,7 +65,7 @@ export class IpcClient extends EventEmitter<IpcClientEvents> {
 		const result = ipcMessageSchema.safeParse(data)
 		const result = ipcMessageSchema.safeParse(data)
 
 
 		if (!result.success) {
 		if (!result.success) {
-			this.log("[client#onMessage] invalid payload", data)
+			this.log("[client#onMessage] invalid payload", result.error, data)
 			return
 			return
 		}
 		}
 
 

+ 1 - 1
evals/packages/ipc/src/server.ts

@@ -83,7 +83,7 @@ export class IpcServer extends EventEmitter<IpcServerEvents> {
 		const result = ipcMessageSchema.safeParse(data)
 		const result = ipcMessageSchema.safeParse(data)
 
 
 		if (!result.success) {
 		if (!result.success) {
-			this.log("[server#onMessage] invalid payload", result.error)
+			this.log("[server#onMessage] invalid payload", result.error, data)
 			return
 			return
 		}
 		}
 
 

+ 11 - 21
evals/packages/types/src/roo-code-defaults.ts

@@ -2,25 +2,9 @@ import { RooCodeSettings } from "./roo-code.js"
 
 
 export const rooCodeDefaults: RooCodeSettings = {
 export const rooCodeDefaults: RooCodeSettings = {
 	apiProvider: "openrouter",
 	apiProvider: "openrouter",
-	openRouterModelId: "google/gemini-2.0-flash-001", // "anthropic/claude-3.7-sonnet",
+	openRouterUseMiddleOutTransform: false,
 
 
-	// apiProvider: "openai",
-	// openAiBaseUrl: "http://hrudolph.duckdns.org:4269/api/v1",
-	// openAiApiKey: process.env.OPENAI_API_KEY,
-	// openAiModelId: "models/gemini-2.5-pro-exp-03-25",
-	// openAiCustomModelInfo: {
-	// 	maxTokens: 65536,
-	// 	contextWindow: 1000000,
-	// 	supportsImages: true,
-	// 	supportsPromptCache: false,
-	// 	inputPrice: 0,
-	// 	outputPrice: 0,
-	// 	description:
-	// 		"Gemini 2.5 Pro is Google’s state-of-the-art AI model designed for advanced reasoning, coding, mathematics, and scientific tasks. It employs “thinking” capabilities, enabling it to reason through responses with enhanced accuracy and nuanced context handling. Gemini 2.5 Pro achieves top-tier performance on multiple benchmarks, including first-place positioning on the LMArena leaderboard, reflecting superior human-preference alignment and complex problem-solving abilities.",
-	// 	thinking: false,
-	// },
-
-	modelTemperature: null,
+	// modelTemperature: null,
 	// reasoningEffort: "high",
 	// reasoningEffort: "high",
 
 
 	pinnedApiConfigs: {},
 	pinnedApiConfigs: {},
@@ -60,12 +44,18 @@ export const rooCodeDefaults: RooCodeSettings = {
 	maxReadFileLine: 500,
 	maxReadFileLine: 500,
 
 
 	terminalOutputLineLimit: 500,
 	terminalOutputLineLimit: 500,
-	terminalShellIntegrationTimeout: 15000,
+	terminalShellIntegrationTimeout: 30_000,
+	// terminalCommandDelay: 0,
+	// terminalPowershellCounter: false,
+	// terminalZshClearEolMark: true,
+	// terminalZshOhMy: true,
+	// terminalZshP10k: false,
+	// terminalZdotdir: true,
 
 
-	diffEnabled: true,
+	diffEnabled: false,
 	fuzzyMatchThreshold: 1.0,
 	fuzzyMatchThreshold: 1.0,
 	experiments: {
 	experiments: {
-		search_and_replace: true,
+		search_and_replace: false,
 		insert_content: false,
 		insert_content: false,
 		powerSteering: false,
 		powerSteering: false,
 	},
 	},

+ 14 - 0
evals/packages/types/src/roo-code.ts

@@ -396,6 +396,7 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	apiModelId: undefined,
 	apiModelId: undefined,
 	apiKey: undefined,
 	apiKey: undefined,
 	anthropicBaseUrl: undefined,
 	anthropicBaseUrl: undefined,
+	anthropicUseAuthToken: undefined,
 	// Glama
 	// Glama
 	glamaModelId: undefined,
 	glamaModelId: undefined,
 	glamaModelInfo: undefined,
 	glamaModelInfo: undefined,
@@ -523,6 +524,12 @@ export const globalSettingsSchema = z.object({
 
 
 	terminalOutputLineLimit: z.number().optional(),
 	terminalOutputLineLimit: z.number().optional(),
 	terminalShellIntegrationTimeout: z.number().optional(),
 	terminalShellIntegrationTimeout: z.number().optional(),
+	terminalCommandDelay: z.number().optional(),
+	terminalPowershellCounter: z.boolean().optional(),
+	terminalZshClearEolMark: z.boolean().optional(),
+	terminalZshOhMy: z.boolean().optional(),
+	terminalZshP10k: z.boolean().optional(),
+	terminalZdotdir: z.boolean().optional(),
 
 
 	diffEnabled: z.boolean().optional(),
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
@@ -592,6 +599,12 @@ const globalSettingsRecord: GlobalSettingsRecord = {
 
 
 	terminalOutputLineLimit: undefined,
 	terminalOutputLineLimit: undefined,
 	terminalShellIntegrationTimeout: undefined,
 	terminalShellIntegrationTimeout: undefined,
+	terminalCommandDelay: undefined,
+	terminalPowershellCounter: undefined,
+	terminalZshClearEolMark: undefined,
+	terminalZshOhMy: undefined,
+	terminalZshP10k: undefined,
+	terminalZdotdir: undefined,
 
 
 	diffEnabled: undefined,
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
 	fuzzyMatchThreshold: undefined,
@@ -731,6 +744,7 @@ export const clineSays = [
 	"new_task",
 	"new_task",
 	"checkpoint_saved",
 	"checkpoint_saved",
 	"rooignore_error",
 	"rooignore_error",
+	"diff_error",
 ] as const
 ] as const
 
 
 export const clineSaySchema = z.enum(clineSays)
 export const clineSaySchema = z.enum(clineSays)

+ 6 - 0
evals/pnpm-lock.yaml

@@ -62,6 +62,9 @@ importers:
       p-wait-for:
       p-wait-for:
         specifier: ^5.0.2
         specifier: ^5.0.2
         version: 5.0.2
         version: 5.0.2
+      ps-tree:
+        specifier: ^1.2.0
+        version: 1.2.0
     devDependencies:
     devDependencies:
       '@evals/eslint-config':
       '@evals/eslint-config':
         specifier: workspace:^
         specifier: workspace:^
@@ -69,6 +72,9 @@ importers:
       '@evals/typescript-config':
       '@evals/typescript-config':
         specifier: workspace:^
         specifier: workspace:^
         version: link:../../config/typescript
         version: link:../../config/typescript
+      '@types/ps-tree':
+        specifier: ^1.1.6
+        version: 1.1.6
 
 
   apps/web:
   apps/web:
     dependencies:
     dependencies:

+ 22 - 11
evals/scripts/setup.sh

@@ -275,6 +275,25 @@ fi
 
 
 pnpm install --silent || exit 1
 pnpm install --silent || exit 1
 
 
+if ! command -v code &>/dev/null; then
+  echo "⚠️ Visual Studio Code cli is not installed"
+  exit 1
+else
+  VSCODE_VERSION=$(code --version | head -n 1)
+  echo "✅ Visual Studio Code is installed ($VSCODE_VERSION)"
+fi
+
+# To reset VSCode:
+# rm -rvf ~/.vscode && rm -rvf ~/Library/Application\ Support/Code
+
+echo "🔌 Installing Visual Studio Code extensions..."
+code --install-extension golang.go &>/dev/null || exit 1
+code --install-extension dbaeumer.vscode-eslint&>/dev/null || exit 1
+code --install-extension redhat.java &>/dev/null || exit 1
+code --install-extension ms-python.python&>/dev/null || exit 1
+code --install-extension rust-lang.rust-analyzer &>/dev/null || exit 1
+code --install-extension rooveterinaryinc.roo-cline &>/dev/null || exit 1
+
 if [[ ! -d "../../evals" ]]; then
 if [[ ! -d "../../evals" ]]; then
   if gh auth status &>/dev/null; then
   if gh auth status &>/dev/null; then
     read -p "🔗 Would you like to be able to share eval results? (Y/n): " fork_evals
     read -p "🔗 Would you like to be able to share eval results? (Y/n): " fork_evals
@@ -293,9 +312,9 @@ if [[ ! -s .env ]]; then
   cp .env.sample .env || exit 1
   cp .env.sample .env || exit 1
 fi
 fi
 
 
-echo "🗄️ Syncing database..."
-pnpm --filter @evals/db db:push || exit 1
-pnpm --filter @evals/db db:enable-wal || exit 1
+echo "🗄️ Syncing Roo Code evals database..."
+pnpm --filter @evals/db db:push &>/dev/null || exit 1
+pnpm --filter @evals/db db:enable-wal &>/dev/null || exit 1
 
 
 if ! grep -q "OPENROUTER_API_KEY" .env; then
 if ! grep -q "OPENROUTER_API_KEY" .env; then
   read -p "🔐 Enter your OpenRouter API key (sk-or-v1-...): " openrouter_api_key
   read -p "🔐 Enter your OpenRouter API key (sk-or-v1-...): " openrouter_api_key
@@ -304,14 +323,6 @@ if ! grep -q "OPENROUTER_API_KEY" .env; then
   echo "OPENROUTER_API_KEY=$openrouter_api_key" >> .env || exit 1
   echo "OPENROUTER_API_KEY=$openrouter_api_key" >> .env || exit 1
 fi
 fi
 
 
-if ! command -v code &>/dev/null; then
-  echo "⚠️ Visual Studio Code cli is not installed"
-  exit 1
-else
-  VSCODE_VERSION=$(code --version | head -n 1)
-  echo "✅ Visual Studio Code is installed ($VSCODE_VERSION)"
-fi
-
 if [[ ! -s "../bin/roo-code-latest.vsix" ]]; then
 if [[ ! -s "../bin/roo-code-latest.vsix" ]]; then
   build_extension
   build_extension
 else
 else