Explorar el Código

Evals improvements (#2555)

* Evals improvements

* Remove debugging
Chris Estreich hace 8 meses
padre
commit
ef9b339027

+ 4 - 2
evals/apps/cli/package.json

@@ -16,10 +16,12 @@
 		"execa": "^9.5.2",
 		"gluegun": "^5.1.2",
 		"p-map": "^7.0.3",
-		"p-wait-for": "^5.0.2"
+		"p-wait-for": "^5.0.2",
+		"ps-tree": "^1.2.0"
 	},
 	"devDependencies": {
 		"@evals/eslint-config": "workspace:^",
-		"@evals/typescript-config": "workspace:^"
+		"@evals/typescript-config": "workspace:^",
+		"@types/ps-tree": "^1.1.6"
 	}
 }

+ 76 - 35
evals/apps/cli/src/index.ts

@@ -6,6 +6,7 @@ import pMap from "p-map"
 import pWaitFor from "p-wait-for"
 import { execa, parseCommandString } from "execa"
 import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
+import psTree from "ps-tree"
 
 import {
 	type ExerciseLanguage,
@@ -36,8 +37,9 @@ import { getExercises } from "./exercises.js"
 type TaskResult = { success: boolean; retry: boolean }
 type TaskPromise = Promise<TaskResult>
 
-const TASK_TIMEOUT = 10 * 60 * 1_000
-const UNIT_TEST_TIMEOUT = 60 * 1_000
+const TASK_START_DELAY = 10 * 1_000
+const TASK_TIMEOUT = 5 * 60 * 1_000
+const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
 
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
 	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
@@ -98,13 +100,11 @@ const run = async (toolbox: GluegunToolbox) => {
 		throw new Error("No tasks found.")
 	}
 
-	console.log(await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`)
-	console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
-	console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
-	console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
-	console.log(
-		await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
-	)
+	await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
+	await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`
+	await execa({ cwd: exercisesPath })`git checkout -f`
+	await execa({ cwd: exercisesPath })`git clean -fd`
+	await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
 
 	fs.writeFileSync(
 		path.resolve(exercisesPath, "settings.json"),
@@ -145,11 +145,11 @@ const run = async (toolbox: GluegunToolbox) => {
 		}
 	}
 
-	let delay = 0
+	let delay = TASK_START_DELAY
 
 	for (const task of tasks) {
 		const promise = processTask(task, delay)
-		delay = delay + 5_000
+		delay = delay + TASK_START_DELAY
 		runningPromises.push(promise)
 		promise.then(() => processTaskResult(task, promise))
 
@@ -162,10 +162,10 @@ const run = async (toolbox: GluegunToolbox) => {
 	await Promise.all(runningPromises)
 
 	const result = await finishRun(run.id)
-	console.log("[cli#run]", result)
+	console.log(`${Date.now()} [cli#run]`, result)
 
-	console.log(await execa({ cwd: exercisesPath })`git add .`)
-	console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
+	await execa({ cwd: exercisesPath })`git add .`
+	await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
 }
 
 const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
@@ -180,9 +180,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	// Don't await execa and store result as subprocess.
 	// subprocess.stdout.pipe(process.stdout)
 
-	// Sleep for a random amount of time before opening a new VSCode window.
-	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 5_000))
-	console.log(`Opening new VS Code window at ${workspacePath}`)
+	console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)
 
 	await execa({
 		env: {
@@ -192,15 +190,15 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	})`code --disable-workspace-trust -n ${workspacePath}`
 
 	// Give VSCode some time to spawn before connecting to its unix socket.
-	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
-	console.log(`Connecting to ${taskSocketPath}`)
+	await new Promise((resolve) => setTimeout(resolve, 3_000))
+	console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
 	const client = new IpcClient(taskSocketPath)
 
 	try {
 		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
 		client.disconnect()
 		return { success: false, retry: false }
 	}
@@ -220,16 +218,20 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 		const { eventName, payload } = taskEvent
 
-		server.broadcast({
-			type: IpcMessageType.TaskEvent,
-			origin: IpcOrigin.Server,
-			relayClientId: client.clientId!,
-			data: { ...taskEvent, taskId: task.id },
-		})
+		if (taskEvent.eventName !== RooCodeEventName.Message) {
+			server.broadcast({
+				type: IpcMessageType.TaskEvent,
+				origin: IpcOrigin.Server,
+				relayClientId: client.clientId!,
+				data: { ...taskEvent, taskId: task.id },
+			})
+		}
 
 		if (!ignoreEvents.includes(eventName)) {
-			console.log(`[cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`)
-			console.log(payload)
+			console.log(
+				`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
+				payload,
+			)
 		}
 
 		if (eventName === RooCodeEventName.TaskStarted) {
@@ -279,11 +281,11 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	})
 
 	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
 		isClientDisconnected = true
 	})
 
-	console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
+	console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)
 
 	client.sendMessage({
 		type: IpcMessageType.TaskCommand,
@@ -307,7 +309,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)
 
 		// Cancel the task.
 		if (rooTaskId && !isClientDisconnected) {
@@ -351,17 +353,56 @@ const runUnitTest = async ({ task }: { task: Task }) => {
 	let passed = true
 
 	for (const command of commands) {
-		const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT
-
 		try {
-			const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
+			console.log(
+				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
+			)
+			const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
+
+			const timeout = setTimeout(async () => {
+				const descendants = await new Promise<number[]>((resolve, reject) => {
+					psTree(subprocess.pid!, (err, children) => {
+						if (err) {
+							reject(err)
+						}
+
+						resolve(children.map((p) => parseInt(p.PID)))
+					})
+				})
+
+				if (descendants.length > 0) {
+					try {
+						console.log(
+							`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${descendants.join(" ")}`,
+						)
+
+						await execa`kill -9 ${descendants.join(" ")}`
+					} catch (error) {
+						console.error("Error killing descendant processes:", error)
+					}
+				}
+
+				console.log(
+					`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${subprocess.pid}`,
+				)
+
+				await execa`kill -9 ${subprocess.pid!}`
+			}, UNIT_TEST_TIMEOUT)
+
+			const result = await subprocess
+
+			console.log(
+				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
+			)
+
+			clearTimeout(timeout)
 
 			if (result.failed) {
 				passed = false
 				break
 			}
 		} catch (error) {
-			console.log("[cli#runUnitTest]", error)
+			console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
 			passed = false
 			break
 		}

+ 41 - 56
evals/apps/web/src/app/runs/[id]/run.tsx

@@ -1,33 +1,44 @@
 "use client"
 
-import { useState, useRef } from "react"
-import { LoaderCircle, SquareTerminal } from "lucide-react"
+import { useMemo } from "react"
+import { LoaderCircle } from "lucide-react"
 
 import * as db from "@evals/db"
 
 import { formatCurrency, formatDuration, formatTokens } from "@/lib"
 import { useRunStatus } from "@/hooks/use-run-status"
-import {
-	Drawer,
-	DrawerContent,
-	DrawerHeader,
-	DrawerTitle,
-	ScrollArea,
-	Table,
-	TableBody,
-	TableCell,
-	TableHead,
-	TableHeader,
-	TableRow,
-} from "@/components/ui"
+import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
 
 import { TaskStatus } from "./task-status"
 import { ConnectionStatus } from "./connection-status"
 
+type TaskMetrics = Pick<db.TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">
+
 export function Run({ run }: { run: db.Run }) {
-	const { tasks, status, output, outputCounts } = useRunStatus(run)
-	const scrollAreaRef = useRef<HTMLDivElement>(null)
-	const [selectedTask, setSelectedTask] = useState<db.Task>()
+	const { tasks, status, tokenUsage, usageUpdatedAt } = useRunStatus(run)
+
+	const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
+		const metrics: Record<number, TaskMetrics> = {}
+
+		tasks?.forEach((task) => {
+			const usage = tokenUsage.get(task.id)
+
+			if (task.finishedAt && task.taskMetrics) {
+				metrics[task.id] = task.taskMetrics
+			} else if (usage) {
+				metrics[task.id] = {
+					tokensIn: usage.totalTokensIn,
+					tokensOut: usage.totalTokensOut,
+					tokensContext: usage.contextTokens,
+					duration: usage.duration ?? 0,
+					cost: usage.totalCost,
+				}
+			}
+		})
+
+		return metrics
+		// eslint-disable-next-line react-hooks/exhaustive-deps
+	}, [tasks, tokenUsage, usageUpdatedAt])
 
 	return (
 		<>
@@ -57,38 +68,33 @@ export function Run({ run }: { run: db.Run }) {
 								<TableRow key={task.id}>
 									<TableCell>
 										<div className="flex items-center gap-2">
-											<TaskStatus task={task} />
+											<TaskStatus
+												task={task}
+												running={!!task.startedAt || !!tokenUsage.get(task.id)}
+											/>
 											<div>
 												{task.language}/{task.exercise}
 											</div>
-											{(outputCounts[task.id] ?? 0) > 0 && (
-												<div
-													className="flex items-center gap-1 cursor-pointer"
-													onClick={() => setSelectedTask(task)}>
-													<SquareTerminal className="size-4" />
-													<div className="font-mono text-xs text-foreground/50">
-														{outputCounts[task.id]}
-													</div>
-												</div>
-											)}
 										</div>
 									</TableCell>
-									{task.taskMetrics ? (
+									{taskMetrics[task.id] ? (
 										<>
 											<TableCell className="font-mono text-xs">
 												<div className="flex items-center justify-evenly">
-													<div>{formatTokens(task.taskMetrics.tokensIn)}</div>/
-													<div>{formatTokens(task.taskMetrics.tokensOut)}</div>
+													<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
+													<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
 												</div>
 											</TableCell>
 											<TableCell className="font-mono text-xs">
-												{formatTokens(task.taskMetrics.tokensContext)}
+												{formatTokens(taskMetrics[task.id]!.tokensContext)}
 											</TableCell>
 											<TableCell className="font-mono text-xs">
-												{formatDuration(task.taskMetrics.duration)}
+												{taskMetrics[task.id]!.duration
+													? formatDuration(taskMetrics[task.id]!.duration)
+													: "-"}
 											</TableCell>
 											<TableCell className="font-mono text-xs">
-												{formatCurrency(task.taskMetrics.cost)}
+												{formatCurrency(taskMetrics[task.id]!.cost)}
 											</TableCell>
 										</>
 									) : (
@@ -100,27 +106,6 @@ export function Run({ run }: { run: db.Run }) {
 					</Table>
 				)}
 			</div>
-			<Drawer open={!!selectedTask} onOpenChange={() => setSelectedTask(undefined)}>
-				<DrawerContent>
-					<div className="mx-auto w-full max-w-2xl">
-						<DrawerHeader>
-							<DrawerTitle>
-								{selectedTask?.language}/{selectedTask?.exercise}
-							</DrawerTitle>
-						</DrawerHeader>
-						<div className="font-mono text-xs pb-12">
-							{selectedTask && (
-								<ScrollArea viewportRef={scrollAreaRef} className="h-96 rounded-sm border">
-									<div className="p-4">
-										<h4 className="mb-4 text-sm font-medium leading-none">Tags</h4>
-										{output.get(selectedTask.id)?.map((line, i) => <div key={i}>{line}</div>)}
-									</div>
-								</ScrollArea>
-							)}
-						</div>
-					</div>
-				</DrawerContent>
-			</Drawer>
 		</>
 	)
 }

+ 3 - 4
evals/apps/web/src/app/runs/[id]/task-status.tsx

@@ -4,16 +4,15 @@ import { type Task } from "@evals/db"
 
 type TaskStatusProps = {
 	task: Task
+	running: boolean
 }
 
-export const TaskStatus = ({ task }: TaskStatusProps) => {
+export const TaskStatus = ({ task, running }: TaskStatusProps) => {
 	return task.passed === false ? (
 		<CircleSlash className="size-4 text-destructive" />
 	) : task.passed === true ? (
 		<CircleCheck className="size-4 text-green-500" />
-	) : task.startedAt ? (
-		<LoaderCircle className="size-4 animate-spin" />
-	) : task.finishedAt ? (
+	) : running ? (
 		<LoaderCircle className="size-4 animate-spin" />
 	) : (
 		<CircleDashed className="size-4" />

+ 13 - 1
evals/apps/web/src/app/runs/new/new-run.tsx

@@ -86,13 +86,25 @@ export function NewRun() {
 	const onSubmit = useCallback(
 		async (values: FormValues) => {
 			try {
+				if (mode === "openrouter") {
+					const openRouterModel = models.data?.find(({ id }) => id === model)
+
+					if (!openRouterModel) {
+						throw new Error("Model not found.")
+					}
+
+					const openRouterModelId = openRouterModel.id
+					const openRouterModelInfo = openRouterModel.modelInfo
+					values.settings = { ...(values.settings || {}), openRouterModelId, openRouterModelInfo }
+				}
+
 				const { id } = await createRun(values)
 				router.push(`/runs/${id}`)
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[router],
+		[mode, model, models.data, router],
 	)
 
 	const onFilterModels = useCallback(

+ 1 - 0
evals/apps/web/src/hooks/use-process-tree.ts

@@ -7,4 +7,5 @@ export const useProcessList = (pid: number | null) =>
 		queryKey: ["process-tree", pid],
 		queryFn: () => (pid ? getProcessList(pid) : []),
 		enabled: !!pid,
+		refetchInterval: 30_000,
 	})

+ 19 - 23
evals/apps/web/src/hooks/use-run-status.ts

@@ -1,7 +1,7 @@
 import { useState, useCallback, useRef } from "react"
 import { useQuery, keepPreviousData } from "@tanstack/react-query"
 
-import { RooCodeEventName, taskEventSchema } from "@evals/types"
+import { RooCodeEventName, taskEventSchema, TokenUsage } from "@evals/types"
 import { Run } from "@evals/db"
 
 import { getTasks } from "@/lib/server/tasks"
@@ -9,14 +9,16 @@ import { useEventSource } from "@/hooks/use-event-source"
 
 export const useRunStatus = (run: Run) => {
 	const [tasksUpdatedAt, setTasksUpdatedAt] = useState<number>()
-	const outputRef = useRef<Map<number, string[]>>(new Map())
-	const [outputCounts, setOutputCounts] = useState<Record<number, number>>({})
+	const [usageUpdatedAt, setUsageUpdatedAt] = useState<number>()
+
+	const tokenUsage = useRef<Map<number, TokenUsage & { duration?: number }>>(new Map())
+	const startTimes = useRef<Map<number, number>>(new Map())
 
 	const { data: tasks } = useQuery({
 		queryKey: ["run", run.id, tasksUpdatedAt],
 		queryFn: async () => getTasks(run.id),
 		placeholderData: keepPreviousData,
-		refetchInterval: 10_000,
+		refetchInterval: 30_000,
 	})
 
 	const url = `/api/runs/${run.id}/stream`
@@ -47,28 +49,17 @@ export const useRunStatus = (run: Run) => {
 
 		switch (eventName) {
 			case RooCodeEventName.TaskStarted:
+				startTimes.current.set(taskId, Date.now())
+				break
 			case RooCodeEventName.TaskCompleted:
 			case RooCodeEventName.TaskAborted:
 				setTasksUpdatedAt(Date.now())
 				break
-			case RooCodeEventName.Message: {
-				const [
-					{
-						message: { text },
-					},
-				] = payload
-
-				if (text) {
-					outputRef.current.set(taskId, [...(outputRef.current.get(taskId) || []), text])
-					const outputCounts: Record<number, number> = {}
-
-					for (const [taskId, messages] of outputRef.current.entries()) {
-						outputCounts[taskId] = messages.length
-					}
-
-					setOutputCounts(outputCounts)
-				}
-
+			case RooCodeEventName.TaskTokenUsageUpdated: {
+				const startTime = startTimes.current.get(taskId)
+				const duration = startTime ? Date.now() - startTime : undefined
+				tokenUsage.current.set(taskId, { ...payload[1], duration })
+				setUsageUpdatedAt(Date.now())
 				break
 			}
 		}
@@ -76,5 +67,10 @@ export const useRunStatus = (run: Run) => {
 
 	const status = useEventSource({ url, onMessage })
 
-	return { tasks, status, output: outputRef.current, outputCounts }
+	return {
+		status,
+		tasks,
+		tokenUsage: tokenUsage.current,
+		usageUpdatedAt,
+	}
 }

+ 2 - 2
evals/packages/db/src/schema.ts

@@ -2,7 +2,7 @@ import { sqliteTable, text, real, integer, blob, uniqueIndex } from "drizzle-orm
 import { relations } from "drizzle-orm"
 import { createInsertSchema } from "drizzle-zod"
 
-import { GlobalSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
+import { GlobalSettings, RooCodeSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
 
 /**
  * runs
@@ -13,7 +13,7 @@ export const runs = sqliteTable("runs", {
 	taskMetricsId: integer({ mode: "number" }).references(() => taskMetrics.id),
 	model: text().notNull(),
 	description: text(),
-	settings: blob({ mode: "json" }).$type<GlobalSettings>(),
+	settings: blob({ mode: "json" }).$type<RooCodeSettings>(),
 	pid: integer({ mode: "number" }),
 	socketPath: text().notNull(),
 	concurrency: integer({ mode: "number" }).default(2).notNull(),

+ 1 - 1
evals/packages/ipc/src/client.ts

@@ -65,7 +65,7 @@ export class IpcClient extends EventEmitter<IpcClientEvents> {
 		const result = ipcMessageSchema.safeParse(data)
 
 		if (!result.success) {
-			this.log("[client#onMessage] invalid payload", data)
+			this.log("[client#onMessage] invalid payload", result.error, data)
 			return
 		}
 

+ 1 - 1
evals/packages/ipc/src/server.ts

@@ -83,7 +83,7 @@ export class IpcServer extends EventEmitter<IpcServerEvents> {
 		const result = ipcMessageSchema.safeParse(data)
 
 		if (!result.success) {
-			this.log("[server#onMessage] invalid payload", result.error)
+			this.log("[server#onMessage] invalid payload", result.error, data)
 			return
 		}
 

+ 11 - 21
evals/packages/types/src/roo-code-defaults.ts

@@ -2,25 +2,9 @@ import { RooCodeSettings } from "./roo-code.js"
 
 export const rooCodeDefaults: RooCodeSettings = {
 	apiProvider: "openrouter",
-	openRouterModelId: "google/gemini-2.0-flash-001", // "anthropic/claude-3.7-sonnet",
+	openRouterUseMiddleOutTransform: false,
 
-	// apiProvider: "openai",
-	// openAiBaseUrl: "http://hrudolph.duckdns.org:4269/api/v1",
-	// openAiApiKey: process.env.OPENAI_API_KEY,
-	// openAiModelId: "models/gemini-2.5-pro-exp-03-25",
-	// openAiCustomModelInfo: {
-	// 	maxTokens: 65536,
-	// 	contextWindow: 1000000,
-	// 	supportsImages: true,
-	// 	supportsPromptCache: false,
-	// 	inputPrice: 0,
-	// 	outputPrice: 0,
-	// 	description:
-	// 		"Gemini 2.5 Pro is Google’s state-of-the-art AI model designed for advanced reasoning, coding, mathematics, and scientific tasks. It employs “thinking” capabilities, enabling it to reason through responses with enhanced accuracy and nuanced context handling. Gemini 2.5 Pro achieves top-tier performance on multiple benchmarks, including first-place positioning on the LMArena leaderboard, reflecting superior human-preference alignment and complex problem-solving abilities.",
-	// 	thinking: false,
-	// },
-
-	modelTemperature: null,
+	// modelTemperature: null,
 	// reasoningEffort: "high",
 
 	pinnedApiConfigs: {},
@@ -60,12 +44,18 @@ export const rooCodeDefaults: RooCodeSettings = {
 	maxReadFileLine: 500,
 
 	terminalOutputLineLimit: 500,
-	terminalShellIntegrationTimeout: 15000,
+	terminalShellIntegrationTimeout: 30_000,
+	// terminalCommandDelay: 0,
+	// terminalPowershellCounter: false,
+	// terminalZshClearEolMark: true,
+	// terminalZshOhMy: true,
+	// terminalZshP10k: false,
+	// terminalZdotdir: true,
 
-	diffEnabled: true,
+	diffEnabled: false,
 	fuzzyMatchThreshold: 1.0,
 	experiments: {
-		search_and_replace: true,
+		search_and_replace: false,
 		insert_content: false,
 		powerSteering: false,
 	},

+ 14 - 0
evals/packages/types/src/roo-code.ts

@@ -396,6 +396,7 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	apiModelId: undefined,
 	apiKey: undefined,
 	anthropicBaseUrl: undefined,
+	anthropicUseAuthToken: undefined,
 	// Glama
 	glamaModelId: undefined,
 	glamaModelInfo: undefined,
@@ -523,6 +524,12 @@ export const globalSettingsSchema = z.object({
 
 	terminalOutputLineLimit: z.number().optional(),
 	terminalShellIntegrationTimeout: z.number().optional(),
+	terminalCommandDelay: z.number().optional(),
+	terminalPowershellCounter: z.boolean().optional(),
+	terminalZshClearEolMark: z.boolean().optional(),
+	terminalZshOhMy: z.boolean().optional(),
+	terminalZshP10k: z.boolean().optional(),
+	terminalZdotdir: z.boolean().optional(),
 
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
@@ -592,6 +599,12 @@ const globalSettingsRecord: GlobalSettingsRecord = {
 
 	terminalOutputLineLimit: undefined,
 	terminalShellIntegrationTimeout: undefined,
+	terminalCommandDelay: undefined,
+	terminalPowershellCounter: undefined,
+	terminalZshClearEolMark: undefined,
+	terminalZshOhMy: undefined,
+	terminalZshP10k: undefined,
+	terminalZdotdir: undefined,
 
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
@@ -731,6 +744,7 @@ export const clineSays = [
 	"new_task",
 	"checkpoint_saved",
 	"rooignore_error",
+	"diff_error",
 ] as const
 
 export const clineSaySchema = z.enum(clineSays)

+ 6 - 0
evals/pnpm-lock.yaml

@@ -62,6 +62,9 @@ importers:
       p-wait-for:
         specifier: ^5.0.2
         version: 5.0.2
+      ps-tree:
+        specifier: ^1.2.0
+        version: 1.2.0
     devDependencies:
       '@evals/eslint-config':
         specifier: workspace:^
@@ -69,6 +72,9 @@ importers:
       '@evals/typescript-config':
         specifier: workspace:^
         version: link:../../config/typescript
+      '@types/ps-tree':
+        specifier: ^1.1.6
+        version: 1.1.6
 
   apps/web:
     dependencies:

+ 22 - 11
evals/scripts/setup.sh

@@ -275,6 +275,25 @@ fi
 
 pnpm install --silent || exit 1
 
+if ! command -v code &>/dev/null; then
+  echo "⚠️ Visual Studio Code cli is not installed"
+  exit 1
+else
+  VSCODE_VERSION=$(code --version | head -n 1)
+  echo "✅ Visual Studio Code is installed ($VSCODE_VERSION)"
+fi
+
+# To reset VSCode:
+# rm -rvf ~/.vscode && rm -rvf ~/Library/Application\ Support/Code
+
+echo "🔌 Installing Visual Studio Code extensions..."
+code --install-extension golang.go &>/dev/null || exit 1
+code --install-extension dbaeumer.vscode-eslint&>/dev/null || exit 1
+code --install-extension redhat.java &>/dev/null || exit 1
+code --install-extension ms-python.python&>/dev/null || exit 1
+code --install-extension rust-lang.rust-analyzer &>/dev/null || exit 1
+code --install-extension rooveterinaryinc.roo-cline &>/dev/null || exit 1
+
 if [[ ! -d "../../evals" ]]; then
   if gh auth status &>/dev/null; then
     read -p "🔗 Would you like to be able to share eval results? (Y/n): " fork_evals
@@ -293,9 +312,9 @@ if [[ ! -s .env ]]; then
   cp .env.sample .env || exit 1
 fi
 
-echo "🗄️ Syncing database..."
-pnpm --filter @evals/db db:push || exit 1
-pnpm --filter @evals/db db:enable-wal || exit 1
+echo "🗄️ Syncing Roo Code evals database..."
+pnpm --filter @evals/db db:push &>/dev/null || exit 1
+pnpm --filter @evals/db db:enable-wal &>/dev/null || exit 1
 
 if ! grep -q "OPENROUTER_API_KEY" .env; then
   read -p "🔐 Enter your OpenRouter API key (sk-or-v1-...): " openrouter_api_key
@@ -304,14 +323,6 @@ if ! grep -q "OPENROUTER_API_KEY" .env; then
   echo "OPENROUTER_API_KEY=$openrouter_api_key" >> .env || exit 1
 fi
 
-if ! command -v code &>/dev/null; then
-  echo "⚠️ Visual Studio Code cli is not installed"
-  exit 1
-else
-  VSCODE_VERSION=$(code --version | head -n 1)
-  echo "✅ Visual Studio Code is installed ($VSCODE_VERSION)"
-fi
-
 if [[ ! -s "../bin/roo-code-latest.vsix" ]]; then
   build_extension
 else