Просмотр исходного кода

Improve Docker setup for evals (#4327)

Chris Estreich 6 месяцев назад
Родитель
Commit
cb5b9c3718
45 измененных файлов с 2101 добавлено и 891 удалено
  1. 77 4
      .dockerignore
  2. 4 4
      apps/web-evals/package.json
  3. 20 0
      apps/web-evals/scripts/check-services.sh
  4. 46 14
      apps/web-evals/src/app/api/runs/[id]/stream/route.ts
  5. 0 12
      apps/web-evals/src/app/api/runs/route.ts
  6. 0 12
      apps/web-evals/src/app/api/tasks/route.ts
  7. 8 27
      apps/web-evals/src/app/runs/[id]/connection-status.tsx
  8. 1 1
      apps/web-evals/src/app/runs/[id]/run.tsx
  9. 0 66
      apps/web-evals/src/app/runs/new/defaults.ts
  10. 10 64
      apps/web-evals/src/app/runs/new/new-run.tsx
  11. 55 11
      apps/web-evals/src/hooks/use-event-source.ts
  12. 0 11
      apps/web-evals/src/hooks/use-process-tree.ts
  13. 10 0
      apps/web-evals/src/hooks/use-runners.ts
  14. 3 1
      apps/web-evals/src/lib/schemas.ts
  15. 111 0
      apps/web-evals/src/lib/server/__tests__/sse-stream.spec.ts
  16. 0 55
      apps/web-evals/src/lib/server/processes.ts
  17. 13 0
      apps/web-evals/src/lib/server/redis.ts
  18. 8 0
      apps/web-evals/src/lib/server/runners.ts
  19. 30 12
      apps/web-evals/src/lib/server/runs.ts
  20. 26 3
      apps/web-evals/src/lib/server/sse-stream.ts
  21. 1 0
      apps/web-evals/tsconfig.json
  22. 7 0
      apps/web-evals/vitest.config.ts
  23. 7 0
      packages/evals/.docker/entrypoints/runner.sh
  24. 48 0
      packages/evals/.docker/entrypoints/web.sh
  25. 1 0
      packages/evals/.gitignore
  26. 282 0
      packages/evals/ARCHITECTURE.md
  27. 0 77
      packages/evals/Dockerfile
  28. 138 0
      packages/evals/Dockerfile.runner
  29. 62 0
      packages/evals/Dockerfile.web
  30. 45 13
      packages/evals/README.md
  31. 77 12
      packages/evals/docker-compose.yml
  32. 7 3
      packages/evals/package.json
  33. 99 64
      packages/evals/scripts/setup.sh
  34. 28 415
      packages/evals/src/cli/index.ts
  35. 56 0
      packages/evals/src/cli/processTask.ts
  36. 53 0
      packages/evals/src/cli/redis.ts
  37. 56 0
      packages/evals/src/cli/runEvals.ts
  38. 253 0
      packages/evals/src/cli/runTask.ts
  39. 84 0
      packages/evals/src/cli/runUnitTest.ts
  40. 16 0
      packages/evals/src/cli/utils.ts
  41. 9 0
      packages/evals/src/db/queries/runs.ts
  42. 6 2
      packages/evals/src/db/queries/tasks.ts
  43. 1 1
      packages/evals/src/db/schema.ts
  44. 73 1
      packages/types/src/global-settings.ts
  45. 270 6
      pnpm-lock.yaml

+ 77 - 4
.dockerignore

@@ -1,18 +1,91 @@
-# Build artifacts
+# git
+.git
+
+# build artifacts
 bin/
-!bin/roo-code-latest.vsix
 dist/
 **/dist/
 out/
 **/out/
+src/webview-ui/
 
-# Dependencies
+# dependencies
 node_modules/
 **/node_modules/
 
-# Test and development files
+# testing
 coverage/
 **/.vscode-test/
+**/mock/
 
+# devtools
 knip.json
 .husky/
+
+# monorepo
+.turbo/
+**/.turbo/
+
+# next.js
+**/.next/
+.vercel
+
+# Ignore common development files
+node_modules
+.git
+.gitignore
+.dockerignore
+.env*
+.vscode
+.idea
+
+# Ignore build artifacts
+dist
+build
+*.log
+*.tmp
+.cache
+coverage
+
+# Ignore OS files
+.DS_Store
+Thumbs.db
+
+# Ignore test files
+__tests__
+*.test.js
+*.spec.js
+*.test.ts
+*.spec.ts
+
+# Ignore development config files
+.eslintrc*
+.prettierrc*
+jest.config*
+
+# Ignore most directories except what we need for the build
+apps/
+evals/
+webview-ui/node_modules
+src/node_modules
+
+# Keep essential files for the build
+!README.md
+!CHANGELOG.md
+!package.json
+!pnpm-lock.yaml
+!pnpm-workspace.yaml
+!scripts/bootstrap.mjs
+!apps/web-evals/
+!src/
+!webview-ui/
+!packages/evals/.docker/entrypoints/runner.sh
+!packages/build/
+!packages/cloud/
+!packages/config-eslint/
+!packages/config-typescript/
+!packages/evals/
+!packages/ipc/
+!packages/telemetry/
+!packages/types/
+!locales/

+ 4 - 4
apps/web-evals/package.json

@@ -5,7 +5,7 @@
 	"scripts": {
 		"lint": "next lint",
 		"check-types": "tsc -b",
-		"dev": "next dev --turbopack",
+		"dev": "scripts/check-services.sh && next dev --turbopack",
 		"format": "prettier --write src",
 		"build": "next build",
 		"start": "next start"
@@ -25,7 +25,6 @@
 		"@radix-ui/react-tabs": "^1.1.3",
 		"@radix-ui/react-tooltip": "^1.1.8",
 		"@roo-code/evals": "workspace:^",
-		"@roo-code/ipc": "workspace:^",
 		"@roo-code/types": "workspace:^",
 		"@tanstack/react-query": "^5.69.0",
 		"class-variance-authority": "^0.7.1",
@@ -36,11 +35,11 @@
 		"next": "^15.2.5",
 		"next-themes": "^0.4.6",
 		"p-map": "^7.0.3",
-		"ps-tree": "^1.2.0",
 		"react": "^18.3.1",
 		"react-dom": "^18.3.1",
 		"react-hook-form": "^7.57.0",
 		"react-use": "^17.6.0",
+		"redis": "^5.5.5",
 		"sonner": "^2.0.5",
 		"tailwind-merge": "^3.3.0",
 		"tailwindcss-animate": "^1.0.7",
@@ -54,6 +53,7 @@
 		"@types/ps-tree": "^1.1.6",
 		"@types/react": "^18.3.23",
 		"@types/react-dom": "^18.3.5",
-		"tailwindcss": "^4"
+		"tailwindcss": "^4",
+		"vitest": "^3.2.1"
 	}
 }

+ 20 - 0
apps/web-evals/scripts/check-services.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+
+if ! docker info &> /dev/null; then
+  echo "❌ Docker is not running. Please start Docker Desktop and try again."
+  exit 1
+fi
+
+if ! nc -z localhost 5432 2>/dev/null; then
+  echo "❌ PostgreSQL is not running on port 5432"
+  echo "💡 Start it with: pnpm --filter @roo-code/evals db:start"
+  exit 1
+fi
+
+if ! nc -z localhost 6379 2>/dev/null; then
+  echo "❌ Redis is not running on port 6379"
+  echo "💡 Start it with: pnpm --filter @roo-code/evals redis:start"
+  exit 1
+fi
+
+echo "✅ All required services are running"

+ 46 - 14
apps/web-evals/src/app/api/runs/[id]/stream/route.ts

@@ -1,10 +1,10 @@
 import type { NextRequest } from "next/server"
 
+import { taskEventSchema } from "@roo-code/types"
 import { findRun } from "@roo-code/evals"
-import { IpcClient } from "@roo-code/ipc"
-import { IpcMessageType } from "@roo-code/types"
 
 import { SSEStream } from "@/lib/server/sse-stream"
+import { redisClient } from "@/lib/server/redis"
 
 export const dynamic = "force-dynamic"
 
@@ -13,26 +13,58 @@ export async function GET(request: NextRequest, { params }: { params: Promise<{
 	const requestId = crypto.randomUUID()
 	const stream = new SSEStream()
 	const run = await findRun(Number(id))
-	const client = new IpcClient(run.socketPath, () => {})
+	const redis = await redisClient()
 
-	const write = async (data: string | object) => {
-		// console.log(`[stream#${requestId}] write`, data)
-		const success = await stream.write(data)
+	let isStreamClosed = false
+	const channelName = `evals:${run.id}`
 
-		if (!success) {
-			client.disconnect()
+	const onMessage = async (data: string) => {
+		if (isStreamClosed || stream.isClosed) {
+			return
+		}
+
+		try {
+			const taskEvent = taskEventSchema.parse(JSON.parse(data))
+			// console.log(`[stream#${requestId}] task event -> ${taskEvent.eventName}`)
+			const writeSuccess = await stream.write(JSON.stringify(taskEvent))
+
+			if (!writeSuccess) {
+				await disconnect()
+			}
+		} catch (_error) {
+			console.error(`[stream#${requestId}] invalid task event:`, data)
+		}
+	}
+
+	const disconnect = async () => {
+		if (isStreamClosed) {
+			return
+		}
+
+		isStreamClosed = true
+
+		try {
+			await redis.unsubscribe(channelName)
+			console.log(`[stream#${requestId}] unsubscribed from ${channelName}`)
+		} catch (error) {
+			console.error(`[stream#${requestId}] error unsubscribing:`, error)
+		}
+
+		try {
+			await stream.close()
+		} catch (error) {
+			console.error(`[stream#${requestId}] error closing stream:`, error)
 		}
 	}
 
-	console.log(`[stream#${requestId}] connect`)
-	client.on(IpcMessageType.Connect, () => write("connect"))
-	client.on(IpcMessageType.Disconnect, () => write("disconnect"))
-	client.on(IpcMessageType.TaskEvent, write)
+	await redis.subscribe(channelName, onMessage)
 
 	request.signal.addEventListener("abort", () => {
 		console.log(`[stream#${requestId}] abort`)
-		client.disconnect()
-		stream.close().catch(() => {})
+
+		disconnect().catch((error) => {
+			console.error(`[stream#${requestId}] cleanup error:`, error)
+		})
 	})
 
 	return stream.getResponse()

+ 0 - 12
apps/web-evals/src/app/api/runs/route.ts

@@ -1,12 +0,0 @@
-import { NextResponse } from "next/server"
-
-import { createRun } from "@roo-code/evals"
-
-export async function POST(request: Request) {
-	try {
-		const run = await createRun(await request.json())
-		return NextResponse.json({ run }, { status: 201 })
-	} catch (error) {
-		return NextResponse.json({ error: (error as Error).message }, { status: 500 })
-	}
-}

+ 0 - 12
apps/web-evals/src/app/api/tasks/route.ts

@@ -1,12 +0,0 @@
-import { NextResponse } from "next/server"
-
-import { createTask } from "@roo-code/evals"
-
-export async function POST(request: Request) {
-	try {
-		const task = await createTask(await request.json())
-		return NextResponse.json({ task }, { status: 201 })
-	} catch (error) {
-		return NextResponse.json({ error: (error as Error).message }, { status: 500 })
-	}
-}

+ 8 - 27
apps/web-evals/src/app/runs/[id]/connection-status.tsx

@@ -1,29 +1,17 @@
 "use client"
 
-import { useCallback } from "react"
-import { Skull } from "lucide-react"
-
-import { killProcessTree } from "@/lib/server/processes"
-import { EventSourceStatus } from "@/hooks/use-event-source"
-import { useProcessList } from "@/hooks/use-process-tree"
+import type { EventSourceStatus } from "@/hooks/use-event-source"
+import { useRunners } from "@/hooks/use-runners"
 import { cn } from "@/lib/utils"
-import { Button } from "@/components/ui"
 
 type ConnectionStatusProps = {
 	status: EventSourceStatus
-	pid: number | null
+	runId: number
 }
 
 export const ConnectionStatus = (connectionStatus: ConnectionStatusProps) => {
-	const { data: pids, isLoading } = useProcessList(connectionStatus.pid)
-	const status = isLoading ? "loading" : pids === null ? "dead" : connectionStatus.status
-
-	const onKill = useCallback(async () => {
-		if (connectionStatus.pid) {
-			await killProcessTree(connectionStatus.pid)
-			window.location.reload()
-		}
-	}, [connectionStatus.pid])
+	const { data: runners, isLoading } = useRunners(connectionStatus.runId)
+	const status = isLoading ? "loading" : runners === null ? "dead" : connectionStatus.status
 
 	return (
 		<div>
@@ -52,16 +40,9 @@ export const ConnectionStatus = (connectionStatus: ConnectionStatusProps) => {
 				</div>
 			</div>
 			<div className="flex items-center gap-2">
-				<div>PIDs:</div>
-				<div className="font-mono text-sm">{connectionStatus.pid}</div>
-				{status === "connected" && (
-					<>
-						<div className="font-mono text-sm text-muted-foreground">{pids?.join(" ")}</div>
-						<Button variant="ghost" size="sm" onClick={onKill}>
-							Kill
-							<Skull />
-						</Button>
-					</>
+				<div>Runners:</div>
+				{runners && runners.length > 0 && (
+					<div className="font-mono text-sm text-muted-foreground">{runners?.join(", ")}</div>
 				)}
 			</div>
 		</div>

+ 1 - 1
apps/web-evals/src/app/runs/[id]/run.tsx

@@ -48,7 +48,7 @@ export function Run({ run }: { run: Run }) {
 						<div>{run.model}</div>
 						{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
 					</div>
-					{!run.taskMetricsId && <ConnectionStatus status={status} pid={run.pid} />}
+					{!run.taskMetricsId && <ConnectionStatus status={status} runId={run.id} />}
 				</div>
 				{!tasks ? (
 					<LoaderCircle className="size-4 animate-spin" />

+ 0 - 66
apps/web-evals/src/app/runs/new/defaults.ts

@@ -1,66 +0,0 @@
-import { RooCodeSettings } from "@roo-code/types"
-
-export const rooCodeDefaults: RooCodeSettings = {
-	apiProvider: "openrouter",
-	openRouterUseMiddleOutTransform: false,
-
-	lastShownAnnouncementId: "may-21-2025-3-18",
-
-	pinnedApiConfigs: {},
-
-	autoApprovalEnabled: true,
-	alwaysAllowReadOnly: true,
-	alwaysAllowReadOnlyOutsideWorkspace: false,
-	alwaysAllowWrite: true,
-	alwaysAllowWriteOutsideWorkspace: false,
-	writeDelayMs: 1000,
-	alwaysAllowBrowser: true,
-	alwaysApproveResubmit: true,
-	requestDelaySeconds: 10,
-	alwaysAllowMcp: true,
-	alwaysAllowModeSwitch: true,
-	alwaysAllowSubtasks: true,
-	alwaysAllowExecute: true,
-	allowedCommands: ["*"],
-
-	browserToolEnabled: false,
-	browserViewportSize: "900x600",
-	screenshotQuality: 75,
-	remoteBrowserEnabled: false,
-
-	ttsEnabled: false,
-	ttsSpeed: 1,
-	soundEnabled: false,
-	soundVolume: 0.5,
-
-	terminalOutputLineLimit: 500,
-	terminalShellIntegrationTimeout: 30000,
-	terminalCommandDelay: 0,
-	terminalPowershellCounter: false,
-	terminalZshOhMy: true,
-	terminalZshClearEolMark: true,
-	terminalZshP10k: false,
-	terminalZdotdir: true,
-	terminalCompressProgressBar: true,
-	terminalShellIntegrationDisabled: false,
-
-	diffEnabled: true,
-	fuzzyMatchThreshold: 1,
-
-	enableCheckpoints: false,
-
-	rateLimitSeconds: 0,
-	maxOpenTabsContext: 20,
-	maxWorkspaceFiles: 200,
-	showRooIgnoredFiles: true,
-	maxReadFileLine: -1, // -1 to enable full file reading.
-
-	language: "en",
-	telemetrySetting: "enabled",
-
-	mcpEnabled: false,
-
-	mode: "code",
-
-	customModes: [],
-}

+ 10 - 64
apps/web-evals/src/app/runs/new/new-run.tsx

@@ -9,12 +9,13 @@ import fuzzysort from "fuzzysort"
 import { toast } from "sonner"
 import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, Book, CircleCheck } from "lucide-react"
 
-import { globalSettingsSchema, providerSettingsSchema } from "@roo-code/types"
+import { globalSettingsSchema, providerSettingsSchema, EVALS_SETTINGS, getModelId } from "@roo-code/types"
 
 import { createRun } from "@/lib/server/runs"
 import {
 	createRunSchema as formSchema,
 	type CreateRun as FormValues,
+	MODEL_DEFAULT,
 	CONCURRENCY_MIN,
 	CONCURRENCY_MAX,
 	CONCURRENCY_DEFAULT,
@@ -51,26 +52,25 @@ import {
 	DialogFooter,
 } from "@/components/ui"
 
-import { rooCodeDefaults } from "./defaults"
 import { SettingsDiff } from "./settings-diff"
 
 export function NewRun() {
 	const router = useRouter()
 
 	const [mode, setMode] = useState<"openrouter" | "settings">("openrouter")
-
 	const [modelSearchValue, setModelSearchValue] = useState("")
 	const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
+
 	const modelSearchResultsRef = useRef<Map<string, number>>(new Map())
 	const modelSearchValueRef = useRef("")
-	const models = useOpenRouterModels()
 
+	const models = useOpenRouterModels()
 	const exercises = useExercises()
 
 	const form = useForm<FormValues>({
 		resolver: zodResolver(formSchema),
 		defaultValues: {
-			model: "",
+			model: MODEL_DEFAULT,
 			description: "",
 			suite: "full",
 			exercises: [],
@@ -96,14 +96,7 @@ export function NewRun() {
 		async (values: FormValues) => {
 			try {
 				if (mode === "openrouter") {
-					const openRouterModel = models.data?.find(({ id }) => id === model)
-
-					if (!openRouterModel) {
-						throw new Error("Model not found.")
-					}
-
-					const openRouterModelId = openRouterModel.id
-					values.settings = { ...(values.settings || {}), openRouterModelId }
+					values.settings = { ...(values.settings || {}), openRouterModelId: model }
 				}
 
 				const { id } = await createRun({ ...values, systemPrompt })
@@ -112,7 +105,7 @@ export function NewRun() {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[mode, model, models.data, router, systemPrompt],
+		[mode, model, router, systemPrompt],
 	)
 
 	const onFilterModels = useCallback(
@@ -167,55 +160,8 @@ export function NewRun() {
 
 				const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {}
 
-				const {
-					apiProvider,
-					apiModelId,
-					openRouterModelId,
-					glamaModelId,
-					requestyModelId,
-					unboundModelId,
-					ollamaModelId,
-					lmStudioModelId,
-					openAiModelId,
-				} = providerSettings
-
-				switch (apiProvider) {
-					case "anthropic":
-					case "bedrock":
-					case "deepseek":
-					case "gemini":
-					case "mistral":
-					case "openai-native":
-					case "xai":
-					case "vertex":
-						setValue("model", apiModelId ?? "")
-						break
-					case "openrouter":
-						setValue("model", openRouterModelId ?? "")
-						break
-					case "glama":
-						setValue("model", glamaModelId ?? "")
-						break
-					case "requesty":
-						setValue("model", requestyModelId ?? "")
-						break
-					case "unbound":
-						setValue("model", unboundModelId ?? "")
-						break
-					case "openai":
-						setValue("model", openAiModelId ?? "")
-						break
-					case "ollama":
-						setValue("model", ollamaModelId ?? "")
-						break
-					case "lmstudio":
-						setValue("model", lmStudioModelId ?? "")
-						break
-					default:
-						throw new Error(`Unsupported API provider: ${apiProvider}`)
-				}
-
-				setValue("settings", { ...rooCodeDefaults, ...providerSettings, ...globalSettings })
+				setValue("model", getModelId(providerSettings) ?? "")
+				setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings })
 				setMode("settings")
 
 				event.target.value = ""
@@ -316,7 +262,7 @@ export function NewRun() {
 												settings.
 											</div>
 										</div>
-										<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
+										<SettingsDiff defaultSettings={EVALS_SETTINGS} customSettings={settings} />
 									</>
 								</ScrollArea>
 							)}

+ 55 - 11
apps/web-evals/src/hooks/use-event-source.ts

@@ -14,44 +14,88 @@ export function useEventSource({ url, withCredentials, onMessage }: UseEventSour
 	const sourceRef = useRef<EventSource | null>(null)
 	const statusRef = useRef<EventSourceStatus>("waiting")
 	const [status, setStatus] = useState<EventSourceStatus>("waiting")
+	const reconnectTimeoutRef = useRef<NodeJS.Timeout | null>(null)
+	const isUnmountedRef = useRef(false)
 	const handleMessage = useCallback((event: MessageEvent) => onMessage(event), [onMessage])
 
+	const cleanup = useCallback(() => {
+		if (reconnectTimeoutRef.current) {
+			clearTimeout(reconnectTimeoutRef.current)
+			reconnectTimeoutRef.current = null
+		}
+
+		if (sourceRef.current) {
+			sourceRef.current.close()
+			sourceRef.current = null
+		}
+	}, [])
+
 	const createEventSource = useCallback(() => {
+		if (isUnmountedRef.current) {
+			return
+		}
+
+		cleanup()
+
+		statusRef.current = "waiting"
+		setStatus("waiting")
+
 		sourceRef.current = new EventSource(url, { withCredentials })
 
 		sourceRef.current.onopen = () => {
+			if (isUnmountedRef.current) {
+				return
+			}
+
 			statusRef.current = "connected"
 			setStatus("connected")
 		}
 
 		sourceRef.current.onmessage = (event) => {
+			if (isUnmountedRef.current) {
+				return
+			}
+
 			handleMessage(event)
 		}
 
 		sourceRef.current.onerror = () => {
+			if (isUnmountedRef.current) {
+				return
+			}
+
 			statusRef.current = "error"
 			setStatus("error")
-			// sourceRef.current?.close()
-			// sourceRef.current = null
+
+			// Clean up current connection.
+			cleanup()
+
+			// Attempt to reconnect after a delay.
+			reconnectTimeoutRef.current = setTimeout(() => {
+				if (!isUnmountedRef.current) {
+					createEventSource()
+				}
+			}, 1000)
 		}
-	}, [url, withCredentials, handleMessage])
+	}, [url, withCredentials, handleMessage, cleanup])
 
 	useEffect(() => {
+		isUnmountedRef.current = false
 		createEventSource()
 
-		setTimeout(() => {
-			if (statusRef.current === "waiting") {
-				sourceRef.current?.close()
-				sourceRef.current = null
+		// Initial connection timeout.
+		const initialTimeout = setTimeout(() => {
+			if (statusRef.current === "waiting" && !isUnmountedRef.current) {
 				createEventSource()
 			}
-		}, 100)
+		}, 5000)
 
 		return () => {
-			sourceRef.current?.close()
-			sourceRef.current = null
+			isUnmountedRef.current = true
+			clearTimeout(initialTimeout)
+			cleanup()
 		}
-	}, [createEventSource])
+	}, [createEventSource, cleanup])
 
 	return status
 }

+ 0 - 11
apps/web-evals/src/hooks/use-process-tree.ts

@@ -1,11 +0,0 @@
-import { useQuery } from "@tanstack/react-query"
-
-import { getProcessList } from "@/lib/server/processes"
-
-export const useProcessList = (pid: number | null) =>
-	useQuery({
-		queryKey: ["process-tree", pid],
-		queryFn: () => (pid ? getProcessList(pid) : []),
-		enabled: !!pid,
-		refetchInterval: 30_000,
-	})

+ 10 - 0
apps/web-evals/src/hooks/use-runners.ts

@@ -0,0 +1,10 @@
+import { useQuery } from "@tanstack/react-query"
+
+import { getRunners } from "@/lib/server/runners"
+
+export const useRunners = (runId: number) =>
+	useQuery({
+		queryKey: ["runners", runId],
+		queryFn: () => getRunners(runId),
+		refetchInterval: 10_000,
+	})

+ 3 - 1
apps/web-evals/src/lib/schemas.ts

@@ -6,9 +6,11 @@ import { rooCodeSettingsSchema } from "@roo-code/types"
  * CreateRun
  */
 
+export const MODEL_DEFAULT = "anthropic/claude-sonnet-4"
+
 export const CONCURRENCY_MIN = 1
 export const CONCURRENCY_MAX = 25
-export const CONCURRENCY_DEFAULT = 2
+export const CONCURRENCY_DEFAULT = 1
 
 export const createRunSchema = z
 	.object({

+ 111 - 0
apps/web-evals/src/lib/server/__tests__/sse-stream.spec.ts

@@ -0,0 +1,111 @@
+// npx vitest run src/lib/server/__tests__/sse-stream.spec.ts
+
+import { SSEStream } from "../sse-stream"
+
+describe("SSEStream", () => {
+	let stream: SSEStream
+
+	beforeEach(() => {
+		stream = new SSEStream()
+	})
+
+	it("should create a new SSEStream instance", () => {
+		expect(stream).toBeInstanceOf(SSEStream)
+		expect(stream.isClosed).toBe(false)
+	})
+
+	it("should write string data successfully when stream is open", async () => {
+		const response = stream.getResponse()
+		const reader = response.body?.getReader()
+
+		const writePromise = stream.write("test message")
+
+		if (reader) {
+			await reader.read()
+			reader.releaseLock()
+		}
+
+		const result = await writePromise
+		expect(result).toBe(true)
+		expect(stream.isClosed).toBe(false)
+	})
+
+	it("should write object data successfully when stream is open", async () => {
+		const testData = { message: "test", id: 123 }
+
+		const response = stream.getResponse()
+		const reader = response.body?.getReader()
+
+		const writePromise = stream.write(testData)
+
+		if (reader) {
+			await reader.read()
+			reader.releaseLock()
+		}
+
+		const result = await writePromise
+		expect(result).toBe(true)
+		expect(stream.isClosed).toBe(false)
+	})
+
+	it("should return false when writing to closed stream", async () => {
+		await stream.close()
+		expect(stream.isClosed).toBe(true)
+
+		const result = await stream.write("test message")
+		expect(result).toBe(false)
+	})
+
+	it("should handle multiple close calls gracefully", async () => {
+		await stream.close()
+		expect(stream.isClosed).toBe(true)
+
+		// Second close should not throw.
+		await expect(stream.close()).resolves.toBeUndefined()
+		expect(stream.isClosed).toBe(true)
+	})
+
+	it("should create response with correct headers", () => {
+		const response = stream.getResponse()
+		expect(response).toBeInstanceOf(Response)
+		expect(response.headers.get("Content-Type")).toBe("text/event-stream")
+		expect(response.headers.get("Connection")).toBe("keep-alive")
+		expect(response.headers.get("Cache-Control")).toBe("no-cache, no-transform")
+		expect(response.headers.get("Access-Control-Allow-Origin")).toBe("*")
+	})
+
+	it("should format data correctly for SSE", async () => {
+		const response = stream.getResponse()
+		const reader = response.body?.getReader()
+		const decoder = new TextDecoder()
+
+		const writePromise = stream.write("hello world")
+
+		if (reader) {
+			const { value } = await reader.read()
+			const text = decoder.decode(value)
+			expect(text).toBe("data: hello world\n\n")
+			reader.releaseLock()
+		}
+
+		await writePromise
+	})
+
+	it("should format JSON data correctly for SSE", async () => {
+		const response = stream.getResponse()
+		const reader = response.body?.getReader()
+		const decoder = new TextDecoder()
+
+		const testData = { type: "test", message: "hello" }
+		const writePromise = stream.write(testData)
+
+		if (reader) {
+			const { value } = await reader.read()
+			const text = decoder.decode(value)
+			expect(text).toBe(`data: ${JSON.stringify(testData)}\n\n`)
+			reader.releaseLock()
+		}
+
+		await writePromise
+	})
+})

+ 0 - 55
apps/web-evals/src/lib/server/processes.ts

@@ -1,55 +0,0 @@
-"use server"
-
-import psTree from "ps-tree"
-import { exec } from "child_process"
-
-const asyncExec = (command: string): Promise<{ stdout: string; stderr: string }> =>
-	new Promise((resolve, reject) => {
-		exec(command, (error, stdout, stderr) => {
-			if (error) {
-				reject(error)
-			} else {
-				resolve({ stdout, stderr })
-			}
-		})
-	})
-
-export const getProcessList = async (pid: number) => {
-	try {
-		await asyncExec(`ps -p ${pid} -o pid=`)
-
-		return new Promise<number[]>((resolve, reject) => {
-			psTree(pid, (err, children) => {
-				if (err) {
-					reject(err)
-				}
-
-				resolve(children.map((p) => parseInt(p.PID)))
-			})
-		})
-	} catch (_) {
-		return null
-	}
-}
-
-export const killProcessTree = async (pid: number) => {
-	const descendants = await getProcessList(pid)
-
-	if (descendants === null) {
-		return
-	}
-
-	if (descendants.length > 0) {
-		try {
-			await asyncExec(`kill -9 ${descendants.join(" ")}`)
-		} catch (error) {
-			console.error("Error killing descendant processes:", error)
-		}
-	}
-
-	try {
-		await asyncExec(`kill -9 ${pid}`)
-	} catch (error) {
-		console.error("Error killing main process:", error)
-	}
-}

+ 13 - 0
apps/web-evals/src/lib/server/redis.ts

@@ -0,0 +1,13 @@
+import { type RedisClientType, createClient } from "redis"
+
+let redis: RedisClientType | null = null
+
+export async function redisClient() {
+	if (!redis) {
+		redis = createClient({ url: process.env.REDIS_URL || "redis://localhost:6379" })
+		redis.on("error", (error) => console.error("Redis error:", error))
+		await redis.connect()
+	}
+
+	return redis
+}

+ 8 - 0
apps/web-evals/src/lib/server/runners.ts

@@ -0,0 +1,8 @@
+"use server"
+
+import { redisClient } from "./redis"
+
+export const getRunners = async (runId: number) => {
+	const redis = await redisClient()
+	return redis.sMembers(`runners:${runId}`)
+}

+ 30 - 12
apps/web-evals/src/lib/server/runs.ts

@@ -1,8 +1,6 @@
 "use server"
 
 import { spawn } from "child_process"
-import path from "path"
-import os from "os"
 import fs from "fs"
 
 import { revalidatePath } from "next/cache"
@@ -12,7 +10,6 @@ import {
 	type ExerciseLanguage,
 	exerciseLanguages,
 	createRun as _createRun,
-	updateRun as _updateRun,
 	deleteRun as _deleteRun,
 	createTask,
 } from "@roo-code/evals"
@@ -21,10 +18,11 @@ import { CreateRun } from "@/lib/schemas"
 
 import { getExercisesForLanguage } from "./exercises"
 
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
 export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
 	const run = await _createRun({
 		...values,
-		socketPath: path.join(os.tmpdir(), `roo-code-evals-${crypto.randomUUID()}.sock`),
+		socketPath: "", // TODO: Get rid of this.
 	})
 
 	if (suite === "partial") {
@@ -50,20 +48,40 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
 	revalidatePath("/runs")
 
 	try {
-		const logFile = fs.openSync(`/tmp/roo-code-evals-${run.id}.log`, "a")
+		const isRunningInDocker = fs.existsSync("/.dockerenv")
 
-		const env: NodeJS.ProcessEnv = systemPrompt
-			? { ...process.env, FOOTGUN_SYSTEM_PROMPT: systemPrompt }
-			: process.env
+		const dockerArgs = [
+			`--name evals-controller-${run.id}`,
+			"--rm",
+			"--network evals_default",
+			"-v /var/run/docker.sock:/var/run/docker.sock",
+			"-e HOST_EXECUTION_METHOD=docker",
+		]
 
-		const childProcess = spawn("pnpm", ["--filter", "@roo-code/evals", "cli", run.id.toString()], {
+		const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${run.id}`
+
+		const command = isRunningInDocker
+			? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"`
+			: cliCommand
+
+		console.log("spawn ->", command)
+
+		const childProcess = spawn("sh", ["-c", command], {
 			detached: true,
-			stdio: ["ignore", logFile, logFile],
-			env,
+			stdio: ["ignore", "pipe", "pipe"],
 		})
 
+		const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" })
+
+		if (childProcess.stdout) {
+			childProcess.stdout.pipe(logStream)
+		}
+
+		if (childProcess.stderr) {
+			childProcess.stderr.pipe(logStream)
+		}
+
 		childProcess.unref()
-		await _updateRun(run.id, { pid: childProcess.pid })
 	} catch (error) {
 		console.error(error)
 	}

+ 26 - 3
apps/web-evals/src/lib/server/sse-stream.ts

@@ -2,6 +2,7 @@ export class SSEStream {
 	private readonly _stream: TransformStream
 	private readonly _writer: WritableStreamDefaultWriter
 	private readonly _encoder: TextEncoder
+	private _isClosed: boolean = false
 
 	constructor() {
 		this._stream = new TransformStream()
@@ -9,20 +10,40 @@ export class SSEStream {
 		this._encoder = new TextEncoder()
 	}
 
-	public async write(data: string | object) {
+	public async write(data: string | object): Promise<boolean> {
+		if (this._isClosed) {
+			return false
+		}
+
 		try {
 			const buffer = typeof data === "object" ? JSON.stringify(data) : data
 			await this._writer.write(this._encoder.encode(`data: ${buffer}\n\n`))
 			return true
 		} catch (error) {
 			console.error("[SSEStream#write]", error)
+			this._isClosed = true
 			this.close().catch(() => {})
 			return false
 		}
 	}
 
-	public close() {
-		return this._writer.close()
+	public async close(): Promise<void> {
+		if (this._isClosed) {
+			return
+		}
+
+		this._isClosed = true
+
+		try {
+			await this._writer.close()
+		} catch (error) {
+			// Writer might already be closed, ignore the error.
+			console.debug("[SSEStream#close] Writer already closed:", error)
+		}
+	}
+
+	public get isClosed(): boolean {
+		return this._isClosed
 	}
 
 	public getResponse() {
@@ -31,6 +52,8 @@ export class SSEStream {
 				"Content-Type": "text/event-stream",
 				Connection: "keep-alive",
 				"Cache-Control": "no-cache, no-transform",
+				"Access-Control-Allow-Origin": "*",
+				"Access-Control-Allow-Headers": "Cache-Control",
 			},
 		})
 	}

+ 1 - 0
apps/web-evals/tsconfig.json

@@ -1,6 +1,7 @@
 {
 	"extends": "@roo-code/config-typescript/nextjs.json",
 	"compilerOptions": {
+		"types": ["vitest/globals"],
 		"plugins": [{ "name": "next" }],
 		"paths": { "@/*": ["./src/*"] }
 	},

+ 7 - 0
apps/web-evals/vitest.config.ts

@@ -0,0 +1,7 @@
+import { defineConfig } from "vitest/config"
+
+export default defineConfig({
+	test: {
+		globals: true,
+	},
+})

+ 7 - 0
packages/evals/.docker/entrypoints/runner.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+if [ $# -eq 0 ]; then
+    exec bash
+else
+    exec "$@"
+fi

+ 48 - 0
packages/evals/.docker/entrypoints/web.sh

@@ -0,0 +1,48 @@
+#!/bin/bash
+
+set -e
+
+echo "🚀 Starting evals web service..."
+
+wait_for_db() {
+    echo "⏳ Waiting for database..."
+
+    # postgresql://user:password@host:port/database
+    DB_HOST=$(echo $DATABASE_URL | sed -n 's/.*@\([^:]*\):.*/\1/p')
+    DB_PORT=$(echo $DATABASE_URL | sed -n 's/.*:\([0-9]*\)\/.*/\1/p')
+    DB_USER=$(echo $DATABASE_URL | sed -n 's/.*\/\/\([^:]*\):.*/\1/p')
+    DB_NAME=$(echo $DATABASE_URL | sed -n 's/.*\/\([^?]*\).*/\1/p')
+
+    DB_HOST=${DB_HOST:-db}
+    DB_PORT=${DB_PORT:-5432}
+    DB_USER=${DB_USER:-postgres}
+    DB_NAME=${DB_NAME:-evals_development}
+
+    until pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" > /dev/null 2>&1; do
+        echo "⏳ Database not ready yet, waiting 2 seconds..."
+        sleep 2
+    done
+    
+    echo "✅ Database is ready"
+}
+
+run_migrations() {
+    echo "🔄 Running database migrations..."
+
+    if pnpm --filter @roo-code/evals db:migrate; then
+        echo "✅ Database migrations completed successfully!"
+    else
+        echo "❌ Database migration failed!"
+        exit 1
+    fi
+}
+
+main() {
+    wait_for_db
+    run_migrations
+
+    echo "🌐 Starting web service..."
+    pnpm --filter @roo-code/web-evals start
+}
+
+main "$@"

+ 1 - 0
packages/evals/.gitignore

@@ -5,3 +5,4 @@
 # docker
 .docker/*
 !.docker/scripts
+!.docker/entrypoints

+ 282 - 0
packages/evals/ARCHITECTURE.md

@@ -0,0 +1,282 @@
+# Evals System Architecture
+
+## Overview
+
+The evals system is a distributed evaluation platform that runs AI coding tasks in isolated VS Code environments. It solves two critical problems in AI evaluation:
+
+1. **Dependency Management**: Eliminates the complexity of setting up multiple programming language environments by packaging everything into pre-configured containers
+2. **Resource Isolation**: Prevents memory exhaustion and state contamination by running each task in a fresh, isolated container instead of sequentially in a single VS Code instance
+
+The architecture consists of three main components: a Next.js web application for management, a controller container that orchestrates evaluation runs, and multiple runner containers that execute individual tasks.
+
+## Problems Solved
+
+### Simplified Setup and Deployment
+
+Traditional AI evaluation setups require complex dependency management across multiple programming languages, development tools, and VS Code extensions. The evals system eliminates this friction by:
+
+- **One-Command Deployment**: Single `docker compose up` command starts the entire evaluation infrastructure
+- **Pre-configured Environments**: Runner containers include all necessary language runtimes, tools, and VS Code extensions
+- **Dependency Isolation**: No host system contamination or version conflicts between different language requirements
+- **Reproducible Environments**: Identical evaluation conditions across different machines and deployments
+
+### Resource Management and Isolation
+
+Running multiple AI evaluation tasks sequentially in a single VS Code instance creates several problems:
+
+- **Memory Accumulation**: VS Code instances gradually consume more memory with each task, eventually leading to crashes
+- **State Contamination**: Previous tasks can leave behind files, settings, or processes that affect subsequent evaluations
+- **Resource Contention**: Multiple tasks competing for the same VS Code instance create bottlenecks and inconsistent performance
+- **Failure Propagation**: A single problematic task can crash the entire evaluation session
+
+The containerized approach solves these issues by:
+
+- **Fresh Environments**: Each task starts with a clean VS Code instance and workspace
+- **Memory Reset**: Container termination automatically reclaims all memory and resources
+- **Parallel Execution**: Multiple tasks can run simultaneously without interference
+- **Fault Isolation**: Individual task failures don't affect other running evaluations
+
+## Architecture Components
+
+```mermaid
+graph TB
+    Web[Admin Web App] <--> Redis[(Redis<br/>PubSub & Registration)]
+    Web <--> DB[(PostgreSQL<br/>Runs & Tasks)]
+    Web --> Controller[Run Controller / PQueue]
+
+    Controller <--> DB
+    Controller --> Runner1[Task Runner 1]
+    Controller --> Runner2[...]
+    Controller --> RunnerN[Task Runner N]
+
+    Runner1 <--> Redis
+    Runner2 <--> Redis
+    RunnerN <--> Redis
+
+    Redis <--> Web
+```
+
+### Core Components
+
+#### Next.js Web Application
+
+The web application serves as the primary interface for creating and monitoring evaluation runs. It provides:
+
+- **Run Management**: Create evaluation runs with configurable parameters (model, concurrency, exercise selection)
+- **Real-time Monitoring**: Live progress tracking via Server-Sent Events
+- **Results Dashboard**: View task completion status, metrics, and outcomes
+- **Container Orchestration**: Spawns controller containers for new runs
+
+#### Controller Container
+
+A specialized instance of the `evals-runner` container that acts as the run orchestrator. The controller:
+
+- **In-Memory Task Queue**: Uses the `p-queue` npm package to manage task distribution with configurable concurrency limits
+- **Git Workspace Setup**: Prepares exercise repositories and manages version control
+- **Runner Coordination**: Spawns and monitors individual task runner containers
+- **Heartbeat Monitoring**: Maintains Redis heartbeat to track controller health
+- **Result Aggregation**: Collects task results and finalizes run metrics
+
+#### Runner Containers
+
+Individual containers that execute single evaluation tasks. Each runner:
+
+- **Isolated Environment**: Fresh VS Code instance with pre-installed language tools and extensions
+- **Task Execution**: Runs AI agent with evaluation prompt in VS Code environment
+- **IPC Communication**: Connects to VS Code via Unix socket for real-time interaction
+- **Unit Testing**: Validates task completion using language-specific test suites
+- **Metrics Collection**: Tracks token usage, costs, tool usage, and execution time
+
+#### Supporting Infrastructure
+
+- **Redis**: Provides pub/sub messaging for real-time events and runner registration tracking (not used for task queuing)
+- **PostgreSQL**: Stores run configurations, task definitions, execution metrics, and results
+- **Docker**: Container orchestration for isolation and scalability
+
+## Execution Flow
+
+### 1. Run Initialization
+
+The web application creates an evaluation run with specified parameters:
+
+- **Suite Type**: Full evaluation (all exercises) or partial (selected exercises)
+- **Model Configuration**: AI model selection and settings
+- **Concurrency**: Number of parallel task executions (1-25)
+- **Exercise Selection**: Programming language and specific coding challenges
+
+### 2. Controller Deployment
+
+The web application spawns a controller container that:
+
+- **Loads Run Configuration**: Retrieves run parameters and associated tasks from database
+- **Prepares Workspace**: Sets up git repository with exercise code and test suites
+- **Establishes Monitoring**: Starts Redis heartbeat and event publishing
+- **Creates Task Queue**: Initializes concurrent task processing with specified limits
+
+### 3. Task Distribution
+
+The controller distributes tasks across runner containers using an in-memory queue:
+
+- **p-queue Management**: Uses the `p-queue` npm package to manage task concurrency in memory
+- **Container Spawning**: Creates isolated runner containers for each task
+- **Resource Management**: Enforces concurrency limits to prevent resource exhaustion
+- **Task Assignment**: Each runner receives a single task with full context
+- **Progress Tracking**: Monitors runner registration and task status via Redis pub/sub
+
+### 4. Task Execution
+
+Individual runners execute evaluation tasks:
+
+- **Environment Setup**: Launches VS Code with Roo extension in isolated container
+- **Prompt Delivery**: Sends evaluation prompt to AI agent via IPC
+- **Code Generation**: AI agent writes code using available tools and context
+- **Real-time Events**: Publishes progress updates, token usage, and completion status
+- **Validation**: Runs language-specific unit tests to verify task completion
+
+### 5. Result Collection
+
+The system aggregates and reports results:
+
+- **Event Streaming**: Real-time progress updates flow from runners through Redis to web interface
+- **Metrics Aggregation**: Controller collects execution metrics, costs, and success rates
+- **Run Completion**: Final results stored in database with comprehensive analytics
+- **Cleanup**: Containers terminated and resources released
+
+## Technical Implementation
+
+### CLI System
+
+The evaluation system is driven by a command-line interface that can operate in two modes:
+
+- **Run Mode**: Orchestrates complete evaluation runs with multiple tasks
+- **Task Mode**: Executes individual tasks within runner containers
+
+The CLI automatically detects its execution environment and adapts behavior accordingly, using containerized task execution when running within Docker.
+
+### Container Architecture
+
+Both controller and runner containers use the same base image but serve different purposes:
+
+#### Runner Container Features
+
+- **Multi-language Support**: Pre-installed runtimes for Go, Java, JavaScript, Python, and Rust
+- **Development Tools**: VS Code with language-specific extensions and Roo Code extension
+- **Containerization**: Docker CLI connected to the host's Docker socket, so a container can launch sibling containers for nested task execution
+- **Exercise Repository**: Git clone of evaluation exercises with test suites
+
+#### Container Isolation
+
+Each task executes in complete isolation with:
+
+- **Fresh VS Code Instance**: Clean environment with no shared state
+- **Dedicated Workspace**: Task-specific directory with relevant exercise files
+- **Resource Limits**: Controlled CPU and memory allocation
+- **Network Isolation**: Containers communicate only through Redis pub/sub
+
+### Communication Architecture
+
+The system uses multiple communication channels:
+
+#### IPC (Inter-Process Communication)
+
+- **Unix Sockets**: Direct communication between CLI and VS Code extension
+- **Event Streaming**: Real-time task progress and AI agent interactions
+- **Command Interface**: Task lifecycle management (start, cancel, close)
+
+#### Redis Pub/Sub
+
+- **Event Broadcasting**: Task events published to run-specific channels
+- **Runner Registration**: Active runner tracking per evaluation run
+- **Heartbeat Monitoring**: Controller health and availability status
+- **Not Used for Queuing**: Task queue management is handled in-memory by the controller using `p-queue`
+
+#### HTTP/SSE
+
+- **Web Interface**: REST API for run management and configuration
+- **Real-time Updates**: Server-Sent Events for live progress monitoring
+- **Result Retrieval**: Task metrics and completion status
+
+### Task Lifecycle Management
+
+Each evaluation task follows a structured lifecycle:
+
+1. **Initialization**: Container startup and VS Code launch
+2. **Connection**: IPC socket establishment and extension activation
+3. **Prompt Delivery**: Evaluation challenge sent to AI agent
+4. **Execution**: AI agent writes code using available tools
+5. **Validation**: Unit test execution to verify correctness
+6. **Cleanup**: Container termination and resource cleanup
+
+### Error Handling and Timeouts
+
+The system implements comprehensive error handling:
+
+- **Task Timeouts**: 30-minute maximum execution time per task
+- **Process Cleanup**: Automatic termination of hung processes
+- **Container Recovery**: Failed containers are cleaned up and resources released
+- **Graceful Degradation**: Individual task failures don't affect other tasks in the run
+
+### Metrics and Monitoring
+
+Comprehensive tracking of evaluation performance:
+
+- **Token Usage**: Input/output tokens and context size tracking
+- **Cost Analysis**: API costs per task and aggregated run costs
+- **Tool Usage**: Frequency and success rates of different AI tools
+- **Execution Time**: Task duration and queue wait times
+- **Success Rates**: Pass/fail statistics across languages and exercises
+
+## Configuration and Customization
+
+### Run Configuration
+
+Evaluation runs support extensive customization:
+
+- **Model Selection**: Choose from available AI models via OpenRouter integration
+- **Concurrency Control**: 1-25 parallel task executions based on resource availability
+- **Exercise Selection**: Full suite (all exercises) or partial (selected exercises)
+- **Custom Settings**: Override default AI agent configuration and behavior
+- **System Prompts**: Optional custom prompts for specialized evaluation scenarios
+
+### Exercise Management
+
+The system uses a separate Git repository containing:
+
+- **Language-specific Exercises**: Coding challenges organized by programming language
+- **Test Suites**: Automated validation for each exercise
+- **Prompt Templates**: Standardized evaluation instructions per language
+- **Workspace Configuration**: Language-specific development environment setup
+
+### Scalability Considerations
+
+The architecture supports horizontal scaling:
+
+- **Container Orchestration**: Multiple controller instances can run simultaneously
+- **Resource Management**: Configurable concurrency prevents resource exhaustion
+- **Database Optimization**: Efficient task querying and result storage
+- **Redis Clustering**: Pub/sub system can scale with message volume
+
+## Operational Characteristics
+
+### Performance
+
+- **Task Isolation**: Complete environment isolation prevents interference between tasks
+- **Parallel Execution**: Configurable concurrency maximizes resource utilization
+- **Efficient Communication**: Unix sockets and Redis provide low-latency messaging
+- **Resource Cleanup**: Automatic container termination prevents resource leaks
+
+### Reliability
+
+- **Fault Tolerance**: Individual task failures don't impact other tasks
+- **Timeout Management**: Prevents hung tasks from consuming resources indefinitely
+- **Health Monitoring**: Controller heartbeat and runner registration tracking
+- **Graceful Shutdown**: Proper cleanup of containers and database connections
+
+### Observability
+
+- **Real-time Monitoring**: Live progress tracking through web interface
+- **Comprehensive Logging**: Detailed execution logs for debugging and analysis
+- **Metrics Collection**: Performance and cost analytics for optimization
+- **Event Auditing**: Complete task lifecycle tracking for accountability
+
+This architecture provides a robust, scalable platform for evaluating AI coding capabilities across multiple programming languages while maintaining strict isolation and comprehensive monitoring.

+ 0 - 77
packages/evals/Dockerfile

@@ -1,77 +0,0 @@
-FROM node:20-slim AS base
- ENV PNPM_HOME="/pnpm"
- ENV PATH="$PNPM_HOME:$PATH"
-RUN corepack enable
-RUN npm install -g npm@latest
-RUN npm install -g npm-run-all
-# Install dependencies
-RUN apt update && apt install -y sudo curl git vim jq
-
-# Create a `vscode` user
-RUN useradd -m vscode -s /bin/bash && \
-  echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \
-  chmod 0440 /etc/sudoers.d/vscode
-# Install VS Code
-# https://code.visualstudio.com/docs/setup/linux
-RUN apt install -y wget gpg apt-transport-https
-RUN wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > packages.microsoft.gpg
-RUN install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg
-RUN echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null
-RUN rm -f packages.microsoft.gpg
-RUN apt update && apt install -y code
-# Install Xvfb
-RUN apt install -y xvfb
-# [cpp] Install cmake 3.28.3
-RUN apt install -y cmake
-# [go] Install Go 1.22.2
-RUN apt install -y golang-go
-# [java] Install Java 21
-RUN apt install -y default-jre
-# [python] Install Python 3.12.3 and uv 0.6.6
-RUN apt install -y python3 python3-venv python3-dev python3-pip
-# [rust] Install Rust 1.85
-RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
-RUN echo 'source $HOME/.cargo/env' >> $HOME/.bashrc
- WORKDIR /home/vscode
- USER vscode
-
- # Copy evals
- RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals
-
- # Prepare evals
- WORKDIR /home/vscode/evals/python
- RUN curl -LsSf https://astral.sh/uv/install.sh | sh
- RUN /home/vscode/.local/bin/uv sync
-
- WORKDIR /home/vscode/repo/benchmark
-
- # Install dependencies
- COPY --chown=vscode:vscode ./evals/package.json ./evals/pnpm-lock.yaml ./evals/pnpm-workspace.yaml ./evals/.npmrc ./
- RUN mkdir -p apps/cli apps/web \
-   config/eslint config/typescript \
-   packages/db packages/ipc packages/lib packages/types
- COPY --chown=vscode:vscode ./evals/apps/cli/package.json          ./apps/cli/
- COPY --chown=vscode:vscode ./evals/apps/web/package.json          ./apps/web/
- COPY --chown=vscode:vscode ./evals/config/eslint/package.json     ./config/eslint/
- COPY --chown=vscode:vscode ./evals/config/typescript/package.json ./config/typescript/
- COPY --chown=vscode:vscode ./evals/packages/db/package.json       ./packages/db/
- COPY --chown=vscode:vscode ./evals/packages/ipc/package.json      ./packages/ipc/
- COPY --chown=vscode:vscode ./evals/packages/lib/package.json      ./packages/lib/
- COPY --chown=vscode:vscode ./evals/packages/types/package.json    ./packages/types/
- RUN pnpm install
-
- # Copy & install extension
- COPY --chown=vscode:vscode ./bin/roo-code-latest.vsix ./
- RUN code --debug --install-extension ./roo-code-latest.vsix
-
- # Copy application code
- COPY --chown=vscode:vscode ./evals ./
-
- # Copy environment variables
- COPY --chown=vscode:vscode ./evals/.env ./
-
- # Push database schema
- RUN pnpm --filter @roo-code/evals db:push --force
-
- EXPOSE 3000
- CMD ["pnpm", "web"]

+ 138 - 0
packages/evals/Dockerfile.runner

@@ -0,0 +1,138 @@
+FROM node:20-slim AS base
+
+# Install pnpm
+ENV PNPM_HOME="/pnpm"
+ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack enable
+RUN npm install -g npm@latest npm-run-all
+
+# Install system packages
+RUN apt update && \
+  apt install -y \
+  curl \
+  git \
+  vim \
+  jq \
+  apt-transport-https \
+  ca-certificates \
+  gnupg \
+  lsb-release \
+  wget \
+  gpg \
+  xvfb \
+  cmake \
+  golang-go \
+  default-jre \
+  python3 \
+  python3-venv \
+  python3-dev \
+  python3-pip \
+  && rm -rf /var/lib/apt/lists/*
+
+# Install Docker cli
+RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
+  && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
+  && apt update && apt install -y docker-ce-cli \
+  && rm -rf /var/lib/apt/lists/*
+
+# Install VS Code
+RUN wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > packages.microsoft.gpg \
+  && install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg \
+  && echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null \
+  && rm -f packages.microsoft.gpg \
+  && apt update && apt install -y code \
+  && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /roo
+
+# Install rust
+ARG RUST_VERSION=1.87.0
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_VERSION} \
+  && echo 'source $HOME/.cargo/env' >> $HOME/.bashrc
+
+# Install VS Code extensions
+ARG GOLANG_EXT_VERSION=0.46.1
+ARG ESLINT_EXT_VERSION=3.0.10
+ARG JAVA_EXT_VERSION=1.42.0
+ARG PYTHON_EXT_VERSION=2025.6.1
+ARG RUST_EXT_VERSION=0.3.2482
+
+RUN mkdir -p /roo/.vscode-template \
+  && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension golang.go@${GOLANG_EXT_VERSION} \
+  && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension dbaeumer.vscode-eslint@${ESLINT_EXT_VERSION} \
+  && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension redhat.java@${JAVA_EXT_VERSION} \
+  && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension ms-python.python@${PYTHON_EXT_VERSION} \
+  && code --no-sandbox --user-data-dir /roo/.vscode-template --install-extension rust-lang.rust-analyzer@${RUST_EXT_VERSION}
+
+# Copy evals
+ARG EVALS_COMMIT=main
+ARG EVALS_REPO_URL=https://github.com/RooCodeInc/Roo-Code-Evals.git
+RUN git clone ${EVALS_REPO_URL} evals \
+  && cd evals \
+  && git checkout ${EVALS_COMMIT}
+
+# Install uv and sync python dependencies
+ARG UV_VERSION=0.7.11
+WORKDIR /roo/evals/python
+RUN curl -LsSf https://github.com/astral-sh/uv/releases/download/${UV_VERSION}/uv-installer.sh | sh \
+  && /root/.local/bin/uv sync
+
+WORKDIR /roo/repo
+
+# Install npm packages
+RUN mkdir -p \
+  scripts \
+  packages/build \
+  packages/cloud \
+  packages/config-eslint \
+  packages/config-typescript \
+  packages/evals \
+  packages/ipc \
+  packages/telemetry \
+  packages/types \
+  src \
+  webview-ui
+
+COPY ./package.json                            ./
+COPY ./pnpm-lock.yaml                          ./
+COPY ./pnpm-workspace.yaml                     ./
+COPY ./scripts/bootstrap.mjs                   ./scripts/
+COPY ./packages/build/package.json             ./packages/build/
+COPY ./packages/cloud/package.json             ./packages/cloud/
+COPY ./packages/config-eslint/package.json     ./packages/config-eslint/
+COPY ./packages/config-typescript/package.json ./packages/config-typescript/
+COPY ./packages/evals/package.json             ./packages/evals/
+COPY ./packages/ipc/package.json               ./packages/ipc/
+COPY ./packages/telemetry/package.json         ./packages/telemetry/
+COPY ./packages/types/package.json             ./packages/types/
+COPY ./src/package.json                        ./src/
+COPY ./webview-ui/package.json                 ./webview-ui/
+
+RUN pnpm install
+
+# Copy source code
+COPY . ./
+
+# Validate that .env.local exists and is not empty
+RUN if [ ! -f "packages/evals/.env.local" ] || [ ! -s "packages/evals/.env.local" ]; then \
+  echo "ERROR: packages/evals/.env.local is missing or empty. Please create it with your API keys before building."; \
+  exit 1; \
+fi
+
+# Copy ENV secrets
+COPY packages/evals/.env.local ./packages/evals/
+
+# Copy the pre-installed VS Code extensions
+RUN cp -r /roo/.vscode-template /roo/.vscode
+
+# Build the Roo Code extension
+RUN pnpm vsix -- --out ../bin/roo-code.vsix \
+    && code --no-sandbox --user-data-dir /roo/.vscode --install-extension bin/roo-code.vsix
+
+# Copy entrypoint script
+COPY packages/evals/.docker/entrypoints/runner.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+
+ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+ENV REDIS_URL=redis://redis:6379
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]

+ 62 - 0
packages/evals/Dockerfile.web

@@ -0,0 +1,62 @@
+FROM node:20-slim AS base
+
+# Install pnpm
+ENV PNPM_HOME="/pnpm"
+ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack enable
+RUN npm install -g npm@latest
+RUN npm install -g npm-run-all
+
+# Install system packages
+RUN apt update && apt install -y curl git vim jq postgresql-client
+
+# Install Docker cli
+RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release
+RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
+RUN echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
+RUN apt update && apt install -y docker-ce-cli
+
+WORKDIR /roo
+
+# Copy evals
+RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals
+
+WORKDIR /roo/repo
+
+# Install npm packages
+RUN mkdir -p \
+  scripts \
+  apps/web-evals \
+  packages/config-eslint \
+  packages/config-typescript \
+  packages/evals \
+  packages/ipc \
+  packages/types
+
+COPY ./package.json                            ./
+COPY ./pnpm-lock.yaml                          ./
+COPY ./pnpm-workspace.yaml                     ./
+COPY ./scripts/bootstrap.mjs                   ./scripts/
+COPY ./apps/web-evals/package.json             ./apps/web-evals/
+COPY ./packages/config-eslint/package.json     ./packages/config-eslint/
+COPY ./packages/config-typescript/package.json ./packages/config-typescript/
+COPY ./packages/evals/package.json             ./packages/evals/
+COPY ./packages/ipc/package.json               ./packages/ipc/
+COPY ./packages/types/package.json             ./packages/types/
+
+RUN pnpm install
+
+# Copy source code
+COPY . ./
+
+# Build the web-evals app
+RUN pnpm --filter @roo-code/web-evals build
+
+# Copy entrypoint script
+COPY packages/evals/.docker/entrypoints/web.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+
+ENV DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+ENV REDIS_URL=redis://redis:6379
+EXPOSE 3000
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]

+ 45 - 13
packages/evals/README.md

@@ -1,8 +1,12 @@
 # Run Roo Code Evals
 
-## Get Started
+### Prerequisites
 
-NOTE: This is MacOS only for now!
+- [Docker Desktop](https://docs.docker.com/desktop/)
+- [git](https://git-scm.com/)
+- That's it!
+
+### Setup
 
 Clone the Roo Code repo:
 
@@ -11,27 +15,55 @@ git clone https://github.com/RooCodeInc/Roo-Code.git
 cd Roo-Code
 ```
 
-Run the setup script:
+Add your OpenRouter API key:
 
 ```sh
-cd packages/evals
-./scripts/setup.sh
+echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local
 ```
 
-Navigate to [localhost:3000](http://localhost:3000/) in your browser.
-
-## Running Migrations
+### Run
 
-Update `src/schema.ts` as needed, and then run:
+Start the evals service:
 
 ```sh
-pnpm db:generate
+docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0
 ```
 
-Inspect the sql in the migration file added to `drizzle/`.
+The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on [localhost:3000](http://localhost:3000/):
+<img width="1182" alt="Screenshot 2025-06-05 at 12 05 38 PM" src="https://github.com/user-attachments/assets/34f25a59-1362-458c-aafa-25e13cdb2a7a" />
+
+Additionally, you'll find in Docker Desktop that database and redis services are running:
+<img width="1283" alt="Screenshot 2025-06-05 at 12 07 09 PM" src="https://github.com/user-attachments/assets/ad75d791-9cc7-41e3-8168-df7b21b49da2" />
+
+Navigate to [localhost:3000](http://localhost:3000/) in your browser and click the 🚀 button.
+
+By default, an evals run will run all programming exercises in the [Roo Code Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repository with the Claude Sonnet 4 model and default settings. For basic configuration you can specify the LLM to use and any subset of the exercises you'd like. For advanced configuration you can import a Roo Code settings file which will allow you to run the evals with Roo Code configured any way you'd like (this includes custom modes, a footgun prompt, etc).
+
+<img width="1053" alt="Screenshot 2025-06-05 at 12 08 06 PM" src="https://github.com/user-attachments/assets/2367eef4-6ae9-4ac2-8ee4-80f981046486" />
 
-If it looks okay, then run:
+After clicking "Launch" you should find that a "controller" container has spawned as well as `N` "task" containers where `N` is the value you chose for concurrency:
+<img width="1283" alt="Screenshot 2025-06-05 at 12 13 29 PM" src="https://github.com/user-attachments/assets/024413e2-c886-4272-ab59-909b4b114e7c" />
+
+The web app's UI should update in realtime with the results of the eval run:
+<img width="1053" alt="Screenshot 2025-06-05 at 12 14 52 PM" src="https://github.com/user-attachments/assets/6fe3b651-0898-4f14-a231-3cc8d66f0e1f" />
+
+## Advanced Usage / Debugging
+
+The evals system runs VS Code headlessly in Docker containers for consistent, reproducible environments. While this design ensures reliability, it can make debugging more challenging. For debugging purposes, you can run the system locally on macOS, though this approach is less reliable due to hardware and environment variability.
+
+To configure your macOS system to run evals locally, execute the setup script:
 
 ```sh
-pnpm db:migrate
+cd packages/evals && ./scripts/setup.sh
 ```
+
+The setup script does the following:
+
+- Installs development tools: Homebrew, asdf, GitHub CLI, pnpm
+- Installs programming languages: Node.js 20.19.2, Python 3.13.2, Go 1.24.2, Rust 1.85.1, Java 17
+- Sets up VS Code with required extensions
+- Configures Docker services (PostgreSQL, Redis)
+- Clones/updates the evals repository
+- Creates and migrates a Postgres database
+- Prompts for an OpenRouter API key to add to `.env.local`
+- Optionally builds and installs the Roo Code extension from source

+ 77 - 12
packages/evals/docker-compose.yml

@@ -1,13 +1,78 @@
+# Build the web and runner images:
+# docker compose build web runner
+#
+# Start all "server" services (db, redis, web):
+# docker compose --profile server up
+#
+# Start a shell in the runner container:
+# docker compose run --rm runner bash
+#
+# Or using the docker cli:
+# docker run -it --rm --network evals_default evals-runner bash
+#
+# To enable docker execution, run:
+# docker run -it --rm --network evals_default -v /var/run/docker.sock:/var/run/docker.sock -e HOST_EXECUTION_METHOD=docker evals-runner bash
+
 services:
-  postgres:
-    container_name: postgres-evals
-    image: postgres:15.4
-    ports:
-      - 5432:5432
-    volumes:
-      - ./.docker/postgres:/var/lib/postgresql/data
-      - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
-    environment:
-      - POSTGRES_USER=postgres
-      - POSTGRES_PASSWORD=password
-      - POSTGRES_DATABASES=evals_development,evals_test
+    db:
+        container_name: evals-db
+        image: postgres:15.4
+        ports:
+            - 5432:5432
+        volumes:
+            - ./.docker/postgres:/var/lib/postgresql/data
+            - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
+        environment:
+            - POSTGRES_USER=postgres
+            - POSTGRES_PASSWORD=password
+            - POSTGRES_DATABASES=evals_development,evals_test
+        healthcheck:
+            test: ["CMD-SHELL", "pg_isready -U postgres -d evals_development"]
+            interval: 5s
+            timeout: 5s
+            retries: 5
+            start_period: 30s
+        profiles:
+            - server
+
+    redis:
+        container_name: evals-redis
+        image: redis:7-alpine
+        ports:
+            - "6379:6379"
+        volumes:
+            - ./.docker/redis:/data
+        command: redis-server --appendonly yes
+        profiles:
+            - server
+
+    web:
+        container_name: evals-web
+        build:
+            context: ../../
+            dockerfile: packages/evals/Dockerfile.web
+        ports:
+            - "3000:3000"
+        environment:
+            - HOST_EXECUTION_METHOD=docker
+        volumes:
+            - /var/run/docker.sock:/var/run/docker.sock
+        depends_on:
+            db:
+                condition: service_healthy
+        profiles:
+            - server
+
+    runner:
+        container_name: evals-runner
+        build:
+            context: ../../
+            dockerfile: packages/evals/Dockerfile.runner
+        environment:
+            - HOST_EXECUTION_METHOD=docker
+        volumes:
+            - /var/run/docker.sock:/var/run/docker.sock
+        stdin_open: true
+        tty: true
+        profiles:
+            - runner

+ 7 - 3
packages/evals/package.json

@@ -12,14 +12,16 @@
 		"cli": "dotenvx run -f .env.development .env.local -- tsx src/cli/index.ts",
 		"drizzle-kit": "dotenvx run -f .env.development -- tsx node_modules/drizzle-kit/bin.cjs",
 		"drizzle-kit:test": "dotenvx run -f .env.test -- tsx node_modules/drizzle-kit/bin.cjs",
-		"db:start": "docker compose up -d",
-		"db:stop": "docker compose down",
 		"db:generate": "pnpm drizzle-kit generate",
 		"db:migrate": "pnpm drizzle-kit migrate",
 		"db:push": "pnpm drizzle-kit push",
 		"db:check": "pnpm drizzle-kit check",
 		"db:test:push": "pnpm drizzle-kit:test push",
-		"db:test:check": "pnpm drizzle-kit:test check"
+		"db:test:check": "pnpm drizzle-kit:test check",
+		"db:start": "docker compose up -d db",
+		"db:stop": "docker compose down db",
+		"redis:start": "docker compose up -d redis",
+		"redis:stop": "docker compose down redis"
 	},
 	"dependencies": {
 		"@roo-code/ipc": "workspace:^",
@@ -30,9 +32,11 @@
 		"execa": "^9.6.0",
 		"node-ipc": "^12.0.0",
 		"p-map": "^7.0.3",
+		"p-queue": "^8.1.0",
 		"p-wait-for": "^5.0.2",
 		"postgres": "^3.4.7",
 		"ps-tree": "^1.2.0",
+		"redis": "^5.5.5",
 		"zod": "^3.24.2"
 	},
 	"devDependencies": {

+ 99 - 64
packages/evals/scripts/setup.sh

@@ -1,21 +1,5 @@
 #!/bin/bash
 
-menu() {
-  echo -e "\n📋 Which eval types would you like to support?\n"
-
-  for i in ${!options[@]}; do
-    printf " %d) %-6s [%s]" $((i + 1)) "${options[i]}" "${choices[i]:- }"
-
-    if [[ $i == 0 ]]; then
-      printf " (required)"
-    fi
-
-    printf "\n"
-  done
-
-  echo -e " q) quit\n"
-}
-
 has_asdf_plugin() {
   local plugin="$1"
   case "$plugin" in
@@ -26,52 +10,106 @@ has_asdf_plugin() {
 
 build_extension() {
   echo "🔨 Building the Roo Code extension..."
-  cd ..
-  mkdir -p bin
-  pnpm build -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1
-  code --install-extension bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1
+  # The function no longer changes directory, so there is nothing to `cd` back into afterwards.
+  pnpm -w vsix -- --out ../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1
+  code --install-extension ../../bin/roo-code-$(git rev-parse --short HEAD).vsix || exit 1
-  cd evals
 }
 
-if [[ "$(uname -s)" != "Darwin" ]]; then
-  echo "⚠️ Only macOS is currently supported."
-  exit 1
-fi
-
-options=("nodejs" "python" "golang" "rust" "java")
-binaries=("node" "python" "go" "rustc" "javac")
-
-for i in "${!options[@]}"; do
-  choices[i]="*"
-done
+check_docker_services() {
+  echo "🐳 Checking Docker services..."
 
-prompt="Type 1-5 to select, 'q' to quit, ⏎ to continue: "
-
-while menu && read -rp "$prompt" num && [[ "$num" ]]; do
-  [[ "$num" == "q" ]] && exit 0
+  if ! command -v docker &> /dev/null; then
+    echo "❌ Docker is not installed. Please install Docker Desktop and try again."
+    exit 1
+  fi
 
-  [[ "$num" != *[![:digit:]]* ]] &&
-    ((num > 1 && num <= ${#options[@]})) ||
-    {
-      continue
-    }
+  if ! docker info &> /dev/null; then
+    echo "❌ Docker is not running. Please start Docker Desktop and try again."
+    exit 1
+  fi
 
-  ((num--))
-  [[ "${choices[num]}" ]] && choices[num]="" || choices[num]="*"
-done
+  if ! docker compose version &> /dev/null; then
+    echo "❌ Docker Compose is not available. Please ensure Docker Desktop is properly installed."
+    exit 1
+  fi
+  
+  local services_to_start=()
 
-empty=true
+  if ! nc -z localhost 5432 2>/dev/null; then
+    echo "📦 PostgreSQL not running on port 5432"
+    services_to_start+=("db")
+  else
+    echo "✅ PostgreSQL is running"
+  fi
 
-for i in ${!options[@]}; do
-  [[ "${choices[i]}" ]] && {
-    empty=false
-    break
-  }
-done
+  if ! nc -z localhost 6379 2>/dev/null; then
+    echo "📦 Redis not running on port 6379"
+    services_to_start+=("redis")
+  else
+    echo "✅ Redis is running"
+  fi
 
-[[ "$empty" == true ]] && exit 0
+  if [ ${#services_to_start[@]} -gt 0 ]; then
+    echo "🚀 Starting Docker services: ${services_to_start[*]}"
+
+    echo "🧹 Cleaning up stale Docker state..."
+    docker compose down --remove-orphans &>/dev/null || true
+    docker network prune -f &>/dev/null || true
+
+    if docker compose --profile server up -d "${services_to_start[@]}"; then
+      echo "✅ Docker services started successfully"
+
+      echo "⏳ Waiting for services to be ready..."
+      local timeout=30
+      local elapsed=0
+      local all_ready=false
+
+      while [ $elapsed -lt $timeout ]; do
+        all_ready=true
+
+        for service in "${services_to_start[@]}"; do
+          if [[ "$service" == "db" ]] && ! nc -z localhost 5432 2>/dev/null; then
+            all_ready=false
+            break
+          elif [[ "$service" == "redis" ]] && ! nc -z localhost 6379 2>/dev/null; then
+            all_ready=false
+            break
+          fi
+        done
+
+        if [ "$all_ready" = true ]; then
+          echo "✅ All services are ready"
+          break
+        fi
+
+        sleep 1
+        elapsed=$((elapsed + 1))
+
+        if [ $((elapsed % 5)) -eq 0 ]; then
+          echo "   Still waiting... (${elapsed}s/${timeout}s)"
+        fi
+      done
+
+      if [ "$all_ready" = false ]; then
+        echo "❌ Timeout: Services failed to start within ${timeout} seconds"
+        echo "   Please check Docker logs: docker compose logs"
+        exit 1
+      fi
+    else
+      echo "❌ Failed to start Docker services even after cleanup. Please check your docker-compose.yml file."
+      exit 1
+    fi
+  else
+    echo "✅ All required Docker services are already running"
+  fi
+}
 
-printf "\n"
+if [[ "$(uname -s)" != "Darwin" ]]; then
+  echo "⚠️ Only macOS is currently supported."
+  echo "The Roo Code evals system can also be run with Docker on any platform."
+  echo "See https://github.com/RooCodeInc/Roo-Code/blob/main/packages/evals/README.md for instructions."
+  exit 1
+fi
 
 if ! command -v brew &>/dev/null; then
   if [[ -f "/opt/homebrew/bin/brew" ]]; then
@@ -159,9 +197,10 @@ else
   echo "✅ gh is installed ($GH_VERSION)"
 fi
 
-for i in "${!options[@]}"; do
-  [[ "${choices[i]}" ]] || continue
+options=("nodejs" "python" "golang" "rust" "java")
+binaries=("node" "python" "go" "rustc" "javac")
 
+for i in "${!options[@]}"; do
   plugin="${options[$i]}"
   binary="${binaries[$i]}"
 
@@ -282,7 +321,6 @@ fi
 
 # To reset VSCode:
 # rm -rvf ~/.vscode && rm -rvf ~/Library/Application\ Support/Code
-
 echo -n "🔌 Installing Visual Studio Code extensions... "
 code --install-extension golang.go &>/dev/null || exit 1
 code --install-extension dbaeumer.vscode-eslint&>/dev/null || exit 1
@@ -296,20 +334,14 @@ fi
 
 echo "✅ Done"
 
-if [[ ! -d "../../evals" ]]; then
+if [[ ! -d "../../../evals" ]]; then
   echo -n "🔗 Cloning evals repository... "
-
-  if gh auth status &>/dev/null; then
-    gh repo clone cte/evals ../../evals || exit 1
-  else
-    git clone https://github.com/cte/evals.git ../../evals || exit 1
-  fi
-
+  # `git clone` takes <repository> [<directory>] only; clone straight into ../../../evals.
+  git clone https://github.com/RooCodeInc/Roo-Code-Evals.git ../../../evals || exit 1
   echo "✅ Done"
 else
   echo -n "🔄 Updating evals repository... "
 
-  (cd ../../evals && \
+  (cd ../../../evals && \
     git checkout -f &>/dev/null && \
     git clean -f -d &>/dev/null && \
     git checkout main &>/dev/null && \
@@ -322,6 +354,9 @@ if [[ ! -s .env.local ]]; then
   touch .env.local || exit 1
 fi
 
+# Check and start Docker services before database operations
+check_docker_services
+
 echo -n "🗄️ Syncing Roo Code evals database... "
 pnpm --filter @roo-code/evals db:push --force &>/dev/null || exit 1
 echo "✅ Done"

+ 28 - 415
packages/evals/src/cli/index.ts

@@ -1,432 +1,44 @@
 import * as fs from "fs"
-import * as path from "path"
 
-import pWaitFor from "p-wait-for"
-import { execa, parseCommandString } from "execa"
-import { command, run, number, positional } from "cmd-ts"
-import psTree from "ps-tree"
+import { command, run, number, option } from "cmd-ts"
 
-import { RooCodeEventName, IpcOrigin, IpcMessageType, TaskCommandName } from "@roo-code/types"
-import { IpcServer, IpcClient } from "@roo-code/ipc"
+import { exercisesPath } from "../exercises/index.js"
 
-import {
-	type Run,
-	type Task,
-	findRun,
-	finishRun,
-	getTasks,
-	updateTask,
-	createTaskMetrics,
-	updateTaskMetrics,
-	createToolError,
-} from "../db/index.js"
-import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js"
+import { runEvals } from "./runEvals.js"
+import { processTask } from "./processTask.js"
 
-type TaskResult = { success: boolean }
-type TaskPromise = Promise<TaskResult>
-
-const TASK_START_DELAY = 10 * 1_000
-const TASK_TIMEOUT = 5 * 60 * 1_000
-const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
-
-const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
-	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
-	java: { commands: ["./gradlew test"] }, // timeout --foreground 15s bash -c "cd '$dir' && ./gradlew test > /dev/null 2>&1"
-	javascript: { commands: ["pnpm install", "pnpm test"] }, // timeout 15s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
-	python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, // timeout 15s bash -c "cd '$dir' && uv run python3 -m pytest -o markers=task *_test.py"
-	rust: { commands: ["cargo test"] }, // timeout 15s bash -c "cd '$dir' && cargo test > /dev/null 2>&1"
-}
-
-const runEvals = async (id: number) => {
-	const run = await findRun(id)
-	const tasks = await getTasks(run.id)
-
-	if (!tasks[0]) {
-		throw new Error("No tasks found.")
-	}
-
-	await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
-	await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`
-	await execa({ cwd: exercisesPath })`git checkout -f`
-	await execa({ cwd: exercisesPath })`git clean -fd`
-	await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
-
-	const server = new IpcServer(run.socketPath, () => {})
-	server.listen()
-
-	const runningPromises: TaskPromise[] = []
-
-	const processTask = async (task: Task, delay = 0) => {
-		if (task.finishedAt === null) {
-			await new Promise((resolve) => setTimeout(resolve, delay))
-			await runExercise({ run, task, server })
-		}
-
-		if (task.passed === null) {
-			const passed = await runUnitTest({ task })
-			await updateTask(task.id, { passed })
-
-			server.broadcast({
-				type: IpcMessageType.TaskEvent,
-				origin: IpcOrigin.Server,
-				data: { eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail, taskId: task.id },
-			})
-
-			return { success: passed }
-		} else {
-			return { success: task.passed }
-		}
-	}
-
-	const processTaskResult = async (task: Task, promise: TaskPromise) => {
-		const index = runningPromises.indexOf(promise)
-
-		if (index > -1) {
-			runningPromises.splice(index, 1)
-		}
-	}
-
-	let delay = TASK_START_DELAY
-
-	for (const task of tasks) {
-		const promise = processTask(task, delay)
-		delay = delay + TASK_START_DELAY
-		runningPromises.push(promise)
-		promise.then(() => processTaskResult(task, promise))
-
-		if (runningPromises.length >= run.concurrency) {
-			delay = 0
-			await Promise.race(runningPromises)
-		}
-	}
-
-	await Promise.all(runningPromises)
-
-	const result = await finishRun(run.id)
-	console.log(`${Date.now()} [cli#run]`, result)
-
-	await execa({ cwd: exercisesPath })`git add .`
-	await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
-}
-
-const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
-	const { language, exercise } = task
-	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
-	const dirname = path.dirname(run.socketPath)
-	const workspacePath = path.resolve(exercisesPath, language, exercise)
-	const taskSocketPath = path.resolve(dirname, `${dirname}/task-${task.id}.sock`)
-
-	// Inject foot gun system prompt if present
-	if (process.env.FOOTGUN_SYSTEM_PROMPT) {
-		const rooDir = path.join(workspacePath, ".roo")
-		if (!fs.existsSync(rooDir)) {
-			fs.mkdirSync(rooDir, { recursive: true })
-		}
-		fs.writeFileSync(path.join(rooDir, "system-prompt-code"), process.env.FOOTGUN_SYSTEM_PROMPT)
-	}
-
-	// If debugging:
-	// Use --wait --log trace or --verbose.
-	// Don't await execa and store result as subprocess.
-	// subprocess.stdout.pipe(process.stdout)
-
-	console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)
-
-	const controller = new AbortController()
-	const cancelSignal = controller.signal
-
-	// If debugging:
-	// Use --wait --log trace or --verbose.
-	let codeCommand = `code --disable-workspace-trust`
-	const isDocker = fs.existsSync("/.dockerenv")
-
-	if (isDocker) {
-		if (run.concurrency > 1) {
-			throw new Error("Cannot run multiple tasks in parallel in Docker. Please set concurrency to 1.")
-		}
-		codeCommand = `xvfb-run --auto-servernum --server-num=1 ${codeCommand} --wait --log trace --disable-gpu --password-store="basic"`
-	}
-
-	const subprocess = execa({
-		env: {
-			ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
-		},
-		shell: "/bin/bash",
-		cancelSignal,
-	})`${codeCommand} -n ${workspacePath}`
-
-	// If debugging:
-	// subprocess.stdout.pipe(process.stdout)
-
-	// Give VSCode some time to spawn before connecting to its unix socket.
-	await new Promise((resolve) => setTimeout(resolve, 3_000))
-	console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
-	const client = new IpcClient(taskSocketPath)
-
-	try {
-		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
-		// eslint-disable-next-line @typescript-eslint/no-unused-vars
-	} catch (error) {
-		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
-		client.disconnect()
-		return { success: false }
-	}
-
-	let taskStartedAt = Date.now()
-	let taskFinishedAt: number | undefined
-	let taskMetricsId: number | undefined
-	let rooTaskId: string | undefined
-	let isClientDisconnected = false
-
-	const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = {
-		broadcast: [RooCodeEventName.Message],
-		log: [RooCodeEventName.Message, RooCodeEventName.TaskTokenUsageUpdated, RooCodeEventName.TaskAskResponded],
-	}
-
-	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
-		const { eventName, payload } = taskEvent
-
-		if (!ignoreEvents.broadcast.includes(eventName)) {
-			server.broadcast({
-				type: IpcMessageType.TaskEvent,
-				origin: IpcOrigin.Server,
-				relayClientId: client.clientId!,
-				data: { ...taskEvent, taskId: task.id },
-			})
-		}
-
-		if (!ignoreEvents.log.includes(eventName)) {
-			console.log(
-				`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
-				payload,
-			)
-		}
-
-		if (eventName === RooCodeEventName.TaskStarted) {
-			taskStartedAt = Date.now()
-
-			const taskMetrics = await createTaskMetrics({
-				cost: 0,
-				tokensIn: 0,
-				tokensOut: 0,
-				tokensContext: 0,
-				duration: 0,
-				cacheWrites: 0,
-				cacheReads: 0,
-			})
-
-			await updateTask(task.id, { taskMetricsId: taskMetrics.id, startedAt: new Date() })
-
-			taskStartedAt = Date.now()
-			taskMetricsId = taskMetrics.id
-			rooTaskId = payload[0]
-		}
-
-		if (eventName === RooCodeEventName.TaskToolFailed) {
-			const [_taskId, toolName, error] = payload
-			await createToolError({ taskId: task.id, toolName, error })
-		}
-
-		if (
-			(eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) &&
-			taskMetricsId
-		) {
-			const duration = Date.now() - taskStartedAt
-
-			const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } =
-				payload[1]
-
-			await updateTaskMetrics(taskMetricsId, {
-				cost: totalCost,
-				tokensIn: totalTokensIn,
-				tokensOut: totalTokensOut,
-				tokensContext: contextTokens,
-				duration,
-				cacheWrites: totalCacheWrites ?? 0,
-				cacheReads: totalCacheReads ?? 0,
-			})
-		}
-
-		if (eventName === RooCodeEventName.TaskCompleted && taskMetricsId) {
-			const toolUsage = payload[2]
-			await updateTaskMetrics(taskMetricsId, { toolUsage })
-		}
-
-		if (eventName === RooCodeEventName.TaskAborted || eventName === RooCodeEventName.TaskCompleted) {
-			taskFinishedAt = Date.now()
-			await updateTask(task.id, { finishedAt: new Date() })
-		}
-	})
-
-	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
-		isClientDisconnected = true
-	})
-
-	console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)
-
-	if (client.isReady) {
-		client.sendMessage({
-			type: IpcMessageType.TaskCommand,
-			origin: IpcOrigin.Client,
-			clientId: client.clientId!,
-			data: {
-				commandName: TaskCommandName.StartNewTask,
-				data: {
-					configuration: {
-						openRouterApiKey: process.env.OPENROUTER_API_KEY!,
-						...run.settings,
-					},
-					text: prompt,
-					newTab: true,
-				},
+const main = async () => {
+	const result = await run(
+		command({
+			name: "cli",
+			description: "Execute an eval run.",
+			version: "0.0.0",
+			args: {
+				runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }),
+				taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }),
 			},
-		})
-	} else {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
-		client.disconnect()
-		taskFinishedAt = Date.now()
-		isClientDisconnected = true
-	}
-
-	try {
-		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
-		// eslint-disable-next-line @typescript-eslint/no-unused-vars
-	} catch (error) {
-		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)
-
-		// Cancel the task.
-		if (rooTaskId && !isClientDisconnected) {
-			client.sendMessage({
-				type: IpcMessageType.TaskCommand,
-				origin: IpcOrigin.Client,
-				clientId: client.clientId!,
-				data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
-			})
-
-			// Allow some time for the task to cancel.
-			await new Promise((resolve) => setTimeout(resolve, 5_000))
-		}
-
-		await updateTask(task.id, { finishedAt: new Date() })
-	}
-
-	if (!isClientDisconnected) {
-		if (rooTaskId) {
-			client.sendMessage({
-				type: IpcMessageType.TaskCommand,
-				origin: IpcOrigin.Client,
-				clientId: client.clientId!,
-				data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
-			})
-
-			// Allow some time for the window to close.
-			await new Promise((resolve) => setTimeout(resolve, 2_000))
-		}
-
-		client.disconnect()
-	}
-
-	controller.abort()
-	await subprocess
-
-	return { success: !!taskFinishedAt }
-}
-
-const runUnitTest = async ({ task }: { task: Task }) => {
-	const cmd = testCommands[task.language]
-	const exercisePath = path.resolve(exercisesPath, task.language, task.exercise)
-	const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath
-	const commands = cmd.commands.map((cs) => parseCommandString(cs))
-
-	let passed = true
-
-	for (const command of commands) {
-		try {
-			console.log(
-				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
-			)
+			handler: async (args) => {
+				const { runId, taskId } = args
 
-			const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
-
-			const timeout = setTimeout(async () => {
-				const descendants = await new Promise<number[]>((resolve, reject) => {
-					psTree(subprocess.pid!, (err, children) => {
-						if (err) {
-							reject(err)
-						}
-
-						resolve(children.map((p) => parseInt(p.PID)))
-					})
-				})
-
-				console.log(
-					`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}": unit tests timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`,
-				)
-
-				if (descendants.length > 0) {
-					for (const descendant of descendants) {
-						try {
-							console.log(
-								`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${descendant}`,
-							)
-
-							await execa`kill -9 ${descendant}`
-						} catch (error) {
-							console.error(
-								`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] Error killing descendant processes:`,
-								error,
-							)
-						}
-					}
+				if (runId === -1 && taskId === -1) {
+					throw new Error("Either runId or taskId must be provided.")
 				}
 
-				console.log(
-					`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${subprocess.pid}`,
-				)
+				if (runId !== -1 && taskId !== -1) {
+					throw new Error("Only one of runId or taskId must be provided.")
+				}
 
 				try {
-					await execa`kill -9 ${subprocess.pid!}`
+					if (runId !== -1) {
+						await runEvals(runId)
+					} else {
+						await processTask(taskId)
+					}
 				} catch (error) {
-					console.error(
-						`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] Error killing process:`,
-						error,
-					)
+					console.error(error)
+					process.exit(1)
 				}
-			}, UNIT_TEST_TIMEOUT)
-
-			const result = await subprocess
-
-			console.log(
-				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
-			)
-
-			clearTimeout(timeout)
-
-			if (result.failed) {
-				passed = false
-				break
-			}
-		} catch (error) {
-			console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
-			passed = false
-			break
-		}
-	}
-
-	return passed
-}
-
-const main = async () => {
-	const result = await run(
-		command({
-			name: "cli",
-			description: "Execute an eval run.",
-			version: "0.0.0",
-			args: {
-				runId: positional({ type: number, displayName: "runId" }),
 			},
-			handler: (args) => runEvals(args.runId),
 		}),
 		process.argv.slice(2),
 	)
@@ -439,6 +51,7 @@ if (!fs.existsSync(exercisesPath)) {
 	console.error(
 		`Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
 	)
+
 	process.exit(1)
 }
 

+ 56 - 0
packages/evals/src/cli/processTask.ts

@@ -0,0 +1,56 @@
+import { RooCodeEventName, type TaskEvent } from "@roo-code/types"
+
+import { findTask, updateTask, findRun } from "../db/index.js"
+
+import { getTag } from "./utils.js"
+import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
+import { runTask } from "./runTask.js"
+import { runUnitTest } from "./runUnitTest.js"
+import { execa } from "execa"
+
+/**
+ * Runs one task end to end: registers this process as a runner for the task's
+ * run, executes the task, runs its unit tests, stores the pass/fail result and
+ * publishes the outcome on the run's pub/sub channel. The runner registration
+ * is always removed again, even when a step throws.
+ */
+export const processTask = async (taskId: number) => {
+	const task = await findTask(taskId)
+	const run = await findRun(task.runId)
+	const runner = { runId: run.id, taskId }
+
+	await registerRunner(runner)
+
+	try {
+		const tag = getTag("processTask", { run, task })
+		const label = `task ${task.id} (${task.language}/${task.exercise})`
+
+		// Serializes a task event and pushes it to the channel for this run.
+		const publish = async (e: TaskEvent) => {
+			const client = await redisClient()
+			const channel = getPubSubKey(run.id)
+			await client.publish(channel, JSON.stringify(e))
+		}
+
+		console.log(`[${Date.now()} | ${tag}] running ${label}...`)
+		await runTask({ run, task, publish })
+
+		console.log(`[${Date.now()} | ${tag}] testing ${label}...`)
+		const passed = await runUnitTest({ task })
+
+		console.log(`[${Date.now()} | ${tag}] ${label} -> ${passed}`)
+		await updateTask(task.id, { passed })
+
+		await publish({
+			eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail,
+			taskId: task.id,
+		})
+	} finally {
+		await deregisterRunner(runner)
+	}
+}
+
+/**
+ * Runs a single task inside a throwaway `evals-runner` container by invoking
+ * the evals CLI with `--taskId`. Resolves when `docker run` exits successfully
+ * and rejects (via execa) when it fails.
+ *
+ * @param taskId - Id of the task to process inside the container.
+ */
+export const processTaskInContainer = async (taskId: number) => {
+	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
+
+	// Pass the arguments as an array instead of a single shell string: no host
+	// shell is spawned and no nested quoting is needed around `command`.
+	const subprocess = execa("docker", [
+		"run",
+		"--name",
+		`evals-task-${taskId}`,
+		"--rm",
+		"--network",
+		"evals_default",
+		"-v",
+		"/var/run/docker.sock:/var/run/docker.sock",
+		"-e",
+		"HOST_EXECUTION_METHOD=docker",
+		"evals-runner",
+		"sh",
+		"-c",
+		command,
+	])
+
+	// If debugging:
+	// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
+	// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
+	await subprocess
+}

+ 53 - 0
packages/evals/src/cli/redis.ts

@@ -0,0 +1,53 @@
+import { createClient, type RedisClientType } from "redis"
+
+// Connected client, set only after `connect()` has resolved.
+let redis: RedisClientType | undefined
+// In-flight connection attempt, shared by callers that arrive while connecting.
+let redisConnection: Promise<RedisClientType> | undefined
+
+/**
+ * Returns the shared Redis client, connecting on first use.
+ *
+ * The connection promise is cached rather than the bare client, so concurrent
+ * callers never receive a client whose `connect()` has not finished yet. A
+ * failed attempt is not cached: the next call starts a fresh connection.
+ *
+ * @returns The connected client (URL from `REDIS_URL`, default `redis://localhost:6379`).
+ */
+export const redisClient = async () => {
+	if (redis) {
+		return redis
+	}
+
+	if (!redisConnection) {
+		const client: RedisClientType = createClient({ url: process.env.REDIS_URL || "redis://localhost:6379" })
+		client.on("error", (error) => console.error("redis error:", error))
+
+		redisConnection = client.connect().then(
+			() => {
+				redis = client
+				return client
+			},
+			(error: unknown) => {
+				// Forget the failed attempt so a later call can retry.
+				redisConnection = undefined
+				throw error
+			},
+		)
+	}
+
+	return redisConnection
+}
+
+// Redis key builders; each key is namespaced by the run id.
+
+/** Pub/sub channel on which task events for a run are published. */
+export const getPubSubKey = (runId: number) => ["evals", runId].join(":")
+
+/** Set holding the runners registered for a run. */
+export const getRunnersKey = (runId: number) => ["runners", runId].join(":")
+
+/** Key holding the heartbeat value for a run. */
+export const getHeartbeatKey = (runId: number) => ["heartbeat", runId].join(":")
+
+/**
+ * Adds this process to the run's runner set. The member is identified by the
+ * task id and the `HOSTNAME` environment variable.
+ */
+export const registerRunner = async ({ runId, taskId }: { runId: number; taskId: number }) => {
+	const member = `task-${taskId}:${process.env.HOSTNAME}`
+	const client = await redisClient()
+	await client.sAdd(getRunnersKey(runId), member)
+}
+
+/**
+ * Removes this process from the run's runner set, using the same member
+ * format (`task-<taskId>:<HOSTNAME>`) that registration writes.
+ */
+export const deregisterRunner = async ({ runId, taskId }: { runId: number; taskId: number }) => {
+	const member = `task-${taskId}:${process.env.HOSTNAME}`
+	const client = await redisClient()
+	await client.sRem(getRunnersKey(runId), member)
+}
+
+/**
+ * Starts a heartbeat for a run: writes this process's pid under the run's
+ * heartbeat key with a TTL, then refreshes it twice per TTL period.
+ *
+ * @param runId - Run whose heartbeat key is maintained.
+ * @param interval - TTL of the heartbeat key, in seconds.
+ * @returns The timer handle; pass it to `stopHeartbeat` to stop refreshing.
+ */
+export const startHeartbeat = async (runId: number, interval: number = 10) => {
+	const pid = process.pid.toString()
+	const redis = await redisClient()
+	const heartbeatKey = getHeartbeatKey(runId)
+	await redis.setEx(heartbeatKey, interval, pid)
+
+	return setInterval(
+		() =>
+			// Rewrite the key instead of only extending its TTL: EXPIRE does nothing
+			// for a key that has already expired, so one missed refresh window would
+			// otherwise end the heartbeat for the rest of the run.
+			redis.setEx(heartbeatKey, interval, pid).catch((error) => {
+				console.error("heartbeat error:", error)
+			}),
+		(interval * 1_000) / 2,
+	)
+}
+
+/**
+ * Stops the refresh timer and deletes the run's heartbeat key. Failures while
+ * talking to Redis are logged and never thrown to the caller.
+ */
+export const stopHeartbeat = async (runId: number, heartbeat: NodeJS.Timeout) => {
+	clearInterval(heartbeat)
+
+	const heartbeatKey = getHeartbeatKey(runId)
+
+	await redisClient()
+		.then((client) => client.del(heartbeatKey))
+		.catch((error) => {
+			console.error("redis.del failed:", error)
+		})
+}

+ 56 - 0
packages/evals/src/cli/runEvals.ts

@@ -0,0 +1,56 @@
+import { execa } from "execa"
+import PQueue from "p-queue"
+
+import { findRun, finishRun, getTasks } from "../db/index.js"
+import { exercisesPath } from "../exercises/index.js"
+
+import { getTag, isDockerContainer } from "./utils.js"
+import { processTask, processTaskInContainer } from "./processTask.js"
+import { startHeartbeat, stopHeartbeat } from "./redis.js"
+
+/**
+ * Executes all unfinished tasks of an eval run and finalizes the run.
+ *
+ * Prepares a fresh `runs/<id>-<suffix>` branch in the exercises repository,
+ * keeps a heartbeat alive while tasks execute with the run's configured
+ * concurrency, then records the result and commits the workspace changes.
+ * Inside a Docker container each task is delegated to its own container;
+ * otherwise tasks run in this process.
+ *
+ * @param runId - Id of the run to execute.
+ * @throws If the run already has metrics (already finished) or has no tasks.
+ */
+export const runEvals = async (runId: number) => {
+	const run = await findRun(runId)
+
+	if (run.taskMetricsId) {
+		throw new Error(`Run ${run.id} already finished.`)
+	}
+
+	const tasks = await getTasks(runId)
+
+	if (tasks.length === 0) {
+		throw new Error(`Run ${run.id} has no tasks.`)
+	}
+
+	const tag = getTag("runEvals", { run })
+	console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`)
+
+	// Reset the exercises repository and branch off `main` for this run.
+	const cwd = exercisesPath
+	await execa({ cwd })`git config user.name "Roo Code"`
+	await execa({ cwd })`git config user.email "[email protected]"`
+	await execa({ cwd })`git checkout -f`
+	await execa({ cwd })`git clean -fd`
+	await execa({ cwd })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
+
+	const heartbeat = await startHeartbeat(run.id)
+	const queue = new PQueue({ concurrency: run.concurrency })
+
+	try {
+		const containerize = isDockerContainer()
+
+		// Only tasks without a `finishedAt` timestamp are (re)processed.
+		await queue.addAll(
+			tasks
+				.filter((task) => task.finishedAt === null)
+				.map((task) => () => (containerize ? processTaskInContainer(task.id) : processTask(task.id))),
+		)
+
+		console.log(`[${Date.now()} | ${tag}] finishRun`)
+		const result = await finishRun(run.id)
+		console.log(`[${Date.now()} | ${tag}] result ->`, result)
+
+		await execa({ cwd })`git add .`
+		await execa({ cwd })`git commit -m ${`Run #${run.id}`} --no-verify`
+	} finally {
+		console.log(`[${Date.now()} | ${tag}] cleaning up`)
+		// Await the cleanup so the heartbeat key is removed before this function
+		// settles; stopHeartbeat logs its own Redis errors, so it cannot mask an
+		// exception thrown above.
+		await stopHeartbeat(run.id, heartbeat)
+	}
+}

+ 253 - 0
packages/evals/src/cli/runTask.ts

@@ -0,0 +1,253 @@
+import * as fs from "fs"
+import * as path from "path"
+import * as os from "node:os"
+
+import pWaitFor from "p-wait-for"
+import { execa } from "execa"
+
+import {
+	RooCodeEventName,
+	IpcOrigin,
+	IpcMessageType,
+	TaskCommandName,
+	type TaskEvent,
+	EVALS_SETTINGS,
+	EVALS_TIMEOUT,
+} from "@roo-code/types"
+import { IpcClient } from "@roo-code/ipc"
+
+import { type Run, type Task, updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js"
+import { exercisesPath } from "../exercises/index.js"
+
+import { getTag, isDockerContainer } from "./utils.js"
+
+type RunTaskOptions = {
+	run: Run
+	task: Task
+	publish: (taskEvent: TaskEvent) => Promise<void>
+}
+
+/**
+ * Runs one exercise in VS Code and records its progress.
+ *
+ * Opens a VS Code window on the exercise workspace, connects to the IPC socket
+ * passed to it via `ROO_CODE_IPC_SOCKET_PATH`, sends a `StartNewTask` command
+ * with the language prompt, and then reacts to task events: every event except
+ * those listed in `ignoreEvents.broadcast` is forwarded through `publish`, and
+ * metrics, tool errors and timestamps are written to the database. It waits
+ * until the task finishes, the client disconnects, or `EVALS_TIMEOUT` elapses,
+ * then closes the task and aborts the VS Code subprocess.
+ *
+ * @param run - Run the task belongs to; its `settings` are merged into the task configuration.
+ * @param task - Task to execute; `language` and `exercise` select the prompt and workspace.
+ * @param publish - Callback that receives each forwarded task event.
+ * @returns `{ success: false }` when no IPC connection could be made within the
+ * retry budget; otherwise `success` is whether `taskFinishedAt` was set.
+ */
+export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{ success: boolean }> => {
+	const { language, exercise } = task
+	const tag = getTag("runTask", { run, task })
+
+	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
+	const workspacePath = path.resolve(exercisesPath, language, exercise)
+	const taskSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)
+
+	// Inject foot gun system prompt if present.
+	if (process.env.FOOTGUN_SYSTEM_PROMPT) {
+		const rooDir = path.join(workspacePath, ".roo")
+
+		if (!fs.existsSync(rooDir)) {
+			fs.mkdirSync(rooDir, { recursive: true })
+		}
+
+		fs.writeFileSync(path.join(rooDir, "system-prompt-code"), process.env.FOOTGUN_SYSTEM_PROMPT)
+	}
+
+	console.log(`[${Date.now()} | ${tag}] Opening new VS Code window at ${workspacePath}`)
+
+	// Aborting this controller cancels the VS Code subprocess started below.
+	const controller = new AbortController()
+	const cancelSignal = controller.signal
+
+	// Inside a Docker container VS Code is launched under xvfb-run with extra flags.
+	const codeCommand = isDockerContainer()
+		? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic"`
+		: `code --disable-workspace-trust`
+
+	console.log(`[${Date.now()} | ${tag}] ${codeCommand}`)
+
+	// Sleep for a random amount of time between 5 and 10 seconds.
+	await new Promise((resolve) => setTimeout(resolve, Math.random() * 5_000 + 5_000))
+
+	const subprocess = execa({
+		env: {
+			ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
+		},
+		shell: "/bin/bash",
+		cancelSignal,
+	})`${codeCommand} -n ${workspacePath}`
+
+	// If debugging:
+	// NOTE(review): this pipe is active unconditionally, not only when debugging — confirm intended.
+	subprocess.stdout.pipe(process.stdout)
+
+	// Give VSCode some time to spawn before connecting to its unix socket.
+	await new Promise((resolve) => setTimeout(resolve, 3_000))
+	let client: IpcClient | undefined = undefined
+	let attempts = 5
+
+	// Try up to `attempts` times; each attempt waits at most one second for the client to become ready.
+	while (true) {
+		try {
+			console.log(`[${Date.now()} | ${tag}] connecting to ${taskSocketPath}`)
+			client = new IpcClient(taskSocketPath)
+			await pWaitFor(() => client!.isReady, { interval: 250, timeout: 1_000 })
+			break
+		} catch (_error) {
+			if (client) {
+				client.disconnect()
+			}
+
+			attempts--
+
+			if (attempts <= 0) {
+				console.error(`[${Date.now()} | ${tag}] unable to connect`)
+				// NOTE(review): this early return does not reach `controller.abort()` below,
+				// so the VS Code subprocess is left running — confirm intended.
+				return { success: false }
+			}
+		}
+	}
+
+	console.log(`[${Date.now()} | ${tag}] connected to ${taskSocketPath}`)
+
+	// State shared between the event handlers and the wait loop below.
+	let taskStartedAt = Date.now()
+	let taskFinishedAt: number | undefined
+	let taskMetricsId: number | undefined
+	let rooTaskId: string | undefined
+	let isClientDisconnected = false
+
+	// Events listed under `broadcast` are not published; events under `log` are not logged.
+	const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = {
+		broadcast: [RooCodeEventName.Message],
+		log: [RooCodeEventName.TaskTokenUsageUpdated], // [RooCodeEventName.Message, RooCodeEventName.TaskAskResponded],
+	}
+
+	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
+		const { eventName, payload } = taskEvent
+
+		if (!ignoreEvents.broadcast.includes(eventName)) {
+			await publish({ ...taskEvent, taskId: task.id })
+		}
+
+		if (!ignoreEvents.log.includes(eventName)) {
+			console.log(`[${Date.now()} | ${tag}] ${eventName} ->`, payload)
+		}
+
+		// Task started: create a zeroed metrics row and link it to the task.
+		if (eventName === RooCodeEventName.TaskStarted) {
+			taskStartedAt = Date.now()
+
+			const taskMetrics = await createTaskMetrics({
+				cost: 0,
+				tokensIn: 0,
+				tokensOut: 0,
+				tokensContext: 0,
+				duration: 0,
+				cacheWrites: 0,
+				cacheReads: 0,
+			})
+
+			await updateTask(task.id, { taskMetricsId: taskMetrics.id, startedAt: new Date() })
+
+			// Reset the start time after the database writes; durations are measured from here.
+			taskStartedAt = Date.now()
+			taskMetricsId = taskMetrics.id
+			rooTaskId = payload[0]
+		}
+
+		if (eventName === RooCodeEventName.TaskToolFailed) {
+			const [_taskId, toolName, error] = payload
+			await createToolError({ taskId: task.id, toolName, error })
+		}
+
+		// Token usage updates and completion both refresh the stored metrics.
+		if (
+			(eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) &&
+			taskMetricsId
+		) {
+			const duration = Date.now() - taskStartedAt
+
+			const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } =
+				payload[1]
+
+			await updateTaskMetrics(taskMetricsId, {
+				cost: totalCost,
+				tokensIn: totalTokensIn,
+				tokensOut: totalTokensOut,
+				tokensContext: contextTokens,
+				duration,
+				cacheWrites: totalCacheWrites ?? 0,
+				cacheReads: totalCacheReads ?? 0,
+			})
+		}
+
+		if (eventName === RooCodeEventName.TaskCompleted && taskMetricsId) {
+			const toolUsage = payload[2]
+			await updateTaskMetrics(taskMetricsId, { toolUsage })
+		}
+
+		// Either terminal event marks the task finished, which ends the wait below.
+		if (eventName === RooCodeEventName.TaskAborted || eventName === RooCodeEventName.TaskCompleted) {
+			taskFinishedAt = Date.now()
+			await updateTask(task.id, { finishedAt: new Date() })
+		}
+	})
+
+	client.on(IpcMessageType.Disconnect, async () => {
+		console.log(`[${Date.now()} | ${tag}] disconnect`)
+		isClientDisconnected = true
+	})
+
+	if (client.isReady) {
+		// Later spreads win: run settings override the defaults, and the API key from the environment is applied last.
+		const configuration = {
+			...EVALS_SETTINGS,
+			...run.settings,
+			openRouterApiKey: process.env.OPENROUTER_API_KEY,
+		}
+
+		client.sendMessage({
+			type: IpcMessageType.TaskCommand,
+			origin: IpcOrigin.Client,
+			clientId: client.clientId!,
+			data: {
+				commandName: TaskCommandName.StartNewTask,
+				data: {
+					configuration,
+					text: prompt,
+					newTab: true,
+				},
+			},
+		})
+	} else {
+		console.error(`[${Date.now()} | ${tag}] unable to connect`)
+		client.disconnect()
+		// NOTE(review): setting `taskFinishedAt` here makes the function return
+		// `{ success: true }` even though the task never started — confirm intended.
+		taskFinishedAt = Date.now()
+		isClientDisconnected = true
+	}
+
+	try {
+		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: EVALS_TIMEOUT })
+		// eslint-disable-next-line @typescript-eslint/no-unused-vars
+	} catch (error) {
+		console.log(`[${Date.now()} | ${tag}] time limit reached`)
+
+		// Cancel the task.
+		if (rooTaskId && !isClientDisconnected) {
+			client.sendMessage({
+				type: IpcMessageType.TaskCommand,
+				origin: IpcOrigin.Client,
+				clientId: client.clientId!,
+				data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
+			})
+
+			// Allow some time for the task to cancel.
+			await new Promise((resolve) => setTimeout(resolve, 5_000))
+		}
+
+		// The database row is marked finished on timeout; `taskFinishedAt` is not set here.
+		await updateTask(task.id, { finishedAt: new Date() })
+	}
+
+	if (!isClientDisconnected) {
+		if (rooTaskId) {
+			client.sendMessage({
+				type: IpcMessageType.TaskCommand,
+				origin: IpcOrigin.Client,
+				clientId: client.clientId!,
+				data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
+			})
+
+			// Allow some time for the window to close.
+			await new Promise((resolve) => setTimeout(resolve, 2_000))
+		}
+
+		client.disconnect()
+	}
+
+	// Cancel the VS Code subprocess and wait for it to settle.
+	controller.abort()
+	await subprocess
+
+	return { success: !!taskFinishedAt }
+}

+ 84 - 0
packages/evals/src/cli/runUnitTest.ts

@@ -0,0 +1,84 @@
+import * as path from "path"
+
+import { execa, parseCommandString } from "execa"
+import psTree from "ps-tree"
+
+import { type Task } from "../db/index.js"
+import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js"
+
+// Time limit (2 minutes, in milliseconds) given to each unit test command before it is force-killed.
+const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
+
+// Per-language shell commands that run an exercise's unit tests. The commands of an entry are
+// executed in order, stopping at the first failure; `cwd`, when set, is resolved relative to
+// the exercise directory.
+const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
+	go: { commands: ["go test"] },
+	java: { commands: ["./gradlew test"] },
+	javascript: { commands: ["pnpm install", "pnpm test"] },
+	python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] },
+	rust: { commands: ["cargo test"] },
+}
+
+/**
+ * Resolves the PIDs of every descendant process of `pid` using `ps-tree`.
+ * Rejects when `ps-tree` reports an error.
+ */
+const getDescendantPids = (pid: number) =>
+	new Promise<number[]>((resolve, reject) => {
+		psTree(pid, (err, children) => {
+			if (err) {
+				// Stop here: `children` cannot be relied on after a failed lookup.
+				reject(err)
+				return
+			}
+
+			resolve(children.map((p) => parseInt(p.PID, 10)))
+		})
+	})
+
+/**
+ * Runs the unit test commands configured for the task's language inside the
+ * exercise directory, one after another.
+ *
+ * Each command gets a time limit (the language's `timeout` if configured,
+ * otherwise `UNIT_TEST_TIMEOUT`). When the limit is hit, the command's
+ * descendant processes and then the command itself are killed with `kill -9`.
+ *
+ * @param task - The task whose `language` and `exercise` select what to test.
+ * @returns `true` if every command succeeded; `false` as soon as one command
+ * fails, is killed, or throws.
+ */
+export const runUnitTest = async ({ task }: { task: Task }) => {
+	const cmd = testCommands[task.language]
+	const exercisePath = path.resolve(exercisesPath, task.language, task.exercise)
+	const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath
+	const commands = cmd.commands.map((cs) => parseCommandString(cs))
+	const tag = `cli#runUnitTest | ${task.language} / ${task.exercise}`
+	const timeoutMs = cmd.timeout ?? UNIT_TEST_TIMEOUT
+
+	let passed = true
+
+	for (const command of commands) {
+		let timeout: ReturnType<typeof setTimeout> | undefined
+
+		try {
+			// `reject: false` makes a failing command resolve with `failed: true` instead of throwing.
+			const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
+
+			timeout = setTimeout(async () => {
+				const pid = subprocess.pid
+
+				// Without a PID the process never started, so there is nothing to kill.
+				if (pid === undefined) {
+					return
+				}
+
+				let descendants: number[] = []
+
+				try {
+					descendants = await getDescendantPids(pid)
+				} catch (error) {
+					// A failed lookup must not prevent killing the main process below,
+					// nor surface as an unhandled rejection from this timer callback.
+					console.error(`${Date.now()} [${tag}] Error listing descendant processes:`, error)
+				}
+
+				console.log(
+					`${Date.now()} [${tag}] "${command.join(" ")}": unit tests timed out, killing ${pid} + ${JSON.stringify(descendants)}`,
+				)
+
+				for (const descendant of descendants) {
+					try {
+						console.log(`${Date.now()} [${tag}] killing ${descendant}`)
+
+						await execa`kill -9 ${descendant}`
+					} catch (error) {
+						console.error(`${Date.now()} [${tag}] Error killing descendant processes:`, error)
+					}
+				}
+
+				console.log(`${Date.now()} [${tag}] killing ${pid}`)
+
+				try {
+					await execa`kill -9 ${pid}`
+				} catch (error) {
+					console.error(`${Date.now()} [${tag}] Error killing process:`, error)
+				}
+			}, timeoutMs)
+
+			const result = await subprocess
+
+			if (result.failed) {
+				passed = false
+				break
+			}
+		} catch (error) {
+			console.error(`${Date.now()} [${tag}]`, error)
+			passed = false
+			break
+		} finally {
+			// Always disarm the watchdog, including when awaiting the subprocess throws.
+			clearTimeout(timeout)
+		}
+	}
+
+	return passed
+}

+ 16 - 0
packages/evals/src/cli/utils.ts

@@ -0,0 +1,16 @@
+import * as fs from "fs"
+
+import type { Run, Task } from "../db/index.js"
+
+/**
+ * Builds the prefix used to tag log lines: the caller name, the current
+ * process id and the run id, followed by the task id and its
+ * `language/exercise` when a task is supplied.
+ */
+export const getTag = (caller: string, { run, task }: { run: Run; task?: Task }) => {
+	const runTag = `${caller} | pid:${process.pid} | run:${run.id}`
+
+	if (!task) {
+		return runTag
+	}
+
+	return `${runTag} | task:${task.id} | ${task.language}/${task.exercise}`
+}
+
+/**
+ * Reports whether `/.dockerenv` exists, which is used here as the signal that
+ * the process is running inside a Docker container.
+ *
+ * `fs.existsSync` returns `false` instead of throwing when the path cannot be
+ * checked, so no error handling is needed.
+ */
+export const isDockerContainer = () => fs.existsSync("/.dockerenv")

+ 9 - 0
packages/evals/src/db/queries/runs.ts

@@ -114,7 +114,16 @@ export const deleteRun = async (runId: number) => {
 		columns: { id: true, taskMetricsId: true },
 	})
 
+	await db.delete(schema.toolErrors).where(
+		inArray(
+			schema.toolErrors.taskId,
+			tasks.map(({ id }) => id),
+		),
+	)
+
 	await db.delete(schema.tasks).where(eq(schema.tasks.runId, runId))
+
+	await db.delete(schema.toolErrors).where(eq(schema.toolErrors.runId, runId))
 	await db.delete(schema.runs).where(eq(schema.runs.id, runId))
 
 	const taskMetricsIds = tasks

+ 6 - 2
packages/evals/src/db/queries/tasks.ts

@@ -1,4 +1,4 @@
-import { and, eq } from "drizzle-orm"
+import { and, asc, eq } from "drizzle-orm"
 
 import type { ExerciseLanguage } from "../../exercises/index.js"
 
@@ -58,4 +58,8 @@ export const getTask = async ({ runId, language, exercise }: GetTask) =>
 	})
 
 export const getTasks = async (runId: number) =>
-	db.query.tasks.findMany({ where: eq(tasks.runId, runId), with: { taskMetrics: true } })
+	db.query.tasks.findMany({
+		where: eq(tasks.runId, runId),
+		with: { taskMetrics: true },
+		orderBy: asc(tasks.id),
+	})

+ 1 - 1
packages/evals/src/db/schema.ts

@@ -117,4 +117,4 @@ export type UpdateToolError = Partial<Omit<ToolError, "id" | "createdAt">>
  * schema
  */
 
-export const schema = { runs, runsRelations, tasks, tasksRelations, taskMetrics }
+export const schema = { runs, runsRelations, tasks, tasksRelations, taskMetrics, toolErrors, toolErrorsRelations }

+ 73 - 1
packages/types/src/global-settings.ts

@@ -48,7 +48,7 @@ export const globalSettingsSchema = z.object({
 	allowedMaxRequests: z.number().nullish(),
 	autoCondenseContext: z.boolean().optional(),
 	autoCondenseContextPercent: z.number().optional(),
- 	maxConcurrentFileReads: z.number().optional(),
+	maxConcurrentFileReads: z.number().optional(),
 
 	browserToolEnabled: z.boolean().optional(),
 	browserViewportSize: z.string().optional(),
@@ -264,3 +264,75 @@ export const GLOBAL_STATE_KEYS = [...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_
 
 export const isGlobalStateKey = (key: string): key is Keys<GlobalState> =>
 	GLOBAL_STATE_KEYS.includes(key as Keys<GlobalState>)
+
+/**
+ * Evals
+ */
+
+/**
+ * Default settings when running evals (unless overridden).
+ *
+ * The eval task runner spreads this object first and the run's own settings
+ * after it when building a task configuration, so any key set on a run takes
+ * precedence over the value listed here.
+ */
+export const EVALS_SETTINGS: RooCodeSettings = {
+	apiProvider: "openrouter",
+	openRouterUseMiddleOutTransform: false,
+
+	lastShownAnnouncementId: "may-29-2025-3-19",
+
+	pinnedApiConfigs: {},
+
+	autoApprovalEnabled: true,
+	alwaysAllowReadOnly: true,
+	alwaysAllowReadOnlyOutsideWorkspace: false,
+	alwaysAllowWrite: true,
+	alwaysAllowWriteOutsideWorkspace: false,
+	writeDelayMs: 1000,
+	alwaysAllowBrowser: true,
+	alwaysApproveResubmit: true,
+	requestDelaySeconds: 10,
+	alwaysAllowMcp: true,
+	alwaysAllowModeSwitch: true,
+	alwaysAllowSubtasks: true,
+	alwaysAllowExecute: true,
+	allowedCommands: ["*"],
+
+	browserToolEnabled: false,
+	browserViewportSize: "900x600",
+	screenshotQuality: 75,
+	remoteBrowserEnabled: false,
+
+	ttsEnabled: false,
+	ttsSpeed: 1,
+	soundEnabled: false,
+	soundVolume: 0.5,
+
+	terminalOutputLineLimit: 500,
+	terminalShellIntegrationTimeout: 30000,
+	terminalCommandDelay: 0,
+	terminalPowershellCounter: false,
+	terminalZshOhMy: true,
+	terminalZshClearEolMark: true,
+	terminalZshP10k: false,
+	terminalZdotdir: true,
+	terminalCompressProgressBar: true,
+	terminalShellIntegrationDisabled: true,
+
+	diffEnabled: true,
+	fuzzyMatchThreshold: 1,
+
+	enableCheckpoints: false,
+
+	rateLimitSeconds: 0,
+	maxOpenTabsContext: 20,
+	maxWorkspaceFiles: 200,
+	showRooIgnoredFiles: true,
+	maxReadFileLine: -1, // -1 to enable full file reading.
+
+	language: "en",
+	telemetrySetting: "enabled",
+
+	mcpEnabled: false,
+
+	mode: "code",
+
+	customModes: [],
+}
+
+// Time limit (5 minutes, in milliseconds) the eval task runner waits for a task to finish
+// or its client to disconnect before it cancels the task.
+export const EVALS_TIMEOUT = 5 * 60 * 1_000

+ 270 - 6
pnpm-lock.yaml

@@ -143,9 +143,6 @@ importers:
       '@roo-code/evals':
         specifier: workspace:^
         version: link:../../packages/evals
-      '@roo-code/ipc':
-        specifier: workspace:^
-        version: link:../../packages/ipc
       '@roo-code/types':
         specifier: workspace:^
         version: link:../../packages/types
@@ -176,9 +173,6 @@ importers:
       p-map:
         specifier: ^7.0.3
         version: 7.0.3
-      ps-tree:
-        specifier: ^1.2.0
-        version: 1.2.0
       react:
         specifier: ^18.3.1
         version: 18.3.1
@@ -191,6 +185,9 @@ importers:
       react-use:
         specifier: ^17.6.0
         version: 17.6.0([email protected]([email protected]))([email protected])
+      redis:
+        specifier: ^5.5.5
+        version: 5.5.5
       sonner:
         specifier: ^2.0.5
         version: 2.0.5([email protected]([email protected]))([email protected])
@@ -228,6 +225,9 @@ importers:
       tailwindcss:
         specifier: ^4
         version: 4.1.6
+      vitest:
+        specifier: ^3.2.1
+        version: 3.2.1(@types/[email protected])(@types/[email protected])([email protected])([email protected])([email protected])([email protected])([email protected])
 
   apps/web-roo-code:
     dependencies:
@@ -446,6 +446,9 @@ importers:
       p-map:
         specifier: ^7.0.3
         version: 7.0.3
+      p-queue:
+        specifier: ^8.1.0
+        version: 8.1.0
       p-wait-for:
         specifier: ^5.0.2
         version: 5.0.2
@@ -455,6 +458,9 @@ importers:
       ps-tree:
         specifier: ^1.2.0
         version: 1.2.0
+      redis:
+        specifier: ^5.5.5
+        version: 5.5.5
       zod:
         specifier: ^3.24.2
         version: 3.24.4
@@ -3338,6 +3344,34 @@ packages:
   '@radix-ui/[email protected]':
     resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==}
 
+  '@redis/[email protected]':
+    resolution: {integrity: sha512-M0GDmw8k0EOFoSpmMjhFUADk/apoano97fLSpT81opgmkkDtBB9iB6l6husxnzK5t2qNz/o0+OCVG9g6lEEwKw==}
+    engines: {node: '>= 18'}
+    peerDependencies:
+      '@redis/client': ^5.5.5
+
+  '@redis/[email protected]':
+    resolution: {integrity: sha512-1Dv/CVdMNLw0mlROSnmpp4MQu+6YIJX0YR0h3g2hnPdLvk6L7TcRcrUj7BQFGSeZD2MxklAUO+rp09ITUqE5Og==}
+    engines: {node: '>= 18'}
+
+  '@redis/[email protected]':
+    resolution: {integrity: sha512-Nq8wHjOhwuhD05YPWFPL9RyT3K1VdT37TKvqbhykZA2MWQgjjhLn5i1/6zZ+1b0Zc/Sr9E0eK9J8txk6YJR6EA==}
+    engines: {node: '>= 18'}
+    peerDependencies:
+      '@redis/client': ^5.5.5
+
+  '@redis/[email protected]':
+    resolution: {integrity: sha512-xM/DKrRhbsMS2QQF5bBPjR7P/QEjWWZDUr92r+UOwkZjvc/kmy0tp7h8zkxBo2jtSF99vkk2mwMzn6fQ8d60aQ==}
+    engines: {node: '>= 18'}
+    peerDependencies:
+      '@redis/client': ^5.5.5
+
+  '@redis/[email protected]':
+    resolution: {integrity: sha512-2ifwV75Fv/uVX4n0zqvgqIlIInHZtVj+afjcbXPBD2GhG2AeVfkitTz1bMnGnNDA78sWRYooK42OWH9yqujjyQ==}
+    engines: {node: '>= 18'}
+    peerDependencies:
+      '@redis/client': ^5.5.5
+
   '@rollup/[email protected]':
     resolution: {integrity: sha512-USm05zrsFxYLPdWWq+K3STlWiT/3ELn3RcV5hJMghpeAIhxfsUIg6mt12CBJBInWMV4VneoV7SfGv8xIwo2qNQ==}
     engines: {node: '>=14.0.0'}
@@ -4554,6 +4588,9 @@ packages:
   '@vitest/[email protected]':
     resolution: {integrity: sha512-0v4YVbhDKX3SKoy0PHWXpKhj44w+3zZkIoVES9Ex2pq+u6+Bijijbi2ua5kE+h3qT6LBWFTNZSCOEU37H8Y5sA==}
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-FqS/BnDOzV6+IpxrTg5GQRyLOCtcJqkwMwcS8qGCI2IyRVDwPAtutztaf1CjtPHlZlWtl1yUPCd7HM0cNiDOYw==}
+
   '@vitest/[email protected]':
     resolution: {integrity: sha512-PJbLjonJK82uCWHjzgBJZuR7zmAOrSvKk1QBxrennDIgtH4uK0TB1PvYmc0XBCigxxtiAVPfWtAdy4lpz8SQGQ==}
     peerDependencies:
@@ -4576,36 +4613,62 @@ packages:
       vite:
         optional: true
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-OXxMJnx1lkB+Vl65Re5BrsZEHc90s5NMjD23ZQ9NlU7f7nZiETGoX4NeKZSmsKjseuMq2uOYXdLOeoM0pJU+qw==}
+    peerDependencies:
+      msw: ^2.4.9
+      vite: ^5.0.0 || ^6.0.0 || ^7.0.0-0
+    peerDependenciesMeta:
+      msw:
+        optional: true
+      vite:
+        optional: true
+
   '@vitest/[email protected]':
     resolution: {integrity: sha512-i6FDiBeJUGLDKADw2Gb01UtUNb12yyXAqC/mmRWuYl+m/U9GS7s8us5ONmGkGpUUo7/iAYzI2ePVfOZTYvUifA==}
 
   '@vitest/[email protected]':
     resolution: {integrity: sha512-gUUhaUmPBHFkrqnOokmfMGRBMHhgpICud9nrz/xpNV3/4OXCn35oG+Pl8rYYsKaTNd/FAIrqRHnwpDpmYxCYZw==}
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-xBh1X2GPlOGBupp6E1RcUQWIxw0w/hRLd3XyBS6H+dMdKTAqHDNsIR2AnJwPA3yYe9DFy3VUKTe3VRTrAiQ01g==}
+
   '@vitest/[email protected]':
     resolution: {integrity: sha512-Tae+ogtlNfFei5DggOsSUvkIaSuVywujMj6HzR97AHK6XK8i3BuVyIifWAm/sE3a15lF5RH9yQIrbXYuo0IFyA==}
 
   '@vitest/[email protected]':
     resolution: {integrity: sha512-bXdmnHxuB7fXJdh+8vvnlwi/m1zvu+I06i1dICVcDQFhyV4iKw2RExC/acavtDn93m/dRuawUObKsrNE1gJacA==}
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-kygXhNTu/wkMYbwYpS3z/9tBe0O8qpdBuC3dD/AW9sWa0LE/DAZEjnHtWA9sIad7lpD4nFW1yQ+zN7mEKNH3yA==}
+
   '@vitest/[email protected]':
     resolution: {integrity: sha512-XVa5OPNTYUsyqG9skuUkFzAeFnEzDp8hQu7kZ0N25B1+6KjGm4hWLtURyBbsIAOekfWQ7Wuz/N/XXzgYO3deWQ==}
 
   '@vitest/[email protected]':
     resolution: {integrity: sha512-z7P/EneBRMe7hdvWhcHoXjhA6at0Q4ipcoZo6SqgxLyQQ8KSMMCmvw1cSt7FHib3ozt0wnRHc37ivuUMbxzG/A==}
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-5xko/ZpW2Yc65NVK9Gpfg2y4BFvcF+At7yRT5AHUpTg9JvZ4xZoyuRY4ASlmNcBZjMslV08VRLDrBOmUe2YX3g==}
+
   '@vitest/[email protected]':
     resolution: {integrity: sha512-x6w+ctOEmEXdWaa6TO4ilb7l9DxPR5bwEb6hILKuxfU1NqWT2mpJD9NJN7t3OTfxmVlOMrvtoFJGdgyzZ605lQ==}
 
   '@vitest/[email protected]':
     resolution: {integrity: sha512-s3+TkCNUIEOX99S0JwNDfsHRaZDDZZR/n8F0mop0PmsEbQGKZikCGpTGZ6JRiHuONKew3Fb5//EPwCP+pUX9cw==}
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-Nbfib34Z2rfcJGSetMxjDCznn4pCYPZOtQYox2kzebIJcgH75yheIKd5QYSFmR8DIZf2M8fwOm66qSDIfRFFfQ==}
+
   '@vitest/[email protected]':
     resolution: {integrity: sha512-2Ltrpht4OmHO9+c/nmHtF09HWiyWdworqnHIwjfvDyWjuwKbdkcS9AnhsDn+8E2RM4x++foD1/tNuLPVvWG1Rg==}
 
   '@vitest/[email protected]':
     resolution: {integrity: sha512-gXXOe7Fj6toCsZKVQouTRLJftJwmvbhH5lKOBR6rlP950zUq9AitTUjnFoXS/CqjBC2aoejAztLPzzuva++XBw==}
 
+  '@vitest/[email protected]':
+    resolution: {integrity: sha512-KkHlGhePEKZSub5ViknBcN5KEF+u7dSUr9NW8QsVICusUojrgrOnnY3DEWWO877ax2Pyopuk2qHmt+gkNKnBVw==}
+
   '@vscode/[email protected]':
     resolution: {integrity: sha512-wsNOvNMMJ2BY8rC2N2MNBG7yOowV3ov8KlvUE/AiVUlHKTfWsw3OgAOQduX7h0Un6GssKD3aoTVH+TF3DSQwKQ==}
 
@@ -5221,6 +5284,10 @@ packages:
     resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==}
     engines: {node: '>=6'}
 
+  [email protected]:
+    resolution: {integrity: sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA==}
+    engines: {node: '>=0.10.0'}
+
   [email protected]:
     resolution: {integrity: sha512-nsnxf6wNIM/JAS7T/x/1JmbEsjH0a8tezXqqpaL0O6+eV0/aDEnRxwjxpu0VzDdRcaC1ixGSbRlUuf/IU59I4g==}
 
@@ -8517,6 +8584,10 @@ packages:
     resolution: {integrity: sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA==}
     engines: {node: '>=18'}
 
+  [email protected]:
+    resolution: {integrity: sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw==}
+    engines: {node: '>=18'}
+
   [email protected]:
     resolution: {integrity: sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg==}
     engines: {node: '>=14.16'}
@@ -9131,6 +9202,10 @@ packages:
     resolution: {integrity: sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==}
     engines: {node: '>=8'}
 
+  [email protected]:
+    resolution: {integrity: sha512-x7vpciikEY7nptGzQrE5I+/pvwFZJDadPk/uEoyGSg/pZ2m/CX2n5EhSgUh+S5T7Gz3uKM6YzWcXEu3ioAsdFQ==}
+    engines: {node: '>= 18'}
+
   [email protected]:
     resolution: {integrity: sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==}
     engines: {node: '>= 0.4'}
@@ -10333,6 +10408,11 @@ packages:
     engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0}
     hasBin: true
 
+  [email protected]:
+    resolution: {integrity: sha512-V4EyKQPxquurNJPtQJRZo8hKOoKNBRIhxcDbQFPFig0JdoWcUhwRgK8yoCXXrfYVPKS6XwirGHPszLnR8FbjCA==}
+    engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0}
+    hasBin: true
+
   [email protected]:
     resolution: {integrity: sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==}
     engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0}
@@ -10429,6 +10509,34 @@ packages:
       jsdom:
         optional: true
 
+  [email protected]:
+    resolution: {integrity: sha512-VZ40MBnlE1/V5uTgdqY3DmjUgZtIzsYq758JGlyQrv5syIsaYcabkfPkEuWML49Ph0D/SoqpVFd0dyVTr551oA==}
+    engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0}
+    hasBin: true
+    peerDependencies:
+      '@edge-runtime/vm': '*'
+      '@types/debug': ^4.1.12
+      '@types/node': ^18.0.0 || ^20.0.0 || >=22.0.0
+      '@vitest/browser': 3.2.1
+      '@vitest/ui': 3.2.1
+      happy-dom: '*'
+      jsdom: '*'
+    peerDependenciesMeta:
+      '@edge-runtime/vm':
+        optional: true
+      '@types/debug':
+        optional: true
+      '@types/node':
+        optional: true
+      '@vitest/browser':
+        optional: true
+      '@vitest/ui':
+        optional: true
+      happy-dom:
+        optional: true
+      jsdom:
+        optional: true
+
   [email protected]:
     resolution: {integrity: sha512-Dhxzh5HZuiHQhbvTW9AMetFfBHDMYpo23Uo9btPXgdYP+3T5S+p+jgNy7spra+veYhBP2dCSgxR/i2Y02h5/6w==}
     engines: {node: '>=0.10.0'}
@@ -13326,6 +13434,26 @@ snapshots:
 
   '@radix-ui/[email protected]': {}
 
+  '@redis/[email protected](@redis/[email protected])':
+    dependencies:
+      '@redis/client': 5.5.5
+
+  '@redis/[email protected]':
+    dependencies:
+      cluster-key-slot: 1.1.2
+
+  '@redis/[email protected](@redis/[email protected])':
+    dependencies:
+      '@redis/client': 5.5.5
+
+  '@redis/[email protected](@redis/[email protected])':
+    dependencies:
+      '@redis/client': 5.5.5
+
+  '@redis/[email protected](@redis/[email protected])':
+    dependencies:
+      '@redis/client': 5.5.5
+
   '@rollup/[email protected]([email protected])':
     dependencies:
       '@types/estree': 1.0.7
@@ -14767,6 +14895,14 @@ snapshots:
       chai: 5.2.0
       tinyrainbow: 2.0.0
 
+  '@vitest/[email protected]':
+    dependencies:
+      '@types/chai': 5.2.2
+      '@vitest/spy': 3.2.1
+      '@vitest/utils': 3.2.1
+      chai: 5.2.0
+      tinyrainbow: 2.0.0
+
   '@vitest/[email protected]([email protected](@types/[email protected])([email protected])([email protected])([email protected])([email protected]))':
     dependencies:
       '@vitest/spy': 3.1.3
@@ -14791,6 +14927,14 @@ snapshots:
     optionalDependencies:
       vite: 6.3.5(@types/[email protected])([email protected])([email protected])([email protected])([email protected])
 
+  '@vitest/[email protected]([email protected](@types/[email protected])([email protected])([email protected])([email protected])([email protected]))':
+    dependencies:
+      '@vitest/spy': 3.2.1
+      estree-walker: 3.0.3
+      magic-string: 0.30.17
+    optionalDependencies:
+      vite: 6.3.5(@types/[email protected])([email protected])([email protected])([email protected])([email protected])
+
   '@vitest/[email protected]':
     dependencies:
       tinyrainbow: 2.0.0
@@ -14799,6 +14943,10 @@ snapshots:
     dependencies:
       tinyrainbow: 2.0.0
 
+  '@vitest/[email protected]':
+    dependencies:
+      tinyrainbow: 2.0.0
+
   '@vitest/[email protected]':
     dependencies:
       '@vitest/utils': 3.1.3
@@ -14809,6 +14957,11 @@ snapshots:
       '@vitest/utils': 3.2.0
       pathe: 2.0.3
 
+  '@vitest/[email protected]':
+    dependencies:
+      '@vitest/utils': 3.2.1
+      pathe: 2.0.3
+
   '@vitest/[email protected]':
     dependencies:
       '@vitest/pretty-format': 3.1.3
@@ -14821,6 +14974,12 @@ snapshots:
       magic-string: 0.30.17
       pathe: 2.0.3
 
+  '@vitest/[email protected]':
+    dependencies:
+      '@vitest/pretty-format': 3.2.1
+      magic-string: 0.30.17
+      pathe: 2.0.3
+
   '@vitest/[email protected]':
     dependencies:
       tinyspy: 3.0.2
@@ -14829,6 +14988,10 @@ snapshots:
     dependencies:
       tinyspy: 4.0.3
 
+  '@vitest/[email protected]':
+    dependencies:
+      tinyspy: 4.0.3
+
   '@vitest/[email protected]':
     dependencies:
       '@vitest/pretty-format': 3.1.3
@@ -14841,6 +15004,12 @@ snapshots:
       loupe: 3.1.3
       tinyrainbow: 2.0.0
 
+  '@vitest/[email protected]':
+    dependencies:
+      '@vitest/pretty-format': 3.2.1
+      loupe: 3.1.3
+      tinyrainbow: 2.0.0
+
   '@vscode/[email protected]': {}
 
   '@vscode/[email protected]':
@@ -15554,6 +15723,8 @@ snapshots:
 
   [email protected]: {}
 
+  [email protected]: {}
+
   [email protected]:
     dependencies:
       chalk: 4.1.2
@@ -19547,6 +19718,11 @@ snapshots:
 
   [email protected]: {}
 
+  [email protected]:
+    dependencies:
+      eventemitter3: 5.0.1
+      p-timeout: 6.1.4
+
   [email protected]: {}
 
   [email protected]: {}
@@ -20224,6 +20400,14 @@ snapshots:
       indent-string: 4.0.0
       strip-indent: 3.0.0
 
+  [email protected]:
+    dependencies:
+      '@redis/bloom': 5.5.5(@redis/[email protected])
+      '@redis/client': 5.5.5
+      '@redis/json': 5.5.5(@redis/[email protected])
+      '@redis/search': 5.5.5(@redis/[email protected])
+      '@redis/time-series': 5.5.5(@redis/[email protected])
+
   [email protected]:
     dependencies:
       call-bind: 1.0.8
@@ -21694,6 +21878,27 @@ snapshots:
       - tsx
       - yaml
 
+  [email protected](@types/[email protected])([email protected])([email protected])([email protected])([email protected]):
+    dependencies:
+      cac: 6.7.14
+      debug: 4.4.1([email protected])
+      es-module-lexer: 1.7.0
+      pathe: 2.0.3
+      vite: 6.3.5(@types/[email protected])([email protected])([email protected])([email protected])([email protected])
+    transitivePeerDependencies:
+      - '@types/node'
+      - jiti
+      - less
+      - lightningcss
+      - sass
+      - sass-embedded
+      - stylus
+      - sugarss
+      - supports-color
+      - terser
+      - tsx
+      - yaml
+
   [email protected](@types/[email protected])([email protected])([email protected])([email protected])([email protected]):
     dependencies:
       esbuild: 0.25.5
@@ -21742,6 +21947,22 @@ snapshots:
       tsx: 4.19.4
       yaml: 2.8.0
 
+  [email protected](@types/[email protected])([email protected])([email protected])([email protected])([email protected]):
+    dependencies:
+      esbuild: 0.25.5
+      fdir: 6.4.4([email protected])
+      picomatch: 4.0.2
+      postcss: 8.5.4
+      rollup: 4.40.2
+      tinyglobby: 0.2.13
+    optionalDependencies:
+      '@types/node': 22.15.29
+      fsevents: 2.3.3
+      jiti: 2.4.2
+      lightningcss: 1.30.1
+      tsx: 4.19.4
+      yaml: 2.8.0
+
   [email protected](@types/[email protected])(@types/[email protected])([email protected])([email protected])([email protected])([email protected])([email protected]):
     dependencies:
       '@vitest/expect': 3.1.3
@@ -21867,6 +22088,49 @@ snapshots:
       - tsx
       - yaml
 
+  [email protected](@types/[email protected])(@types/[email protected])([email protected])([email protected])([email protected])([email protected])([email protected]):
+    dependencies:
+      '@types/chai': 5.2.2
+      '@vitest/expect': 3.2.1
+      '@vitest/mocker': 3.2.1([email protected](@types/[email protected])([email protected])([email protected])([email protected])([email protected]))
+      '@vitest/pretty-format': 3.2.1
+      '@vitest/runner': 3.2.1
+      '@vitest/snapshot': 3.2.1
+      '@vitest/spy': 3.2.1
+      '@vitest/utils': 3.2.1
+      chai: 5.2.0
+      debug: 4.4.1([email protected])
+      expect-type: 1.2.1
+      magic-string: 0.30.17
+      pathe: 2.0.3
+      picomatch: 4.0.2
+      std-env: 3.9.0
+      tinybench: 2.9.0
+      tinyexec: 0.3.2
+      tinyglobby: 0.2.14
+      tinypool: 1.1.0
+      tinyrainbow: 2.0.0
+      vite: 6.3.5(@types/[email protected])([email protected])([email protected])([email protected])([email protected])
+      vite-node: 3.2.1(@types/[email protected])([email protected])([email protected])([email protected])([email protected])
+      why-is-node-running: 2.3.0
+    optionalDependencies:
+      '@types/debug': 4.1.12
+      '@types/node': 22.15.29
+      jsdom: 20.0.3
+    transitivePeerDependencies:
+      - jiti
+      - less
+      - lightningcss
+      - msw
+      - sass
+      - sass-embedded
+      - stylus
+      - sugarss
+      - supports-color
+      - terser
+      - tsx
+      - yaml
+
   [email protected]: {}
 
   [email protected]: {}