Browse Source

Add an option to use our cli for evals (#10456)

Co-authored-by: Roo Code <[email protected]>
Chris Estreich 1 month ago
parent
commit
424bce6078

+ 4 - 0
.dockerignore

@@ -76,14 +76,18 @@ src/node_modules
 !pnpm-workspace.yaml
 !pnpm-workspace.yaml
 !scripts/bootstrap.mjs
 !scripts/bootstrap.mjs
 !apps/web-evals/
 !apps/web-evals/
+!apps/cli/
 !src/
 !src/
 !webview-ui/
 !webview-ui/
 !packages/evals/.docker/entrypoints/runner.sh
 !packages/evals/.docker/entrypoints/runner.sh
 !packages/build/
 !packages/build/
 !packages/config-eslint/
 !packages/config-eslint/
 !packages/config-typescript/
 !packages/config-typescript/
+!packages/core/
 !packages/evals/
 !packages/evals/
 !packages/ipc/
 !packages/ipc/
 !packages/telemetry/
 !packages/telemetry/
 !packages/types/
 !packages/types/
+!packages/vscode-shim/
+!packages/cloud/
 !locales/
 !locales/

+ 9 - 1
apps/web-evals/src/actions/runs.ts

@@ -28,10 +28,18 @@ const EVALS_STORAGE_PATH = "/tmp/evals/runs"
 
 
 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
 
-export async function createRun({ suite, exercises = [], timeout, iterations = 1, ...values }: CreateRun) {
+export async function createRun({
+	suite,
+	exercises = [],
+	timeout,
+	iterations = 1,
+	executionMethod = "vscode",
+	...values
+}: CreateRun) {
 	const run = await _createRun({
 	const run = await _createRun({
 		...values,
 		...values,
 		timeout,
 		timeout,
+		executionMethod,
 		socketPath: "", // TODO: Get rid of this.
 		socketPath: "", // TODO: Get rid of this.
 	})
 	})
 
 

+ 86 - 44
apps/web-evals/src/app/runs/new/new-run.tsx

@@ -7,15 +7,26 @@ import { useQuery } from "@tanstack/react-query"
 import { useForm, FormProvider } from "react-hook-form"
 import { useForm, FormProvider } from "react-hook-form"
 import { zodResolver } from "@hookform/resolvers/zod"
 import { zodResolver } from "@hookform/resolvers/zod"
 import { toast } from "sonner"
 import { toast } from "sonner"
-import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, Info, Plus, Minus } from "lucide-react"
+import {
+	X,
+	Rocket,
+	Check,
+	ChevronsUpDown,
+	SlidersHorizontal,
+	Info,
+	Plus,
+	Minus,
+	Terminal,
+	MonitorPlay,
+} from "lucide-react"
 
 
 import {
 import {
+	type ProviderSettings,
+	type GlobalSettings,
 	globalSettingsSchema,
 	globalSettingsSchema,
 	providerSettingsSchema,
 	providerSettingsSchema,
-	EVALS_SETTINGS,
 	getModelId,
 	getModelId,
-	type ProviderSettings,
-	type GlobalSettings,
+	EVALS_SETTINGS,
 } from "@roo-code/types"
 } from "@roo-code/types"
 
 
 import { createRun } from "@/actions/runs"
 import { createRun } from "@/actions/runs"
@@ -23,6 +34,7 @@ import { getExercises } from "@/actions/exercises"
 
 
 import {
 import {
 	type CreateRun,
 	type CreateRun,
+	type ExecutionMethod,
 	createRunSchema,
 	createRunSchema,
 	CONCURRENCY_MIN,
 	CONCURRENCY_MIN,
 	CONCURRENCY_MAX,
 	CONCURRENCY_MAX,
@@ -77,14 +89,12 @@ type ImportedSettings = {
 	currentApiConfigName: string
 	currentApiConfigName: string
 }
 }
 
 
-// Type for a model selection entry
 type ModelSelection = {
 type ModelSelection = {
 	id: string
 	id: string
 	model: string
 	model: string
 	popoverOpen: boolean
 	popoverOpen: boolean
 }
 }
 
 
-// Type for a config selection entry (for import mode)
 type ConfigSelection = {
 type ConfigSelection = {
 	id: string
 	id: string
 	configName: string
 	configName: string
@@ -95,16 +105,15 @@ export function NewRun() {
 	const router = useRouter()
 	const router = useRouter()
 
 
 	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other")
 	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other")
+	const [executionMethod, setExecutionMethod] = useState<ExecutionMethod>("vscode")
 	const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true)
 	const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true)
 	const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20)
 	const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20)
 	const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds
 	const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds
 
 
-	// State for multiple model selections
 	const [modelSelections, setModelSelections] = useState<ModelSelection[]>([
 	const [modelSelections, setModelSelections] = useState<ModelSelection[]>([
 		{ id: crypto.randomUUID(), model: "", popoverOpen: false },
 		{ id: crypto.randomUUID(), model: "", popoverOpen: false },
 	])
 	])
 
 
-	// State for imported settings with multiple config selections
 	const [importedSettings, setImportedSettings] = useState<ImportedSettings | null>(null)
 	const [importedSettings, setImportedSettings] = useState<ImportedSettings | null>(null)
 	const [configSelections, setConfigSelections] = useState<ConfigSelection[]>([
 	const [configSelections, setConfigSelections] = useState<ConfigSelection[]>([
 		{ id: crypto.randomUUID(), configName: "", popoverOpen: false },
 		{ id: crypto.randomUUID(), configName: "", popoverOpen: false },
@@ -119,7 +128,6 @@ export function NewRun() {
 
 
 	const exercises = useQuery({ queryKey: ["getExercises"], queryFn: () => getExercises() })
 	const exercises = useQuery({ queryKey: ["getExercises"], queryFn: () => getExercises() })
 
 
-	// State for selected exercises (needed for language toggle buttons)
 	const [selectedExercises, setSelectedExercises] = useState<string[]>([])
 	const [selectedExercises, setSelectedExercises] = useState<string[]>([])
 
 
 	const form = useForm<CreateRun>({
 	const form = useForm<CreateRun>({
@@ -134,6 +142,7 @@ export function NewRun() {
 			timeout: TIMEOUT_DEFAULT,
 			timeout: TIMEOUT_DEFAULT,
 			iterations: ITERATIONS_DEFAULT,
 			iterations: ITERATIONS_DEFAULT,
 			jobToken: "",
 			jobToken: "",
+			executionMethod: "vscode",
 		},
 		},
 	})
 	})
 
 
@@ -146,38 +155,49 @@ export function NewRun() {
 
 
 	const [suite, settings] = watch(["suite", "settings", "concurrency"])
 	const [suite, settings] = watch(["suite", "settings", "concurrency"])
 
 
-	// Load settings from localStorage on mount
 	useEffect(() => {
 	useEffect(() => {
 		const savedConcurrency = localStorage.getItem("evals-concurrency")
 		const savedConcurrency = localStorage.getItem("evals-concurrency")
+
 		if (savedConcurrency) {
 		if (savedConcurrency) {
 			const parsed = parseInt(savedConcurrency, 10)
 			const parsed = parseInt(savedConcurrency, 10)
+
 			if (!isNaN(parsed) && parsed >= CONCURRENCY_MIN && parsed <= CONCURRENCY_MAX) {
 			if (!isNaN(parsed) && parsed >= CONCURRENCY_MIN && parsed <= CONCURRENCY_MAX) {
 				setValue("concurrency", parsed)
 				setValue("concurrency", parsed)
 			}
 			}
 		}
 		}
+
 		const savedTimeout = localStorage.getItem("evals-timeout")
 		const savedTimeout = localStorage.getItem("evals-timeout")
+
 		if (savedTimeout) {
 		if (savedTimeout) {
 			const parsed = parseInt(savedTimeout, 10)
 			const parsed = parseInt(savedTimeout, 10)
+
 			if (!isNaN(parsed) && parsed >= TIMEOUT_MIN && parsed <= TIMEOUT_MAX) {
 			if (!isNaN(parsed) && parsed >= TIMEOUT_MIN && parsed <= TIMEOUT_MAX) {
 				setValue("timeout", parsed)
 				setValue("timeout", parsed)
 			}
 			}
 		}
 		}
+
 		const savedCommandTimeout = localStorage.getItem("evals-command-execution-timeout")
 		const savedCommandTimeout = localStorage.getItem("evals-command-execution-timeout")
+
 		if (savedCommandTimeout) {
 		if (savedCommandTimeout) {
 			const parsed = parseInt(savedCommandTimeout, 10)
 			const parsed = parseInt(savedCommandTimeout, 10)
+
 			if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) {
 			if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) {
 				setCommandExecutionTimeout(parsed)
 				setCommandExecutionTimeout(parsed)
 			}
 			}
 		}
 		}
+
 		const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout")
 		const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout")
+
 		if (savedShellTimeout) {
 		if (savedShellTimeout) {
 			const parsed = parseInt(savedShellTimeout, 10)
 			const parsed = parseInt(savedShellTimeout, 10)
+
 			if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) {
 			if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) {
 				setTerminalShellIntegrationTimeout(parsed)
 				setTerminalShellIntegrationTimeout(parsed)
 			}
 			}
 		}
 		}
-		// Load saved exercises selection
+
 		const savedSuite = localStorage.getItem("evals-suite")
 		const savedSuite = localStorage.getItem("evals-suite")
+
 		if (savedSuite === "partial") {
 		if (savedSuite === "partial") {
 			setValue("suite", "partial")
 			setValue("suite", "partial")
 			const savedExercises = localStorage.getItem("evals-exercises")
 			const savedExercises = localStorage.getItem("evals-exercises")
@@ -189,48 +209,57 @@ export function NewRun() {
 						setValue("exercises", parsed)
 						setValue("exercises", parsed)
 					}
 					}
 				} catch {
 				} catch {
-					// Invalid JSON, ignore
+					// Invalid JSON, ignore.
 				}
 				}
 			}
 			}
 		}
 		}
 	}, [setValue])
 	}, [setValue])
 
 
-	// Extract unique languages from exercises
 	const languages = useMemo(() => {
 	const languages = useMemo(() => {
-		if (!exercises.data) return []
+		if (!exercises.data) {
+			return []
+		}
+
 		const langs = new Set<string>()
 		const langs = new Set<string>()
+
 		for (const path of exercises.data) {
 		for (const path of exercises.data) {
 			const lang = path.split("/")[0]
 			const lang = path.split("/")[0]
-			if (lang) langs.add(lang)
+
+			if (lang) {
+				langs.add(lang)
+			}
 		}
 		}
+
 		return Array.from(langs).sort()
 		return Array.from(langs).sort()
 	}, [exercises.data])
 	}, [exercises.data])
 
 
-	// Get exercises for a specific language
 	const getExercisesForLanguage = useCallback(
 	const getExercisesForLanguage = useCallback(
 		(lang: string) => {
 		(lang: string) => {
-			if (!exercises.data) return []
+			if (!exercises.data) {
+				return []
+			}
+
 			return exercises.data.filter((path) => path.startsWith(`${lang}/`))
 			return exercises.data.filter((path) => path.startsWith(`${lang}/`))
 		},
 		},
 		[exercises.data],
 		[exercises.data],
 	)
 	)
 
 
-	// Toggle all exercises for a language
 	const toggleLanguage = useCallback(
 	const toggleLanguage = useCallback(
 		(lang: string) => {
 		(lang: string) => {
 			const langExercises = getExercisesForLanguage(lang)
 			const langExercises = getExercisesForLanguage(lang)
 			const allSelected = langExercises.every((ex) => selectedExercises.includes(ex))
 			const allSelected = langExercises.every((ex) => selectedExercises.includes(ex))
 
 
 			let newSelected: string[]
 			let newSelected: string[]
+
 			if (allSelected) {
 			if (allSelected) {
-				// Remove all exercises for this language
 				newSelected = selectedExercises.filter((ex) => !ex.startsWith(`${lang}/`))
 				newSelected = selectedExercises.filter((ex) => !ex.startsWith(`${lang}/`))
 			} else {
 			} else {
-				// Add all exercises for this language (avoiding duplicates)
 				const existing = new Set(selectedExercises)
 				const existing = new Set(selectedExercises)
+
 				for (const ex of langExercises) {
 				for (const ex of langExercises) {
 					existing.add(ex)
 					existing.add(ex)
 				}
 				}
+
 				newSelected = Array.from(existing)
 				newSelected = Array.from(existing)
 			}
 			}
 
 
@@ -241,7 +270,6 @@ export function NewRun() {
 		[getExercisesForLanguage, selectedExercises, setValue],
 		[getExercisesForLanguage, selectedExercises, setValue],
 	)
 	)
 
 
-	// Check if all exercises for a language are selected
 	const isLanguageSelected = useCallback(
 	const isLanguageSelected = useCallback(
 		(lang: string) => {
 		(lang: string) => {
 			const langExercises = getExercisesForLanguage(lang)
 			const langExercises = getExercisesForLanguage(lang)
@@ -250,7 +278,6 @@ export function NewRun() {
 		[getExercisesForLanguage, selectedExercises],
 		[getExercisesForLanguage, selectedExercises],
 	)
 	)
 
 
-	// Check if some (but not all) exercises for a language are selected
 	const isLanguagePartiallySelected = useCallback(
 	const isLanguagePartiallySelected = useCallback(
 		(lang: string) => {
 		(lang: string) => {
 			const langExercises = getExercisesForLanguage(lang)
 			const langExercises = getExercisesForLanguage(lang)
@@ -260,46 +287,40 @@ export function NewRun() {
 		[getExercisesForLanguage, selectedExercises],
 		[getExercisesForLanguage, selectedExercises],
 	)
 	)
 
 
-	// Add a new model selection
 	const addModelSelection = useCallback(() => {
 	const addModelSelection = useCallback(() => {
 		setModelSelections((prev) => [...prev, { id: crypto.randomUUID(), model: "", popoverOpen: false }])
 		setModelSelections((prev) => [...prev, { id: crypto.randomUUID(), model: "", popoverOpen: false }])
 	}, [])
 	}, [])
 
 
-	// Remove a model selection
 	const removeModelSelection = useCallback((id: string) => {
 	const removeModelSelection = useCallback((id: string) => {
 		setModelSelections((prev) => prev.filter((s) => s.id !== id))
 		setModelSelections((prev) => prev.filter((s) => s.id !== id))
 	}, [])
 	}, [])
 
 
-	// Update a model selection
 	const updateModelSelection = useCallback(
 	const updateModelSelection = useCallback(
 		(id: string, model: string) => {
 		(id: string, model: string) => {
 			setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, model, popoverOpen: false } : s)))
 			setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, model, popoverOpen: false } : s)))
-			// Also set the form model field for validation (use first non-empty model)
+			// Also mirror the just-selected model into the form's "model" field so validation passes.
 			setValue("model", model)
 			setValue("model", model)
 		},
 		},
 		[setValue],
 		[setValue],
 	)
 	)
 
 
-	// Toggle popover for a model selection
 	const toggleModelPopover = useCallback((id: string, open: boolean) => {
 	const toggleModelPopover = useCallback((id: string, open: boolean) => {
 		setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s)))
 		setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s)))
 	}, [])
 	}, [])
 
 
-	// Add a new config selection
 	const addConfigSelection = useCallback(() => {
 	const addConfigSelection = useCallback(() => {
 		setConfigSelections((prev) => [...prev, { id: crypto.randomUUID(), configName: "", popoverOpen: false }])
 		setConfigSelections((prev) => [...prev, { id: crypto.randomUUID(), configName: "", popoverOpen: false }])
 	}, [])
 	}, [])
 
 
-	// Remove a config selection
 	const removeConfigSelection = useCallback((id: string) => {
 	const removeConfigSelection = useCallback((id: string) => {
 		setConfigSelections((prev) => prev.filter((s) => s.id !== id))
 		setConfigSelections((prev) => prev.filter((s) => s.id !== id))
 	}, [])
 	}, [])
 
 
-	// Update a config selection
 	const updateConfigSelection = useCallback(
 	const updateConfigSelection = useCallback(
 		(id: string, configName: string) => {
 		(id: string, configName: string) => {
 			setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, configName, popoverOpen: false } : s)))
 			setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, configName, popoverOpen: false } : s)))
-			// Also update the form settings for the first config (for validation)
+
+			// Also sync the form fields from the config that was just selected (for validation).
 			if (importedSettings) {
 			if (importedSettings) {
 				const providerSettings = importedSettings.apiConfigs[configName] ?? {}
 				const providerSettings = importedSettings.apiConfigs[configName] ?? {}
 				setValue("model", getModelId(providerSettings) ?? "")
 				setValue("model", getModelId(providerSettings) ?? "")
@@ -309,7 +330,6 @@ export function NewRun() {
 		[importedSettings, setValue],
 		[importedSettings, setValue],
 	)
 	)
 
 
-	// Toggle popover for a config selection
 	const toggleConfigPopover = useCallback((id: string, open: boolean) => {
 	const toggleConfigPopover = useCallback((id: string, open: boolean) => {
 		setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s)))
 		setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s)))
 	}, [])
 	}, [])
@@ -317,24 +337,20 @@ export function NewRun() {
 	const onSubmit = useCallback(
 	const onSubmit = useCallback(
 		async (values: CreateRun) => {
 		async (values: CreateRun) => {
 			try {
 			try {
-				// Validate jobToken for Roo Code Cloud provider
 				if (provider === "roo" && !values.jobToken?.trim()) {
 				if (provider === "roo" && !values.jobToken?.trim()) {
 					toast.error("Roo Code Cloud Token is required")
 					toast.error("Roo Code Cloud Token is required")
 					return
 					return
 				}
 				}
 
 
-				// Determine which selections to use based on provider
 				const selectionsToLaunch: { model: string; configName?: string }[] = []
 				const selectionsToLaunch: { model: string; configName?: string }[] = []
 
 
 				if (provider === "other") {
 				if (provider === "other") {
-					// For import mode, use config selections
 					for (const config of configSelections) {
 					for (const config of configSelections) {
 						if (config.configName) {
 						if (config.configName) {
 							selectionsToLaunch.push({ model: "", configName: config.configName })
 							selectionsToLaunch.push({ model: "", configName: config.configName })
 						}
 						}
 					}
 					}
 				} else {
 				} else {
-					// For openrouter/roo, use model selections
 					for (const selection of modelSelections) {
 					for (const selection of modelSelections) {
 						if (selection.model) {
 						if (selection.model) {
 							selectionsToLaunch.push({ model: selection.model })
 							selectionsToLaunch.push({ model: selection.model })
@@ -347,20 +363,19 @@ export function NewRun() {
 					return
 					return
 				}
 				}
 
 
-				// Show launching toast
 				const totalRuns = selectionsToLaunch.length
 				const totalRuns = selectionsToLaunch.length
 				toast.info(totalRuns > 1 ? `Launching ${totalRuns} runs (every 20 seconds)...` : "Launching run...")
 				toast.info(totalRuns > 1 ? `Launching ${totalRuns} runs (every 20 seconds)...` : "Launching run...")
 
 
-				// Launch runs with 20-second delay between each
 				for (let i = 0; i < selectionsToLaunch.length; i++) {
 				for (let i = 0; i < selectionsToLaunch.length; i++) {
 					const selection = selectionsToLaunch[i]!
 					const selection = selectionsToLaunch[i]!
 
 
-					// Wait 20 seconds between runs (except for the first one)
+					// Wait 20 seconds between runs (except for the first one).
 					if (i > 0) {
 					if (i > 0) {
-						await new Promise((resolve) => setTimeout(resolve, 20000))
+						await new Promise((resolve) => setTimeout(resolve, 20_000))
 					}
 					}
 
 
 					const runValues = { ...values }
 					const runValues = { ...values }
+					runValues.executionMethod = executionMethod
 
 
 					if (provider === "openrouter") {
 					if (provider === "openrouter") {
 						runValues.model = selection.model
 						runValues.model = selection.model
@@ -403,7 +418,6 @@ export function NewRun() {
 					}
 					}
 				}
 				}
 
 
-				// Navigate back to main evals UI
 				router.push("/")
 				router.push("/")
 			} catch (e) {
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
@@ -411,6 +425,7 @@ export function NewRun() {
 		},
 		},
 		[
 		[
 			provider,
 			provider,
+			executionMethod,
 			modelSelections,
 			modelSelections,
 			configSelections,
 			configSelections,
 			importedSettings,
 			importedSettings,
@@ -442,18 +457,15 @@ export function NewRun() {
 					})
 					})
 					.parse(JSON.parse(await file.text()))
 					.parse(JSON.parse(await file.text()))
 
 
-				// Store all imported configs for user selection
 				setImportedSettings({
 				setImportedSettings({
 					apiConfigs: providerProfiles.apiConfigs,
 					apiConfigs: providerProfiles.apiConfigs,
 					globalSettings,
 					globalSettings,
 					currentApiConfigName: providerProfiles.currentApiConfigName,
 					currentApiConfigName: providerProfiles.currentApiConfigName,
 				})
 				})
 
 
-				// Default to the current config for the first selection
 				const defaultConfigName = providerProfiles.currentApiConfigName
 				const defaultConfigName = providerProfiles.currentApiConfigName
 				setConfigSelections([{ id: crypto.randomUUID(), configName: defaultConfigName, popoverOpen: false }])
 				setConfigSelections([{ id: crypto.randomUUID(), configName: defaultConfigName, popoverOpen: false }])
 
 
-				// Apply the default config
 				const providerSettings = providerProfiles.apiConfigs[defaultConfigName] ?? {}
 				const providerSettings = providerProfiles.apiConfigs[defaultConfigName] ?? {}
 				setValue("model", getModelId(providerSettings) ?? "")
 				setValue("model", getModelId(providerSettings) ?? "")
 				setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings })
 				setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings })
@@ -971,6 +983,36 @@ export function NewRun() {
 						</FormItem>
 						</FormItem>
 					</div>
 					</div>
 
 
+					{/* Execution Method */}
+					<FormField
+						control={form.control}
+						name="executionMethod"
+						render={() => (
+							<FormItem>
+								<FormLabel>Execution Method</FormLabel>
+								<Tabs
+									value={executionMethod}
+									onValueChange={(value) => {
+										const newExecutionMethod = value as ExecutionMethod
+										setExecutionMethod(newExecutionMethod)
+										setValue("executionMethod", newExecutionMethod)
+									}}>
+									<TabsList>
+										<TabsTrigger value="vscode" className="flex items-center gap-2">
+											<MonitorPlay className="size-4" />
+											VSCode
+										</TabsTrigger>
+										<TabsTrigger value="cli" className="flex items-center gap-2">
+											<Terminal className="size-4" />
+											CLI
+										</TabsTrigger>
+									</TabsList>
+								</Tabs>
+								<FormMessage />
+							</FormItem>
+						)}
+					/>
+
 					<FormField
 					<FormField
 						control={form.control}
 						control={form.control}
 						name="description"
 						name="description"

+ 8 - 0
apps/web-evals/src/lib/schemas.ts

@@ -2,6 +2,13 @@ import { z } from "zod"
 
 
 import { rooCodeSettingsSchema } from "@roo-code/types"
 import { rooCodeSettingsSchema } from "@roo-code/types"
 
 
+/**
+ * ExecutionMethod
+ */
+
+export const executionMethodSchema = z.enum(["vscode", "cli"])
+export type ExecutionMethod = z.infer<typeof executionMethodSchema>
+
 /**
 /**
  * CreateRun
  * CreateRun
  */
  */
@@ -29,6 +36,7 @@ export const createRunSchema = z
 		timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX),
 		timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX),
 		iterations: z.number().int().min(ITERATIONS_MIN).max(ITERATIONS_MAX),
 		iterations: z.number().int().min(ITERATIONS_MIN).max(ITERATIONS_MAX),
 		jobToken: z.string().optional(),
 		jobToken: z.string().optional(),
+		executionMethod: executionMethodSchema,
 	})
 	})
 	.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
 	.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
 		message: "Exercises are required when running a partial suite.",
 		message: "Exercises are required when running a partial suite.",

+ 60 - 26
packages/evals/Dockerfile.runner

@@ -1,14 +1,14 @@
-FROM node:20-slim AS base
+# Build with:
+# docker compose -f packages/evals/docker-compose.yml build runner
 
 
-# Install pnpm
-ENV PNPM_HOME="/pnpm"
-ENV PATH="$PNPM_HOME:$PATH"
-RUN corepack enable
-RUN npm install -g npm@latest npm-run-all
+# Test with:
+# docker compose -f packages/evals/docker-compose.yml run --rm runner bash
+
+FROM debian:bookworm-slim AS base
 
 
-# Install system packages
-RUN apt update && \
-  apt install -y \
+# Install system packages (excluding language runtimes - those come from mise)
+RUN apt-get update && \
+  apt-get install -y \
   curl \
   curl \
   git \
   git \
   vim \
   vim \
@@ -22,18 +22,13 @@ RUN apt update && \
   gpg \
   gpg \
   xvfb \
   xvfb \
   cmake \
   cmake \
-  golang-go \
-  default-jre \
-  python3 \
-  python3-venv \
-  python3-dev \
-  python3-pip \
+  build-essential \
   && rm -rf /var/lib/apt/lists/*
   && rm -rf /var/lib/apt/lists/*
 
 
 # Install Docker cli
 # Install Docker cli
 RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
 RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
   && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
   && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
-  && apt update && apt install -y docker-ce-cli \
+  && apt-get update && apt-get install -y docker-ce-cli \
   && rm -rf /var/lib/apt/lists/*
   && rm -rf /var/lib/apt/lists/*
 
 
 # Install VS Code
 # Install VS Code
@@ -41,15 +36,43 @@ RUN wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor
   && install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg \
   && install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg \
   && echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null \
   && echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null \
   && rm -f packages.microsoft.gpg \
   && rm -f packages.microsoft.gpg \
-  && apt update && apt install -y code \
+  && apt-get update && apt-get install -y code \
   && rm -rf /var/lib/apt/lists/*
   && rm -rf /var/lib/apt/lists/*
 
 
 WORKDIR /roo
 WORKDIR /roo
 
 
-# Install rust
-ARG RUST_VERSION=1.87.0
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_VERSION} \
-  && echo 'source $HOME/.cargo/env' >> $HOME/.bashrc
+# Install mise (https://mise.jdx.dev) for language runtime management
+RUN curl https://mise.run | sh \
+  && /root/.local/bin/mise --version
+
+# Set up mise environment
+ENV MISE_DATA_DIR="/root/.local/share/mise"
+ENV PATH="/root/.local/share/mise/shims:/root/.local/bin:$PATH"
+
+# Define language runtime versions (matching setup.sh)
+ARG NODE_VERSION=20.19.2
+ARG PYTHON_VERSION=3.13.2
+ARG GO_VERSION=1.24.2
+ARG RUST_VERSION=1.85.1
+ARG JAVA_VERSION=openjdk-17
+ARG UV_VERSION=0.7.11
+
+# Install language runtimes via mise
+RUN mise use --global node@${NODE_VERSION} \
+  && mise use --global python@${PYTHON_VERSION} \
+  && mise use --global go@${GO_VERSION} \
+  && mise use --global rust@${RUST_VERSION} \
+  && mise use --global java@${JAVA_VERSION} \
+  && mise use --global uv@${UV_VERSION} \
+  && mise reshim
+
+# Verify installations
+RUN node --version && python --version && go version && rustc --version && java --version && uv --version
+
+# Install pnpm (after node is available from mise)
+ENV PNPM_HOME="/root/.local/share/pnpm"
+ENV PATH="$PNPM_HOME:$PATH"
+RUN npm install -g pnpm npm-run-all
 
 
 # Install VS Code extensions
 # Install VS Code extensions
 ARG GOLANG_EXT_VERSION=0.46.1
 ARG GOLANG_EXT_VERSION=0.46.1
@@ -72,17 +95,20 @@ RUN git clone ${EVALS_REPO_URL} evals \
   && cd evals \
   && cd evals \
   && git checkout ${EVALS_COMMIT}
   && git checkout ${EVALS_COMMIT}
 
 
-# Install uv and sync python dependencies
-ARG UV_VERSION=0.7.11
+# Pre-warm Gradle wrapper cache (./gradlew downloads its own Gradle regardless of system install).
+# Find a Java project with gradlew and run it to cache the distribution.
+RUN find /roo/evals -name "gradlew" -type f | head -1 | xargs -I {} sh -c 'cd $(dirname {}) && ./gradlew --version'
+
+# Sync python dependencies for evals
 WORKDIR /roo/evals/python
 WORKDIR /roo/evals/python
-RUN curl -LsSf https://github.com/astral-sh/uv/releases/download/${UV_VERSION}/uv-installer.sh | sh \
-  && /root/.local/bin/uv sync
+RUN uv sync
 
 
 WORKDIR /roo/repo
 WORKDIR /roo/repo
 
 
 # Install npm packages
 # Install npm packages
 RUN mkdir -p \
 RUN mkdir -p \
   scripts \
   scripts \
+  apps/cli \
   packages/build \
   packages/build \
   packages/config-eslint \
   packages/config-eslint \
   packages/config-typescript \
   packages/config-typescript \
@@ -92,6 +118,7 @@ RUN mkdir -p \
   packages/telemetry \
   packages/telemetry \
   packages/types \
   packages/types \
   packages/cloud \
   packages/cloud \
+  packages/vscode-shim \
   src \
   src \
   webview-ui
   webview-ui
 
 
@@ -99,6 +126,7 @@ COPY ./package.json                            ./
 COPY ./pnpm-lock.yaml                          ./
 COPY ./pnpm-lock.yaml                          ./
 COPY ./pnpm-workspace.yaml                     ./
 COPY ./pnpm-workspace.yaml                     ./
 COPY ./scripts/bootstrap.mjs                   ./scripts/
 COPY ./scripts/bootstrap.mjs                   ./scripts/
+COPY ./apps/cli/package.json                   ./apps/cli/
 COPY ./packages/build/package.json             ./packages/build/
 COPY ./packages/build/package.json             ./packages/build/
 COPY ./packages/config-eslint/package.json     ./packages/config-eslint/
 COPY ./packages/config-eslint/package.json     ./packages/config-eslint/
 COPY ./packages/config-typescript/package.json ./packages/config-typescript/
 COPY ./packages/config-typescript/package.json ./packages/config-typescript/
@@ -108,6 +136,7 @@ COPY ./packages/ipc/package.json               ./packages/ipc/
 COPY ./packages/telemetry/package.json         ./packages/telemetry/
 COPY ./packages/telemetry/package.json         ./packages/telemetry/
 COPY ./packages/types/package.json             ./packages/types/
 COPY ./packages/types/package.json             ./packages/types/
 COPY ./packages/cloud/package.json             ./packages/cloud/
 COPY ./packages/cloud/package.json             ./packages/cloud/
+COPY ./packages/vscode-shim/package.json       ./packages/vscode-shim/
 COPY ./src/package.json                        ./src/
 COPY ./src/package.json                        ./src/
 COPY ./webview-ui/package.json                 ./webview-ui/
 COPY ./webview-ui/package.json                 ./webview-ui/
 
 
@@ -128,10 +157,15 @@ COPY packages/evals/.env.local ./packages/evals/
 # Copy the pre-installed VS Code extensions
 # Copy the pre-installed VS Code extensions
 RUN cp -r /roo/.vscode-template /roo/.vscode
 RUN cp -r /roo/.vscode-template /roo/.vscode
 
 
-# Build the Roo Code extension
+# Build the Roo Code extension (for VSCode execution method)
 RUN pnpm vsix -- --out ../bin/roo-code.vsix \
 RUN pnpm vsix -- --out ../bin/roo-code.vsix \
     && yes | code --no-sandbox --user-data-dir /roo/.vscode --install-extension bin/roo-code.vsix
     && yes | code --no-sandbox --user-data-dir /roo/.vscode --install-extension bin/roo-code.vsix
 
 
+# Build the extension bundle and CLI (for CLI execution method)
+# The CLI requires the extension bundle (src/dist/extension.js) and the CLI build (apps/cli/dist/index.js)
+RUN pnpm --filter roo-cline bundle \
+    && pnpm --filter @roo-code/cli build
+
 # Copy entrypoint script
 # Copy entrypoint script
 COPY packages/evals/.docker/entrypoints/runner.sh /usr/local/bin/entrypoint.sh
 COPY packages/evals/.docker/entrypoints/runner.sh /usr/local/bin/entrypoint.sh
 RUN chmod +x /usr/local/bin/entrypoint.sh
 RUN chmod +x /usr/local/bin/entrypoint.sh

+ 1 - 1
packages/evals/src/cli/messageLogDeduper.test.ts → packages/evals/src/cli/__tests__/messageLogDeduper.test.ts

@@ -1,4 +1,4 @@
-import { MessageLogDeduper } from "./messageLogDeduper.js"
+import { MessageLogDeduper } from "../messageLogDeduper.js"
 
 
 describe("MessageLogDeduper", () => {
 describe("MessageLogDeduper", () => {
 	it("dedupes identical messages for same action+ts", () => {
 	it("dedupes identical messages for same action+ts", () => {

+ 1 - 1
packages/evals/src/cli/index.ts

@@ -6,7 +6,7 @@ import { EVALS_REPO_PATH } from "../exercises/index.js"
 
 
 import { runCi } from "./runCi.js"
 import { runCi } from "./runCi.js"
 import { runEvals } from "./runEvals.js"
 import { runEvals } from "./runEvals.js"
-import { processTask } from "./runTask.js"
+import { processTask } from "./processTask.js"
 
 
 const main = async () => {
 const main = async () => {
 	await run(
 	await run(

+ 150 - 0
packages/evals/src/cli/processTask.ts

@@ -0,0 +1,150 @@
+import { execa } from "execa"
+
+import { type TaskEvent, RooCodeEventName } from "@roo-code/types"
+
+import { findRun, findTask, updateTask } from "../db/index.js"
+
+import { Logger, getTag, isDockerContainer } from "./utils.js"
+import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
+import { runUnitTest } from "./runUnitTest.js"
+import { runTaskWithCli } from "./runTaskInCli.js"
+import { runTaskInVscode } from "./runTaskInVscode.js"
+
+/**
+ * Process one eval task end to end: execute it with the run's configured
+ * execution method ("cli" or, by default, "vscode"), run the exercise's unit
+ * tests, store the pass/fail result on the task, and publish an
+ * EvalPass / EvalFail event on the run's pub/sub channel.
+ *
+ * The runner is registered before execution and is always deregistered
+ * afterwards, including when execution or testing throws.
+ *
+ * @param taskId - ID of the task to process.
+ * @param jobToken - Token handed through to the task runner; may be null.
+ * @param logger - Optional logger. When omitted, a per-task file logger is
+ *   created under the run's log directory.
+ */
+export const processTask = async ({
+	taskId,
+	jobToken,
+	logger,
+}: {
+	taskId: number
+	jobToken: string | null
+	logger?: Logger
+}) => {
+	const task = await findTask(taskId)
+	const { language, exercise } = task
+	const run = await findRun(task.runId)
+
+	// Run timeout is stored in minutes (falling back to 5); the registry wants seconds.
+	const timeoutSeconds = (run.timeout || 5) * 60
+	await registerRunner({ runId: run.id, taskId, timeoutSeconds })
+
+	const containerized = isDockerContainer()
+
+	if (!logger) {
+		// Inside the container the log volume is mounted at /var/log/evals.
+		const logDir = containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`
+
+		logger = new Logger({
+			logDir,
+			filename: `${language}-${exercise}.log`,
+			tag: getTag("runTask", { run, task }),
+		})
+	}
+
+	// Broadcast a task event on this run's Redis pub/sub channel.
+	const publish = async (e: TaskEvent) => {
+		const redis = await redisClient()
+		await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
+	}
+
+	try {
+		const executionMethod = run.executionMethod || "vscode"
+		logger.info(`running task ${task.id} (${language}/${exercise}) via ${executionMethod}...`)
+
+		// Anything other than "cli" is executed through VS Code.
+		const executeTask = executionMethod === "cli" ? runTaskWithCli : runTaskInVscode
+		await executeTask({ run, task, jobToken, publish, logger })
+
+		logger.info(`testing task ${task.id} (${language}/${exercise})...`)
+		const passed = await runUnitTest({ task, logger })
+
+		logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
+		await updateTask(task.id, { passed })
+
+		await publish({
+			eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail,
+			taskId: task.id,
+		})
+	} finally {
+		await deregisterRunner({ runId: run.id, taskId })
+	}
+}
+
+/**
+ * Run a single eval task inside a fresh `evals-runner` Docker container,
+ * retrying with jittered exponential backoff when the container exits with an
+ * error.
+ *
+ * Returns as soon as one attempt succeeds. When every attempt fails the failure
+ * is only logged; nothing is thrown and the task is not marked as failed.
+ *
+ * @param taskId - ID of the task to run; passed to the in-container CLI via `--taskId`.
+ * @param jobToken - When set, exposed to the container as `ROO_CODE_CLOUD_TOKEN`.
+ * @param logger - Logger used for progress and failure messages.
+ * @param maxRetries - Number of retries after the first attempt (default 10),
+ *   i.e. at most `maxRetries + 1` containers are started.
+ */
+export const processTaskInContainer = async ({
+	taskId,
+	jobToken,
+	logger,
+	maxRetries = 10,
+}: {
+	taskId: number
+	jobToken: string | null
+	logger: Logger
+	maxRetries?: number
+}) => {
+	const baseArgs = [
+		"--rm",
+		"--network evals_default",
+		"-v /var/run/docker.sock:/var/run/docker.sock",
+		"-v /tmp/evals:/var/log/evals",
+		"-e HOST_EXECUTION_METHOD=docker",
+	]
+
+	if (jobToken) {
+		baseArgs.push(`-e ROO_CODE_CLOUD_TOKEN=${jobToken}`)
+	}
+
+	// Pass API keys to the container so the CLI can authenticate
+	const apiKeyEnvVars = [
+		"OPENROUTER_API_KEY",
+		"ANTHROPIC_API_KEY",
+		"OPENAI_API_KEY",
+		"GOOGLE_API_KEY",
+		"DEEPSEEK_API_KEY",
+		"MISTRAL_API_KEY",
+	]
+
+	// Forward each key by NAME only (`-e VAR`). Docker then copies the value from
+	// the environment of the `docker` process, which inherits ours. This keeps the
+	// secret out of the shell command string, so values containing whitespace or
+	// shell metacharacters cannot break (or inject into) the command, and the key
+	// does not show up in the host's process list.
+	for (const envVar of apiKeyEnvVars) {
+		if (process.env[envVar]) {
+			baseArgs.push(`-e ${envVar}`)
+		}
+	}
+
+	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
+	logger.info(command)
+
+	for (let attempt = 0; attempt <= maxRetries; attempt++) {
+		// Each attempt gets its own container name so a retry never collides with
+		// a previous attempt's container.
+		const containerName = `evals-task-${taskId}.${attempt}`
+		const args = [`--name ${containerName}`, `-e EVALS_ATTEMPT=${attempt}`, ...baseArgs]
+		const isRetry = attempt > 0
+
+		if (isRetry) {
+			// Exponential backoff (1s, 2s, 4s, ...) scaled by a random factor in [0.5, 1.5).
+			const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
+			logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
+			await new Promise((resolve) => setTimeout(resolve, delayMs))
+		}
+
+		logger.info(
+			`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
+		)
+
+		const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
+		// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
+		// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
+
+		try {
+			const result = await subprocess
+			logger.info(`container process completed with exit code: ${result.exitCode}`)
+			return
+		} catch (error) {
+			if (error && typeof error === "object" && "exitCode" in error) {
+				logger.error(
+					`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
+				)
+			} else {
+				logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
+			}
+
+			if (attempt === maxRetries) {
+				break
+			}
+		}
+	}
+
+	logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
+
+	// TODO: Mark task as failed.
+}

+ 8 - 3
packages/evals/src/cli/runEvals.ts

@@ -5,7 +5,7 @@ import { EVALS_REPO_PATH } from "../exercises/index.js"
 
 
 import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
 import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
 import { startHeartbeat, stopHeartbeat } from "./redis.js"
 import { startHeartbeat, stopHeartbeat } from "./redis.js"
-import { processTask, processTaskInContainer } from "./runTask.js"
+import { processTask, processTaskInContainer } from "./processTask.js"
 
 
 export const runEvals = async (runId: number) => {
 export const runEvals = async (runId: number) => {
 	const run = await findRun(runId)
 	const run = await findRun(runId)
@@ -53,13 +53,18 @@ export const runEvals = async (runId: number) => {
 	}
 	}
 
 
 	try {
 	try {
-		// Add tasks with staggered start times when concurrency > 1
+		// Add tasks with staggered start times when concurrency > 1.
 		for (let i = 0; i < filteredTasks.length; i++) {
 		for (let i = 0; i < filteredTasks.length; i++) {
 			const task = filteredTasks[i]
 			const task = filteredTasks[i]
-			if (!task) continue
+
+			if (!task) {
+				continue
+			}
+
 			if (run.concurrency > 1 && i > 0) {
 			if (run.concurrency > 1 && i > 0) {
 				await new Promise((resolve) => setTimeout(resolve, STAGGER_DELAY_MS))
 				await new Promise((resolve) => setTimeout(resolve, STAGGER_DELAY_MS))
 			}
 			}
+
 			queue.add(createTaskRunner(task))
 			queue.add(createTaskRunner(task))
 		}
 		}
 
 

+ 313 - 0
packages/evals/src/cli/runTaskInCli.ts

@@ -0,0 +1,313 @@
+import * as fs from "fs"
+import * as path from "path"
+import * as os from "node:os"
+
+import pWaitFor from "p-wait-for"
+import { execa } from "execa"
+
+import { type ToolUsage, TaskCommandName, RooCodeEventName, IpcMessageType } from "@roo-code/types"
+import { IpcClient } from "@roo-code/ipc"
+
+import { updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
+
+import { type RunTaskOptions } from "./types.js"
+import { mergeToolUsage, waitForSubprocessWithTimeout } from "./utils.js"
+
+/**
+ * Run a task using the Roo Code CLI (headless mode).
+ * Uses the same IPC protocol as VSCode since the CLI loads the same extension bundle.
+ *
+ * Flow:
+ * 1. Spawn `pnpm --filter @roo-code/cli start ...` with the exercise workspace and
+ *    the language prompt, pointing the CLI at a per-run/per-task IPC socket path.
+ * 2. Stream the CLI's stdout/stderr into the logger, one complete line at a time.
+ * 3. Connect to the CLI's IPC socket (initial 5s delay, then up to 10 attempts).
+ * 4. Create the task metrics row up front and keep it updated from IPC events.
+ * 5. Wait until the task completes, aborts, the client disconnects, or the run
+ *    timeout (`run.timeout` minutes, default 5) elapses.
+ * 6. Record `finishedAt`, close/disconnect, abort the subprocess and close the logger.
+ *
+ * @param run - Run record; supplies timeout and provider/model/mode settings.
+ * @param task - Task record; supplies language and exercise.
+ * @param publish - Called with every IPC task event except Message events.
+ * @param logger - Receives progress logs and the raw CLI output; closed before returning.
+ * @param jobToken - When set, passed to the CLI as `ROO_CODE_CLOUD_TOKEN`.
+ * @throws Error when the IPC socket cannot be reached, or when waiting ends without
+ *   a completion event or timeout and the subprocess did not exit with code 0.
+ */
+export const runTaskWithCli = async ({ run, task, publish, logger, jobToken }: RunTaskOptions) => {
+	const { language, exercise } = task
+	const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8")
+	const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise)
+	const ipcSocketPath = path.resolve(os.tmpdir(), `evals-cli-${run.id}-${task.id}.sock`)
+
+	// The CLI inherits our environment plus the socket path it should serve IPC on.
+	const env: Record<string, string> = {
+		...(process.env as Record<string, string>),
+		ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath,
+	}
+
+	if (jobToken) {
+		env.ROO_CODE_CLOUD_TOKEN = jobToken
+	}
+
+	// Aborting this controller cancels the CLI subprocess (see the end of this function).
+	const controller = new AbortController()
+	const cancelSignal = controller.signal
+
+	const cliArgs = [
+		"--filter",
+		"@roo-code/cli",
+		"start",
+		"--yes",
+		"--exit-on-complete",
+		"--reasoning-effort",
+		"disabled",
+		"--workspace",
+		workspacePath,
+	]
+
+	if (run.settings?.mode) {
+		cliArgs.push("-M", run.settings.mode)
+	}
+
+	if (run.settings?.apiProvider) {
+		cliArgs.push("-p", run.settings.apiProvider)
+	}
+
+	// Prefer the generic model id; fall back to the OpenRouter-specific one.
+	const modelId = run.settings?.apiModelId || run.settings?.openRouterModelId
+
+	if (modelId) {
+		cliArgs.push("-m", modelId)
+	}
+
+	// The prompt is the final positional argument.
+	cliArgs.push(prompt)
+
+	logger.info(`CLI command: pnpm ${cliArgs.join(" ")}`)
+
+	const subprocess = execa("pnpm", cliArgs, { env, cancelSignal, cwd: process.cwd() })
+
+	// Buffer for accumulating streaming output until we have complete lines.
+	let stdoutBuffer = ""
+	let stderrBuffer = ""
+
+	// Track subprocess exit code - with -x flag the CLI exits immediately after task completion.
+	let subprocessExitCode: number | null = null
+
+	// Pipe CLI stdout/stderr to the logger for easier debugging.
+	// Buffer output and only log complete lines to avoid fragmented token-by-token logging.
+	// Use logger.raw() to output without the verbose prefix (timestamp, tag, etc).
+	subprocess.stdout?.on("data", (data: Buffer) => {
+		stdoutBuffer += data.toString()
+		const lines = stdoutBuffer.split("\n")
+
+		// Keep the last incomplete line in the buffer.
+		stdoutBuffer = lines.pop() || ""
+
+		// Log all complete lines without the verbose prefix.
+		for (const line of lines) {
+			if (line.trim()) {
+				logger.raw(line)
+			}
+		}
+	})
+
+	subprocess.stderr?.on("data", (data: Buffer) => {
+		stderrBuffer += data.toString()
+		const lines = stderrBuffer.split("\n")
+
+		// Keep the last incomplete line in the buffer.
+		stderrBuffer = lines.pop() || ""
+
+		// Log all complete lines without the verbose prefix.
+		for (const line of lines) {
+			if (line.trim()) {
+				logger.raw(line)
+			}
+		}
+	})
+
+	// Log any remaining buffered output when the subprocess exits.
+	subprocess.on("exit", (code) => {
+		subprocessExitCode = code
+
+		if (stdoutBuffer.trim()) {
+			logger.raw(stdoutBuffer)
+		}
+
+		if (stderrBuffer.trim()) {
+			logger.raw(stderrBuffer)
+		}
+	})
+
+	// Give CLI some time to start and create IPC server.
+	await new Promise((resolve) => setTimeout(resolve, 5_000))
+
+	let client: IpcClient | undefined = undefined
+	let attempts = 10 // More attempts for CLI startup.
+
+	// Each attempt waits up to 2s for the client to become ready, then pauses 1s before retrying.
+	while (true) {
+		try {
+			client = new IpcClient(ipcSocketPath)
+			await pWaitFor(() => client!.isReady, { interval: 500, timeout: 2_000 })
+			break
+		} catch (_error) {
+			client?.disconnect()
+			attempts--
+
+			if (attempts <= 0) {
+				// NOTE(review): the CLI subprocess is not cancelled on this path - confirm callers clean it up.
+				logger.error(`unable to connect to IPC socket -> ${ipcSocketPath}`)
+				throw new Error("Unable to connect to CLI IPC socket.")
+			}
+
+			// Wait a bit before retrying.
+			await new Promise((resolve) => setTimeout(resolve, 1_000))
+		}
+	}
+
+	// For CLI mode, we need to create taskMetrics immediately because the CLI starts
+	// the task right away (from command line args). By the time we connect to IPC,
+	// the TaskStarted event may have already been sent and missed.
+	// This is different from VSCode mode where we send StartNewTask via IPC and can
+	// reliably receive TaskStarted.
+	const taskMetrics = await createTaskMetrics({
+		cost: 0,
+		tokensIn: 0,
+		tokensOut: 0,
+		tokensContext: 0,
+		duration: 0,
+		cacheWrites: 0,
+		cacheReads: 0,
+	})
+
+	await updateTask(task.id, { taskMetricsId: taskMetrics.id, startedAt: new Date() })
+	logger.info(`created taskMetrics with id ${taskMetrics.id}`)
+
+	// The rest of the logic handles IPC events for metrics updates.
+	// taskStartedAt defaults to "now" and is reset if a TaskStarted event does arrive.
+	let taskStartedAt = Date.now()
+	let taskFinishedAt: number | undefined
+	let taskAbortedAt: number | undefined
+	let taskTimedOut: boolean = false
+	const taskMetricsId = taskMetrics.id // Already set, no need to wait for TaskStarted.
+	let rooTaskId: string | undefined
+	let isClientDisconnected = false
+	const accumulatedToolUsage: ToolUsage = {}
+
+	// For CLI mode, we don't need verbose IPC message logging since we're logging stdout instead.
+	// We only track what's needed for metrics and task state management.
+	const ignoreEventsForBroadcast = [RooCodeEventName.Message]
+	let isApiUnstable = false
+
+	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
+		const { eventName, payload } = taskEvent
+
+		// Track API instability for retry logic.
+		if (
+			eventName === RooCodeEventName.Message &&
+			payload[0].message.say &&
+			["api_req_retry_delayed", "api_req_retried"].includes(payload[0].message.say)
+		) {
+			isApiUnstable = true
+		}
+
+		// Publish events to Redis (except Message events) for the web UI.
+		if (!ignoreEventsForBroadcast.includes(eventName)) {
+			await publish({ ...taskEvent, taskId: task.id })
+		}
+
+		// Handle task lifecycle events.
+		// For CLI mode, we already created taskMetrics before connecting to IPC,
+		// but we still want to capture the rooTaskId from TaskStarted if we receive it.
+		if (eventName === RooCodeEventName.TaskStarted) {
+			taskStartedAt = Date.now()
+			rooTaskId = payload[0]
+			logger.info(`received TaskStarted event, rooTaskId: ${rooTaskId}`)
+		}
+
+		if (eventName === RooCodeEventName.TaskToolFailed) {
+			const [_taskId, toolName, error] = payload
+			await createToolError({ taskId: task.id, toolName, error })
+		}
+
+		if (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) {
+			// In CLI mode, taskMetricsId is always set before we register event handlers.
+			const duration = Date.now() - taskStartedAt
+
+			const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } =
+				payload[1]
+
+			// Tool usage (payload[2]) may be absent; merge whatever arrived into the running totals.
+			const incomingToolUsage: ToolUsage = payload[2] ?? {}
+			mergeToolUsage(accumulatedToolUsage, incomingToolUsage)
+
+			await updateTaskMetrics(taskMetricsId, {
+				cost: totalCost,
+				tokensIn: totalTokensIn,
+				tokensOut: totalTokensOut,
+				tokensContext: contextTokens,
+				duration,
+				cacheWrites: totalCacheWrites ?? 0,
+				cacheReads: totalCacheReads ?? 0,
+				toolUsage: accumulatedToolUsage,
+			})
+		}
+
+		if (eventName === RooCodeEventName.TaskAborted) {
+			taskAbortedAt = Date.now()
+		}
+
+		if (eventName === RooCodeEventName.TaskCompleted) {
+			taskFinishedAt = Date.now()
+		}
+	})
+
+	client.on(IpcMessageType.Disconnect, async () => {
+		logger.info(`disconnected from IPC socket -> ${ipcSocketPath}`)
+		isClientDisconnected = true
+		// Note: In CLI mode, we don't need to resolve taskMetricsReady since
+		// taskMetrics is created synchronously before event handlers are registered.
+	})
+
+	// Note: We do NOT send StartNewTask via IPC here because the CLI already
+	// starts the task from its command line arguments. The IPC connection is
+	// only used to receive events (TaskStarted, TaskCompleted, etc.) and metrics.
+	// Sending StartNewTask here would start a SECOND task.
+
+	try {
+		// run.timeout is in minutes (default 5).
+		const timeoutMs = (run.timeout || 5) * 60 * 1_000
+
+		// Poll once per second until the task completes, aborts, or the IPC client disconnects.
+		await pWaitFor(() => !!taskFinishedAt || !!taskAbortedAt || isClientDisconnected, {
+			interval: 1_000,
+			timeout: timeoutMs,
+		})
+	} catch (_error) {
+		// Any rejection from the wait above is treated as the time limit being hit.
+		taskTimedOut = true
+		logger.error("time limit reached")
+
+		if (rooTaskId && !isClientDisconnected) {
+			logger.info("cancelling task")
+			client.sendCommand({ commandName: TaskCommandName.CancelTask, data: rooTaskId })
+			await new Promise((resolve) => setTimeout(resolve, 5_000))
+		}
+
+		taskFinishedAt = Date.now()
+	}
+
+	// Reached when the wait ended because of an abort or a disconnect rather than
+	// a TaskCompleted event or a timeout.
+	// NOTE(review): a TaskAborted event with a non-zero/unknown exit code also lands in
+	// the "client disconnected" error branch below - confirm that is the intended message.
+	if (!taskFinishedAt && !taskTimedOut) {
+		// With -x flag, CLI exits immediately after task completion, which can cause
+		// IPC disconnection before we receive the TaskCompleted event.
+		// If subprocess exited cleanly (code 0), treat as successful completion.
+		if (subprocessExitCode === 0) {
+			taskFinishedAt = Date.now()
+			logger.info("subprocess exited cleanly (code 0), treating as task completion")
+		} else {
+			logger.error(`client disconnected before task finished (subprocess exit code: ${subprocessExitCode})`)
+			throw new Error("Client disconnected before task completion.")
+		}
+	}
+
+	logger.info("setting task finished at")
+	await updateTask(task.id, { finishedAt: new Date() })
+
+	if (rooTaskId && !isClientDisconnected) {
+		logger.info("closing task")
+		client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId })
+		await new Promise((resolve) => setTimeout(resolve, 2_000))
+	}
+
+	if (!isClientDisconnected) {
+		logger.info("disconnecting client")
+		client.disconnect()
+	}
+
+	logger.info("waiting for subprocess to finish")
+	controller.abort()
+
+	await waitForSubprocessWithTimeout({ subprocess, logger })
+
+	logger.close()
+
+	// NOTE(review): every non-throwing path above assigns taskFinishedAt (completion
+	// event, timeout, or clean exit), so this condition appears to be unreachable.
+	if (isApiUnstable && !taskFinishedAt) {
+		throw new Error("API is unstable, throwing to trigger a retry.")
+	}
+}

+ 7 - 248
packages/evals/src/cli/runTask.ts → packages/evals/src/cli/runTaskInVscode.ts

@@ -1,5 +1,4 @@
 import * as fs from "fs"
 import * as fs from "fs"
-import * as fsp from "fs/promises"
 import * as path from "path"
 import * as path from "path"
 import * as os from "node:os"
 import * as os from "node:os"
 
 
@@ -7,218 +6,23 @@ import pWaitFor from "p-wait-for"
 import { execa } from "execa"
 import { execa } from "execa"
 
 
 import {
 import {
-	type TaskEvent,
 	type ClineSay,
 	type ClineSay,
+	type ToolUsage,
 	TaskCommandName,
 	TaskCommandName,
 	RooCodeEventName,
 	RooCodeEventName,
 	IpcMessageType,
 	IpcMessageType,
 	EVALS_SETTINGS,
 	EVALS_SETTINGS,
-	type ToolUsage,
 } from "@roo-code/types"
 } from "@roo-code/types"
 import { IpcClient } from "@roo-code/ipc"
 import { IpcClient } from "@roo-code/ipc"
 
 
-import {
-	type Run,
-	type Task,
-	findRun,
-	findTask,
-	updateTask,
-	createTaskMetrics,
-	updateTaskMetrics,
-	createToolError,
-} from "../db/index.js"
+import { updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js"
 import { EVALS_REPO_PATH } from "../exercises/index.js"
 import { EVALS_REPO_PATH } from "../exercises/index.js"
 
 
-import { Logger, getTag, isDockerContainer } from "./utils.js"
-import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
-import { runUnitTest } from "./runUnitTest.js"
+import { type RunTaskOptions } from "./types.js"
+import { isDockerContainer, copyConversationHistory, mergeToolUsage, waitForSubprocessWithTimeout } from "./utils.js"
 import { MessageLogDeduper } from "./messageLogDeduper.js"
 import { MessageLogDeduper } from "./messageLogDeduper.js"
 
 
-class SubprocessTimeoutError extends Error {
-	constructor(timeout: number) {
-		super(`Subprocess timeout after ${timeout}ms`)
-		this.name = "SubprocessTimeoutError"
-	}
-}
-
-/**
- * Copy conversation history files from VS Code extension storage to the log directory.
- * This allows us to preserve the api_conversation_history.json and ui_messages.json
- * files for post-mortem analysis alongside the log files.
- */
-async function copyConversationHistory({
-	rooTaskId,
-	logDir,
-	language,
-	exercise,
-	iteration,
-	logger,
-}: {
-	rooTaskId: string
-	logDir: string
-	language: string
-	exercise: string
-	iteration: number
-	logger: Logger
-}): Promise<void> {
-	// VS Code extension global storage path within the container
-	const extensionStoragePath = "/roo/.vscode/User/globalStorage/rooveterinaryinc.roo-cline"
-	const taskStoragePath = path.join(extensionStoragePath, "tasks", rooTaskId)
-
-	const filesToCopy = ["api_conversation_history.json", "ui_messages.json"]
-
-	for (const filename of filesToCopy) {
-		const sourcePath = path.join(taskStoragePath, filename)
-		// Use sanitized exercise name (replace slashes with dashes) for the destination filename
-		// Include iteration number to handle multiple attempts at the same exercise
-		const sanitizedExercise = exercise.replace(/\//g, "-")
-		const destFilename = `${language}-${sanitizedExercise}.${iteration}_${filename}`
-		const destPath = path.join(logDir, destFilename)
-
-		try {
-			// Check if source file exists
-			await fsp.access(sourcePath)
-
-			// Copy the file
-			await fsp.copyFile(sourcePath, destPath)
-			logger.info(`copied ${filename} to ${destPath}`)
-		} catch (error) {
-			// File may not exist if task didn't complete properly - this is not fatal
-			if ((error as NodeJS.ErrnoException).code === "ENOENT") {
-				logger.info(`${filename} not found at ${sourcePath} - skipping`)
-			} else {
-				logger.error(`failed to copy ${filename}:`, error)
-			}
-		}
-	}
-}
-
-export const processTask = async ({
-	taskId,
-	jobToken,
-	logger,
-}: {
-	taskId: number
-	jobToken: string | null
-	logger?: Logger
-}) => {
-	const task = await findTask(taskId)
-	const { language, exercise } = task
-	const run = await findRun(task.runId)
-	await registerRunner({ runId: run.id, taskId, timeoutSeconds: (run.timeout || 5) * 60 })
-
-	const containerized = isDockerContainer()
-
-	logger =
-		logger ||
-		new Logger({
-			logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
-			filename: `${language}-${exercise}.log`,
-			tag: getTag("runTask", { run, task }),
-		})
-
-	try {
-		const publish = async (e: TaskEvent) => {
-			const redis = await redisClient()
-			await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
-		}
-
-		logger.info(`running task ${task.id} (${language}/${exercise})...`)
-		await runTask({ run, task, jobToken, publish, logger })
-
-		logger.info(`testing task ${task.id} (${language}/${exercise})...`)
-		const passed = await runUnitTest({ task, logger })
-
-		logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
-		await updateTask(task.id, { passed })
-
-		await publish({
-			eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail,
-			taskId: task.id,
-		})
-	} finally {
-		await deregisterRunner({ runId: run.id, taskId })
-	}
-}
-
-export const processTaskInContainer = async ({
-	taskId,
-	jobToken,
-	logger,
-	maxRetries = 10,
-}: {
-	taskId: number
-	jobToken: string | null
-	logger: Logger
-	maxRetries?: number
-}) => {
-	const baseArgs = [
-		"--rm",
-		"--network evals_default",
-		"-v /var/run/docker.sock:/var/run/docker.sock",
-		"-v /tmp/evals:/var/log/evals",
-		"-e HOST_EXECUTION_METHOD=docker",
-	]
-
-	if (jobToken) {
-		baseArgs.push(`-e ROO_CODE_CLOUD_TOKEN=${jobToken}`)
-	}
-
-	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
-	logger.info(command)
-
-	for (let attempt = 0; attempt <= maxRetries; attempt++) {
-		const containerName = `evals-task-${taskId}.${attempt}`
-		const args = [`--name ${containerName}`, `-e EVALS_ATTEMPT=${attempt}`, ...baseArgs]
-		const isRetry = attempt > 0
-
-		if (isRetry) {
-			const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
-			logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
-			await new Promise((resolve) => setTimeout(resolve, delayMs))
-		}
-
-		logger.info(
-			`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
-		)
-
-		const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
-		// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
-		// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
-
-		try {
-			const result = await subprocess
-			logger.info(`container process completed with exit code: ${result.exitCode}`)
-			return
-		} catch (error) {
-			if (error && typeof error === "object" && "exitCode" in error) {
-				logger.error(
-					`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
-				)
-			} else {
-				logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
-			}
-
-			if (attempt === maxRetries) {
-				break
-			}
-		}
-	}
-
-	logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
-
-	// TODO: Mark task as failed.
-}
-
-type RunTaskOptions = {
-	run: Run
-	task: Task
-	jobToken: string | null
-	publish: (taskEvent: TaskEvent) => Promise<void>
-	logger: Logger
-}
-
-export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskOptions) => {
+export const runTaskInVscode = async ({ run, task, publish, logger, jobToken }: RunTaskOptions) => {
 	const { language, exercise } = task
 	const { language, exercise } = task
 	const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8")
 	const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8")
 	const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise)
 	const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise)
@@ -410,24 +214,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 
 
 			// For both TaskTokenUsageUpdated and TaskCompleted: toolUsage is payload[2]
 			// For both TaskTokenUsageUpdated and TaskCompleted: toolUsage is payload[2]
 			const incomingToolUsage: ToolUsage = payload[2] ?? {}
 			const incomingToolUsage: ToolUsage = payload[2] ?? {}
-
-			// Merge incoming tool usage with accumulated data using MAX strategy.
-			// This handles the case where a task is rehydrated after abort:
-			// - Empty rehydrated data won't overwrite existing: max(5, 0) = 5
-			// - Legitimate restart with additional work is captured: max(5, 8) = 8
-			// Each task instance tracks its own cumulative values, so we take the max
-			// to preserve the highest values seen across all instances.
-			for (const [toolName, usage] of Object.entries(incomingToolUsage)) {
-				const existing = accumulatedToolUsage[toolName as keyof ToolUsage]
-				if (existing) {
-					accumulatedToolUsage[toolName as keyof ToolUsage] = {
-						attempts: Math.max(existing.attempts, usage.attempts),
-						failures: Math.max(existing.failures, usage.failures),
-					}
-				} else {
-					accumulatedToolUsage[toolName as keyof ToolUsage] = { ...usage }
-				}
-			}
+			mergeToolUsage(accumulatedToolUsage, incomingToolUsage)
 
 
 			await updateTaskMetrics(taskMetricsId, {
 			await updateTaskMetrics(taskMetricsId, {
 				cost: totalCost,
 				cost: totalCost,
@@ -514,35 +301,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 	logger.info("waiting for subprocess to finish")
 	logger.info("waiting for subprocess to finish")
 	controller.abort()
 	controller.abort()
 
 
-	// Wait for subprocess to finish gracefully, with a timeout.
-	const SUBPROCESS_TIMEOUT = 10_000
-
-	try {
-		await Promise.race([
-			subprocess,
-			new Promise((_, reject) =>
-				setTimeout(() => reject(new SubprocessTimeoutError(SUBPROCESS_TIMEOUT)), SUBPROCESS_TIMEOUT),
-			),
-		])
-
-		logger.info("subprocess finished gracefully")
-	} catch (error) {
-		if (error instanceof SubprocessTimeoutError) {
-			logger.error("subprocess did not finish within timeout, force killing")
-
-			try {
-				if (subprocess.kill("SIGKILL")) {
-					logger.info("SIGKILL sent to subprocess")
-				} else {
-					logger.error("failed to send SIGKILL to subprocess")
-				}
-			} catch (killError) {
-				logger.error("subprocess.kill(SIGKILL) failed:", killError)
-			}
-		} else {
-			throw error
-		}
-	}
+	await waitForSubprocessWithTimeout({ subprocess, logger })
 
 
 	// Copy conversation history files from VS Code extension storage to the log directory
 	// Copy conversation history files from VS Code extension storage to the log directory
 	// for post-mortem analysis. Only do this in containerized mode where we have a known path.
 	// for post-mortem analysis. Only do this in containerized mode where we have a known path.

+ 19 - 0
packages/evals/src/cli/types.ts

@@ -0,0 +1,19 @@
+import { type TaskEvent } from "@roo-code/types"
+
+import type { Run, Task } from "../db/index.js"
+import { Logger } from "./utils.js"
+
+/**
+ * Error signalling that a subprocess did not finish within the allotted time.
+ * The message embeds the timeout in milliseconds, and `name` is set so the
+ * error can be identified in logs as well as via `instanceof`.
+ */
+export class SubprocessTimeoutError extends Error {
+	constructor(timeout: number) {
+		const message = `Subprocess timeout after ${timeout}ms`
+		super(message)
+
+		this.name = "SubprocessTimeoutError"
+	}
+}
+
+/** Arguments shared by the task runner implementations. */
+export type RunTaskOptions = {
+	/** Run record the task belongs to. */
+	run: Run
+	/** Task record to execute. */
+	task: Task
+	/** Optional cloud token for the runner; null when there is none. */
+	jobToken: string | null
+	/** Callback that broadcasts a task event to listeners. */
+	publish: (taskEvent: TaskEvent) => Promise<void>
+	/** Logger that receives the runner's output. */
+	logger: Logger
+}

+ 136 - 1
packages/evals/src/cli/utils.ts

@@ -1,10 +1,15 @@
 import * as fs from "fs"
 import * as fs from "fs"
+import * as fsp from "fs/promises"
 import * as path from "path"
 import * as path from "path"
 
 
-import { execa } from "execa"
+import { execa, type ResultPromise } from "execa"
+
+import type { ToolUsage } from "@roo-code/types"
 
 
 import type { Run, Task } from "../db/index.js"
 import type { Run, Task } from "../db/index.js"
 
 
+import { SubprocessTimeoutError } from "./types.js"
+
 export const getTag = (caller: string, { run, task }: { run: Run; task?: Task }) =>
 export const getTag = (caller: string, { run, task }: { run: Run; task?: Task }) =>
 	task
 	task
 		? `${caller} | pid:${process.pid} | run:${run.id} | task:${task.id} | ${task.language}/${task.exercise}`
 		? `${caller} | pid:${process.pid} | run:${run.id} | task:${task.id} | ${task.language}/${task.exercise}`
@@ -107,6 +112,22 @@ export class Logger {
 		this.info(message, ...args)
 		this.info(message, ...args)
 	}
 	}
 
 
+	/**
+	 * Emit a message exactly as given, with no timestamp, level or tag prefix.
+	 * The message goes to the console and, when a log stream is open, to the log
+	 * file followed by a newline. Intended for streamed CLI output where the
+	 * usual prefix would only add noise. Write failures are reported on stderr
+	 * rather than thrown.
+	 */
+	public raw(message: string): void {
+		try {
+			console.log(message)
+			this.logStream?.write(`${message}\n`)
+		} catch (error) {
+			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
+		}
+	}
+
 	public close(): void {
 	public close(): void {
 		if (this.logStream) {
 		if (this.logStream) {
 			this.logStream.end()
 			this.logStream.end()
@@ -114,3 +135,117 @@ export class Logger {
 		}
 		}
 	}
 	}
 }
 }
+
+/**
+ * Copy conversation history files from VS Code extension storage to the log directory.
+ * This preserves api_conversation_history.json and ui_messages.json alongside the
+ * log files for post-mortem analysis.
+ *
+ * Each file is copied to `<language>-<exercise>.<iteration>_<filename>` inside `logDir`,
+ * with slashes in the exercise name replaced by dashes. The operation is best-effort:
+ * a missing source file is logged at info level and skipped, any other failure is
+ * logged as an error, and nothing is thrown to the caller.
+ *
+ * @param rooTaskId - Task id used as the directory name under the extension's `tasks` storage.
+ * @param logDir - Destination directory for the copies.
+ * @param language - Exercise language, used in the destination filename.
+ * @param exercise - Exercise name, used (sanitized) in the destination filename.
+ * @param iteration - Iteration number, which keeps repeated attempts at the same exercise apart.
+ * @param logger - Logger that receives the outcome of each copy.
+ */
+export async function copyConversationHistory({
+	rooTaskId,
+	logDir,
+	language,
+	exercise,
+	iteration,
+	logger,
+}: {
+	rooTaskId: string
+	logDir: string
+	language: string
+	exercise: string
+	iteration: number
+	logger: Logger
+}): Promise<void> {
+	// VS Code extension global storage path within the container
+	const extensionStoragePath = "/roo/.vscode/User/globalStorage/rooveterinaryinc.roo-cline"
+	const taskStoragePath = path.join(extensionStoragePath, "tasks", rooTaskId)
+
+	const filesToCopy = ["api_conversation_history.json", "ui_messages.json"]
+
+	// The destination prefix is the same for every file, so build it once.
+	// Use sanitized exercise name (replace slashes with dashes) and include the
+	// iteration number to handle multiple attempts at the same exercise.
+	const sanitizedExercise = exercise.replace(/\//g, "-")
+	const destPrefix = `${language}-${sanitizedExercise}.${iteration}_`
+
+	for (const filename of filesToCopy) {
+		const sourcePath = path.join(taskStoragePath, filename)
+		const destPath = path.join(logDir, `${destPrefix}${filename}`)
+
+		// Probe the source on its own so that ENOENT here can only mean "source missing".
+		// A missing destination directory also makes copyFile fail with ENOENT and must
+		// not be reported as a skipped source file.
+		try {
+			await fsp.access(sourcePath)
+		} catch (error) {
+			// File may not exist if task didn't complete properly - this is not fatal
+			if ((error as NodeJS.ErrnoException).code === "ENOENT") {
+				logger.info(`${filename} not found at ${sourcePath} - skipping`)
+			} else {
+				logger.error(`failed to copy ${filename}:`, error)
+			}
+
+			continue
+		}
+
+		try {
+			await fsp.copyFile(sourcePath, destPath)
+			logger.info(`copied ${filename} to ${destPath}`)
+		} catch (error) {
+			logger.error(`failed to copy ${filename}:`, error)
+		}
+	}
+}
+
+/**
+ * Fold `incoming` tool usage into `accumulated`, in place, keeping the larger
+ * value of each counter (MAX strategy).
+ *
+ * Each task instance reports its own cumulative counters, so after an abort and
+ * rehydration the numbers may restart from zero:
+ * - an empty rehydrated report cannot lower what was already seen: max(5, 0) = 5
+ * - a restart that goes on to do more work is still captured: max(5, 8) = 8
+ *
+ * Tools present only in `incoming` are added as shallow copies.
+ *
+ * @param accumulated - Running totals; mutated by this call.
+ * @param incoming - Newly reported usage; left untouched.
+ */
+export function mergeToolUsage(accumulated: ToolUsage, incoming: ToolUsage): void {
+	for (const [name, next] of Object.entries(incoming)) {
+		const key = name as keyof ToolUsage
+		const prev = accumulated[key]
+
+		accumulated[key] = prev
+			? {
+					attempts: Math.max(prev.attempts, next.attempts),
+					failures: Math.max(prev.failures, next.failures),
+				}
+			: { ...next }
+	}
+}
+
+/**
+ * Wait for a subprocess to finish gracefully, with a timeout.
+ * If the subprocess doesn't finish within the timeout, force kill it with SIGKILL.
+ *
+ * The timeout timer is always cleared before returning, so a subprocess that
+ * finishes early does not leave a pending timer keeping the event loop alive.
+ *
+ * @param subprocess - The running subprocess to wait on.
+ * @param timeoutMs - How long to wait, in milliseconds, before force killing (default 10 000).
+ * @param logger - Logger that receives the outcome.
+ * @throws Whatever the subprocess promise rejects with. Only the timeout itself is
+ *         handled here; it results in a SIGKILL attempt and a normal return.
+ */
+export async function waitForSubprocessWithTimeout({
+	subprocess,
+	timeoutMs = 10_000,
+	logger,
+}: {
+	subprocess: ResultPromise
+	timeoutMs?: number
+	logger: Logger
+}): Promise<void> {
+	let timer: ReturnType<typeof setTimeout> | undefined
+
+	const timeout = new Promise<never>((_, reject) => {
+		timer = setTimeout(() => reject(new SubprocessTimeoutError(timeoutMs)), timeoutMs)
+	})
+
+	try {
+		await Promise.race([subprocess, timeout])
+
+		logger.info("subprocess finished gracefully")
+	} catch (error) {
+		// Anything other than our own timeout is the subprocess's failure; let the caller see it.
+		if (!(error instanceof SubprocessTimeoutError)) {
+			throw error
+		}
+
+		logger.error("subprocess did not finish within timeout, force killing")
+
+		try {
+			if (subprocess.kill("SIGKILL")) {
+				logger.info("SIGKILL sent to subprocess")
+			} else {
+				logger.error("failed to send SIGKILL to subprocess")
+			}
+		} catch (killError) {
+			logger.error("subprocess.kill(SIGKILL) failed:", killError)
+		}
+	} finally {
+		// Without this, a subprocess that wins the race leaves the timer pending for up to timeoutMs.
+		clearTimeout(timer)
+	}
+}

+ 1 - 0
packages/evals/src/db/migrations/0006_worried_spectrum.sql

@@ -0,0 +1 @@
+ALTER TABLE "runs" ADD COLUMN "execution_method" text DEFAULT 'vscode' NOT NULL;

+ 479 - 0
packages/evals/src/db/migrations/meta/0006_snapshot.json

@@ -0,0 +1,479 @@
+{
+	"id": "ae1ebc36-8f5b-43e1-8e47-5a63d72ed05f",
+	"prevId": "71b54967-86df-42ec-a200-bfd8dad85069",
+	"version": "7",
+	"dialect": "postgresql",
+	"tables": {
+		"public.runs": {
+			"name": "runs",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "runs_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"task_metrics_id": {
+					"name": "task_metrics_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"model": {
+					"name": "model",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"name": {
+					"name": "name",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"description": {
+					"name": "description",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"contextWindow": {
+					"name": "contextWindow",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"inputPrice": {
+					"name": "inputPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"outputPrice": {
+					"name": "outputPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"cacheWritesPrice": {
+					"name": "cacheWritesPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"cacheReadsPrice": {
+					"name": "cacheReadsPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"settings": {
+					"name": "settings",
+					"type": "jsonb",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"jobToken": {
+					"name": "jobToken",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"pid": {
+					"name": "pid",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"socket_path": {
+					"name": "socket_path",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"execution_method": {
+					"name": "execution_method",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true,
+					"default": "'vscode'"
+				},
+				"concurrency": {
+					"name": "concurrency",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 2
+				},
+				"timeout": {
+					"name": "timeout",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 5
+				},
+				"passed": {
+					"name": "passed",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 0
+				},
+				"failed": {
+					"name": "failed",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 0
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {
+				"runs_task_metrics_id_taskMetrics_id_fk": {
+					"name": "runs_task_metrics_id_taskMetrics_id_fk",
+					"tableFrom": "runs",
+					"tableTo": "taskMetrics",
+					"columnsFrom": ["task_metrics_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.taskMetrics": {
+			"name": "taskMetrics",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "taskMetrics_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"tokens_in": {
+					"name": "tokens_in",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tokens_out": {
+					"name": "tokens_out",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tokens_context": {
+					"name": "tokens_context",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cache_writes": {
+					"name": "cache_writes",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cache_reads": {
+					"name": "cache_reads",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cost": {
+					"name": "cost",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"duration": {
+					"name": "duration",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tool_usage": {
+					"name": "tool_usage",
+					"type": "jsonb",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.tasks": {
+			"name": "tasks",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "tasks_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"run_id": {
+					"name": "run_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"task_metrics_id": {
+					"name": "task_metrics_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"language": {
+					"name": "language",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"exercise": {
+					"name": "exercise",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"iteration": {
+					"name": "iteration",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 1
+				},
+				"passed": {
+					"name": "passed",
+					"type": "boolean",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"started_at": {
+					"name": "started_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"finished_at": {
+					"name": "finished_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {
+				"tasks_language_exercise_iteration_idx": {
+					"name": "tasks_language_exercise_iteration_idx",
+					"columns": [
+						{
+							"expression": "run_id",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "language",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "exercise",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "iteration",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						}
+					],
+					"isUnique": true,
+					"concurrently": false,
+					"method": "btree",
+					"with": {}
+				}
+			},
+			"foreignKeys": {
+				"tasks_run_id_runs_id_fk": {
+					"name": "tasks_run_id_runs_id_fk",
+					"tableFrom": "tasks",
+					"tableTo": "runs",
+					"columnsFrom": ["run_id"],
+					"columnsTo": ["id"],
+					"onDelete": "cascade",
+					"onUpdate": "no action"
+				},
+				"tasks_task_metrics_id_taskMetrics_id_fk": {
+					"name": "tasks_task_metrics_id_taskMetrics_id_fk",
+					"tableFrom": "tasks",
+					"tableTo": "taskMetrics",
+					"columnsFrom": ["task_metrics_id"],
+					"columnsTo": ["id"],
+					"onDelete": "set null",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.toolErrors": {
+			"name": "toolErrors",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "toolErrors_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"run_id": {
+					"name": "run_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"task_id": {
+					"name": "task_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"tool_name": {
+					"name": "tool_name",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"error": {
+					"name": "error",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {
+				"toolErrors_run_id_runs_id_fk": {
+					"name": "toolErrors_run_id_runs_id_fk",
+					"tableFrom": "toolErrors",
+					"tableTo": "runs",
+					"columnsFrom": ["run_id"],
+					"columnsTo": ["id"],
+					"onDelete": "cascade",
+					"onUpdate": "no action"
+				},
+				"toolErrors_task_id_tasks_id_fk": {
+					"name": "toolErrors_task_id_tasks_id_fk",
+					"tableFrom": "toolErrors",
+					"tableTo": "tasks",
+					"columnsFrom": ["task_id"],
+					"columnsTo": ["id"],
+					"onDelete": "cascade",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		}
+	},
+	"enums": {},
+	"schemas": {},
+	"sequences": {},
+	"roles": {},
+	"policies": {},
+	"views": {},
+	"_meta": {
+		"columns": {},
+		"schemas": {},
+		"tables": {}
+	}
+}

+ 7 - 0
packages/evals/src/db/migrations/meta/_journal.json

@@ -43,6 +43,13 @@
 			"when": 1765167049182,
 			"when": 1765167049182,
 			"tag": "0005_strong_skrulls",
 			"tag": "0005_strong_skrulls",
 			"breakpoints": true
 			"breakpoints": true
+		},
+		{
+			"idx": 6,
+			"version": "7",
+			"when": 1767550126096,
+			"tag": "0006_worried_spectrum",
+			"breakpoints": true
 		}
 		}
 	]
 	]
 }
 }

+ 7 - 0
packages/evals/src/db/schema.ts

@@ -5,6 +5,12 @@ import type { RooCodeSettings, ToolName, ToolUsage } from "@roo-code/types"
 
 
 import type { ExerciseLanguage } from "../exercises/index.js"
 import type { ExerciseLanguage } from "../exercises/index.js"
 
 
+/**
+ * ExecutionMethod
+ *
+ * The values allowed in the `execution_method` column of the `runs` table,
+ * whose default is "vscode".
+ */
+
+export type ExecutionMethod = "vscode" | "cli"
+
 /**
 /**
  * runs
  * runs
  */
  */
@@ -24,6 +30,7 @@ export const runs = pgTable("runs", {
 	jobToken: text(),
 	jobToken: text(),
 	pid: integer(),
 	pid: integer(),
 	socketPath: text("socket_path").notNull(),
 	socketPath: text("socket_path").notNull(),
+	executionMethod: text("execution_method").default("vscode").notNull().$type<ExecutionMethod>(),
 	concurrency: integer().default(2).notNull(),
 	concurrency: integer().default(2).notNull(),
 	timeout: integer().default(5).notNull(),
 	timeout: integer().default(5).notNull(),
 	passed: integer().default(0).notNull(),
 	passed: integer().default(0).notNull(),