Browse Source

Add web-evals updates and kill run functionality (#9681)

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Co-authored-by: Matt Rubens <[email protected]>
Co-authored-by: Roo Code <[email protected]>
Hannes Rudolph 3 months ago
parent
commit
d2017c80cf

+ 207 - 0
apps/web-evals/src/actions/__tests__/killRun.spec.ts

@@ -0,0 +1,207 @@
+// npx vitest run src/actions/__tests__/killRun.spec.ts
+
+import { execFileSync } from "child_process"
+
+// Mock child_process so no real docker command runs; each test scripts execFileSync's results
+vi.mock("child_process", () => ({
+	execFileSync: vi.fn(),
+	spawn: vi.fn(),
+}))
+
+// Mock next/cache: revalidatePath becomes a bare spy
+vi.mock("next/cache", () => ({
+	revalidatePath: vi.fn(),
+}))
+
+// Mock redis client: redisClient() resolves to an object whose only method is del (resolving to 1)
+vi.mock("@/lib/server/redis", () => ({
+	redisClient: vi.fn().mockResolvedValue({
+		del: vi.fn().mockResolvedValue(1),
+	}),
+}))
+
+// Mock @roo-code/evals with inert stand-ins for the names listed below
+vi.mock("@roo-code/evals", () => ({
+	createRun: vi.fn(),
+	deleteRun: vi.fn(),
+	createTask: vi.fn(),
+	exerciseLanguages: [],
+	getExercisesForLanguage: vi.fn().mockResolvedValue([]),
+}))
+
+/*
+ * Fake timers are installed once at module load so killRun's 10 second sleep can be
+ * skipped with advanceTimersByTimeAsync. Nothing in this file switches back to real timers.
+ */
+vi.useFakeTimers()
+
+// Import after mocks
+import { killRun } from "../runs"
+
+const mockExecFileSync = execFileSync as ReturnType<typeof vi.fn>
+
+describe("killRun", () => {
+	beforeEach(() => {
+		vi.clearAllMocks()
+	})
+
+	afterEach(() => {
+		vi.clearAllTimers()
+	})
+
+	it("should kill controller first, wait, then kill task containers", async () => {
+		const runId = 123
+
+		// execFileSync is used for all docker commands; the queued results are consumed in call order
+		mockExecFileSync
+			.mockReturnValueOnce("") // docker kill controller
+			.mockReturnValueOnce("evals-task-123-456.0\nevals-task-123-789.1\n") // docker ps
+			.mockReturnValueOnce("") // docker kill evals-task-123-456.0
+			.mockReturnValueOnce("") // docker kill evals-task-123-789.1
+
+		const resultPromise = killRun(runId)
+
+		// Fast-forward past the 10 second sleep
+		await vi.advanceTimersByTimeAsync(10000)
+
+		const result = await resultPromise
+
+		expect(result.success).toBe(true)
+		expect(result.killedContainers).toContain("evals-controller-123")
+		expect(result.killedContainers).toContain("evals-task-123-456.0")
+		expect(result.killedContainers).toContain("evals-task-123-789.1")
+		expect(result.errors).toHaveLength(0)
+
+		// Verify the first execFileSync call was docker kill on the controller
+		expect(mockExecFileSync).toHaveBeenNthCalledWith(
+			1,
+			"docker",
+			["kill", "evals-controller-123"],
+			expect.any(Object),
+		)
+		// Verify the second execFileSync call was docker ps with the run-specific filter
+		expect(mockExecFileSync).toHaveBeenNthCalledWith(
+			2,
+			"docker",
+			["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-123-"],
+			expect.any(Object),
+		)
+	})
+
+	it("should continue killing runners even if controller is not running", async () => {
+		const runId = 456
+
+		mockExecFileSync
+			.mockImplementationOnce(() => {
+				throw new Error("No such container")
+			}) // controller kill fails
+			.mockReturnValueOnce("evals-task-456-100.0\n") // docker ps
+			.mockReturnValueOnce("") // docker kill task
+
+		const resultPromise = killRun(runId)
+		await vi.advanceTimersByTimeAsync(10000)
+		const result = await resultPromise
+
+		expect(result.success).toBe(true)
+		expect(result.killedContainers).toContain("evals-task-456-100.0")
+		// Controller not in list since it failed
+		expect(result.killedContainers).not.toContain("evals-controller-456")
+	})
+
+	it("should clear Redis state after killing containers", async () => {
+		const runId = 789
+
+		const mockDel = vi.fn().mockResolvedValue(1)
+		const { redisClient } = await import("@/lib/server/redis")
+		vi.mocked(redisClient).mockResolvedValue({ del: mockDel } as never)
+
+		mockExecFileSync
+			.mockReturnValueOnce("") // controller kill
+			.mockReturnValueOnce("") // docker ps (no tasks)
+
+		const resultPromise = killRun(runId)
+		await vi.advanceTimersByTimeAsync(10000)
+		await resultPromise
+
+		expect(mockDel).toHaveBeenCalledWith("heartbeat:789")
+		expect(mockDel).toHaveBeenCalledWith("runners:789")
+	})
+
+	it("should handle docker ps failure gracefully", async () => {
+		const runId = 111
+
+		mockExecFileSync
+			.mockReturnValueOnce("") // controller kill succeeds
+			.mockImplementationOnce(() => {
+				throw new Error("Docker error")
+			}) // docker ps fails
+
+		const resultPromise = killRun(runId)
+		await vi.advanceTimersByTimeAsync(10000)
+		const result = await resultPromise
+
+		// Should still be successful because controller was killed
+		expect(result.success).toBe(true)
+		expect(result.killedContainers).toContain("evals-controller-111")
+		expect(result.errors).toContain("Failed to list Docker task containers")
+	})
+
+	it("should handle individual task kill failures", async () => {
+		const runId = 222
+
+		mockExecFileSync
+			.mockReturnValueOnce("") // controller kill
+			.mockReturnValueOnce("evals-task-222-300.0\nevals-task-222-400.0\n") // docker ps
+			.mockImplementationOnce(() => {
+				throw new Error("Kill failed")
+			}) // first task kill fails
+			.mockReturnValueOnce("") // second task kill succeeds
+
+		const resultPromise = killRun(runId)
+		await vi.advanceTimersByTimeAsync(10000)
+		const result = await resultPromise
+
+		expect(result.success).toBe(true)
+		expect(result.killedContainers).toContain("evals-controller-222")
+		expect(result.killedContainers).toContain("evals-task-222-400.0")
+		expect(result.errors.length).toBe(1)
+		expect(result.errors[0]).toContain("evals-task-222-300.0")
+	})
+
+	it("should return success with no containers when nothing is running", async () => {
+		const runId = 333
+
+		mockExecFileSync
+			.mockImplementationOnce(() => {
+				throw new Error("No such container")
+			}) // controller not running
+			.mockReturnValueOnce("") // no task containers
+
+		const resultPromise = killRun(runId)
+		await vi.advanceTimersByTimeAsync(10000)
+		const result = await resultPromise
+
+		expect(result.success).toBe(true)
+		expect(result.killedContainers).toHaveLength(0)
+		expect(result.errors).toHaveLength(0)
+	})
+
+	it("should only kill containers belonging to the specific run", async () => {
+		const runId = 555
+
+		mockExecFileSync
+			.mockReturnValueOnce("") // controller kill
+			.mockReturnValueOnce("evals-task-555-100.0\n") // docker ps
+			.mockReturnValueOnce("") // docker kill task
+
+		const resultPromise = killRun(runId)
+		await vi.advanceTimersByTimeAsync(10000)
+		const result = await resultPromise
+
+		expect(result.success).toBe(true)
+		/*
+		 * Verify execFileSync was called for docker ps with run-specific filter.
+		 * NOTE(review): the mocked docker ps output only contains a name for run 555, so
+		 * killRun's own startsWith() filtering of foreign container names is not exercised
+		 * by this test; only the filter argument is checked.
+		 */
+		expect(mockExecFileSync).toHaveBeenNthCalledWith(
+			2,
+			"docker",
+			["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-555-"],
+			expect.any(Object),
+		)
+	})
+})

+ 99 - 1
apps/web-evals/src/actions/runs.ts

@@ -3,7 +3,7 @@
 import * as path from "path"
 import fs from "fs"
 import { fileURLToPath } from "url"
-import { spawn } from "child_process"
+import { spawn, execFileSync } from "child_process"
 
 import { revalidatePath } from "next/cache"
 import pMap from "p-map"
@@ -18,6 +18,7 @@ import {
 } from "@roo-code/evals"
 
 import { CreateRun } from "@/lib/schemas"
+import { redisClient } from "@/lib/server/redis"
 
 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
@@ -116,3 +117,100 @@ export async function deleteRun(runId: number) {
 	await _deleteRun(runId)
 	revalidatePath("/runs")
 }
+
+export type KillRunResult = {
+	success: boolean // true when at least one container was killed or no error was recorded
+	killedContainers: string[] // names of the containers whose docker kill succeeded
+	errors: string[] // messages for the failures killRun recorded
+}
+
+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) // promise that resolves after ms milliseconds
+
+/**
+ * Kill all Docker containers associated with a run (controller and task runners).
+ * Kills the controller first, waits 10 seconds, then kills runners.
+ * Also clears Redis state for heartbeat and runners.
+ *
+ * Container naming conventions:
+ * - Controller: evals-controller-{runId}
+ * - Task runners: evals-task-{runId}-{taskId}.{attempt}
+ *
+ * Every docker command is issued synchronously through execFileSync with a
+ * 10 second timeout. Failures are collected instead of thrown:
+ * - any failure of the controller kill is only logged and treated as
+ *   "not running"; it is not added to `errors`,
+ * - a failed `docker ps`, a failed task kill, or a failed Redis cleanup each
+ *   append one message to `errors`.
+ * Names returned by `docker ps` are additionally required to start with the
+ * run's task prefix before they are killed.
+ * The run page and the runs list are revalidated before returning, whether or
+ * not anything was killed.
+ *
+ * @param runId - ID of the run; interpolated into the container names and Redis keys.
+ * @returns `killedContainers` holds every container whose `docker kill` succeeded;
+ *   `success` is true when at least one container was killed or no error was recorded.
+ */
+export async function killRun(runId: number): Promise<KillRunResult> {
+	const killedContainers: string[] = []
+	const errors: string[] = []
+	const controllerPattern = `evals-controller-${runId}`
+	const taskPattern = `evals-task-${runId}-`
+
+	try {
+		// Step 1: Kill the controller first
+		console.log(`Killing controller: ${controllerPattern}`)
+		try {
+			execFileSync("docker", ["kill", controllerPattern], { encoding: "utf-8", timeout: 10000 })
+			killedContainers.push(controllerPattern)
+			console.log(`Killed controller container: ${controllerPattern}`)
+		} catch (_error) {
+			// Controller might not be running - that's ok, continue to kill runners
+			console.log(`Controller ${controllerPattern} not running or already stopped`)
+		}
+
+		// Step 2: Wait 10 seconds before killing runners
+		console.log("Waiting 10 seconds before killing runners...")
+		await sleep(10000)
+
+		// Step 3: Find and kill all task runner containers for THIS run only
+		let taskContainerNames: string[] = []
+
+		try {
+			const output = execFileSync("docker", ["ps", "--format", "{{.Names}}", "--filter", `name=${taskPattern}`], {
+				encoding: "utf-8",
+				timeout: 10000,
+			})
+			taskContainerNames = output
+				.split("\n")
+				.map((name) => name.trim())
+				.filter((name) => name.length > 0 && name.startsWith(taskPattern))
+		} catch (error) {
+			console.error("Failed to list task containers:", error)
+			errors.push("Failed to list Docker task containers")
+		}
+
+		// Kill each task runner container
+		for (const containerName of taskContainerNames) {
+			try {
+				execFileSync("docker", ["kill", containerName], { encoding: "utf-8", timeout: 10000 })
+				killedContainers.push(containerName)
+				console.log(`Killed task container: ${containerName}`)
+			} catch (error) {
+				// Container might have already stopped
+				console.error(`Failed to kill container ${containerName}:`, error)
+				errors.push(`Failed to kill container: ${containerName}`)
+			}
+		}
+
+		// Step 4: Clear Redis state
+		try {
+			const redis = await redisClient()
+			const heartbeatKey = `heartbeat:${runId}`
+			const runnersKey = `runners:${runId}`
+
+			await redis.del(heartbeatKey)
+			await redis.del(runnersKey)
+			console.log(`Cleared Redis keys: ${heartbeatKey}, ${runnersKey}`)
+		} catch (error) {
+			console.error("Failed to clear Redis state:", error)
+			errors.push("Failed to clear Redis state")
+		}
+	} catch (error) {
+		console.error("Error in killRun:", error)
+		errors.push("Unexpected error while killing containers")
+	}
+
+	revalidatePath(`/runs/${runId}`)
+	revalidatePath("/runs")
+
+	return {
+		success: killedContainers.length > 0 || errors.length === 0,
+		killedContainers,
+		errors,
+	}
+}

+ 1 - 1
apps/web-evals/src/app/runs/[id]/page.tsx

@@ -7,7 +7,7 @@ export default async function Page({ params }: { params: Promise<{ id: string }>
 	const run = await findRun(Number(id))
 
 	return (
-		<div className="max-w-3xl mx-auto px-12 p-12">
+		<div className="w-full px-6 py-12">
 			<Run run={run} />
 		</div>
 	)

+ 74 - 50
apps/web-evals/src/app/runs/[id]/run-status.tsx

@@ -1,55 +1,79 @@
 "use client"
 
+import { Link2, Link2Off, CheckCircle2 } from "lucide-react"
 import type { RunStatus as _RunStatus } from "@/hooks/use-run-status"
 import { cn } from "@/lib/utils"
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui"
 
-export const RunStatus = ({ runStatus: { sseStatus, heartbeat, runners = [] } }: { runStatus: _RunStatus }) => (
-	<div>
-		<div className="flex items-center gap-2">
-			<div className="flex items-center gap-2">
-				<div>Task Stream:</div>
-				<div className="font-mono text-sm text-muted-foreground">{sseStatus}</div>
-			</div>
-			<div className="relative">
-				<div
-					className={cn("absolute size-2.5 rounded-full opacity-50 animate-ping", {
-						"bg-green-500": sseStatus === "connected",
-						"bg-amber-500": sseStatus === "waiting",
-						"bg-rose-500": sseStatus === "error",
-					})}
-				/>
-				<div
-					className={cn("size-2.5 rounded-full", {
-						"bg-green-500": sseStatus === "connected",
-						"bg-amber-500": sseStatus === "waiting",
-						"bg-rose-500": sseStatus === "error",
-					})}
-				/>
-			</div>
-		</div>
-		<div className="flex items-center gap-2">
-			<div className="flex items-center gap-2">
-				<div>Task Controller:</div>
-				<div className="font-mono text-sm text-muted-foreground">{heartbeat ?? "dead"}</div>
-			</div>
-			<div className="relative">
-				<div
-					className={cn("absolute size-2.5 rounded-full opacity-50 animate-ping", {
-						"bg-green-500": !!heartbeat,
-						"bg-rose-500": !heartbeat,
-					})}
-				/>
-				<div
-					className={cn("size-2.5 rounded-full", {
-						"bg-green-500": !!heartbeat,
-						"bg-rose-500": !heartbeat,
-					})}
-				/>
-			</div>
-		</div>
-		<div className="flex items-center gap-2">
-			<div>Task Runners:</div>
-			{runners.length > 0 && <div className="font-mono text-sm text-muted-foreground">{runners?.join(", ")}</div>}
-		</div>
-	</div>
-)
+function StreamIcon({ status }: { status: "connected" | "waiting" | "error" }) { // icon for the task stream status
+	if (status === "connected") { // connected: green link icon
+		return <Link2 className="size-4 text-green-500" />
+	} // any other status: broken-link icon, amber while waiting and rose otherwise
+	return <Link2Off className={cn("size-4", status === "waiting" ? "text-amber-500" : "text-rose-500")} />
+}
+
+export const RunStatus = ({ // compact status indicator; full details are shown in a tooltip
+	runStatus: { sseStatus, heartbeat, runners = [] },
+	isComplete = false, // when true only the completed-run icon is rendered
+}: {
+	runStatus: _RunStatus
+	isComplete?: boolean
+}) => {
+	// For completed runs, show only a check icon whose tooltip reads "Run complete"
+	if (isComplete) {
+		return (
+			<Tooltip>
+				<TooltipTrigger asChild>
+					<div className="flex items-center gap-1 cursor-default text-muted-foreground">
+						<CheckCircle2 className="size-4" />
+					</div>
+				</TooltipTrigger>
+				<TooltipContent side="bottom" className="font-mono text-xs">
+					Run complete
+				</TooltipContent>
+			</Tooltip>
+		)
+	}
+
+	return (
+		<Tooltip>
+			<TooltipTrigger asChild>
+				<div className="flex items-center gap-2 cursor-default text-xs font-mono">
+					{/* Task Stream status icon */}
+					<StreamIcon status={sseStatus} />
+
+					{/* Task Controller heartbeat value, or a dash when there is no heartbeat */}
+					<span className={heartbeat ? "text-green-500" : "text-rose-500"}>{heartbeat ?? "-"}</span>
+
+					{/* Task Runners count; both branches of the ternary render 0r for an empty list */}
+					<span className={runners.length > 0 ? "text-green-500" : "text-rose-500"}>
+						{runners.length > 0 ? `${runners.length}r` : "0r"}
+					</span>
+				</div>
+			</TooltipTrigger>
+			<TooltipContent side="bottom" className="font-mono text-xs max-w-md">
+				<div className="space-y-1">
+					<div className="flex items-center gap-2">
+						<StreamIcon status={sseStatus} />
+						<span>Task Stream: {sseStatus}</span>
+					</div>
+					<div className="flex items-center gap-2">
+						<span className={heartbeat ? "text-green-500" : "text-rose-500"}>●</span>
+						<span>Task Controller: {heartbeat ?? "dead"}</span>
+					</div>
+					<div className="flex items-center gap-2">
+						<span className={runners.length > 0 ? "text-green-500" : "text-rose-500"}>●</span>
+						<span>Task Runners: {runners.length > 0 ? runners.length : "none"}</span>
+					</div>
+					{runners.length > 0 && (
+						<div className="mt-2 pt-2 border-t border-border text-muted-foreground space-y-0.5">
+							{runners.map((runner) => (
+								<div key={runner}>{runner}</div>
+							))}
+						</div>
+					)}
+				</div>
+			</TooltipContent>
+		</Tooltip>
+	)
+}

+ 418 - 141
apps/web-evals/src/app/runs/[id]/run.tsx

@@ -2,12 +2,14 @@
 
 import { useMemo, useState, useCallback, useEffect } from "react"
 import { toast } from "sonner"
-import { LoaderCircle, FileText, Copy, Check } from "lucide-react"
+import { LoaderCircle, FileText, Copy, Check, StopCircle } from "lucide-react"
 
 import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals"
+import type { ToolName } from "@roo-code/types"
 
 import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
 import { useRunStatus } from "@/hooks/use-run-status"
+import { killRun } from "@/actions/runs"
 import {
 	Table,
 	TableBody,
@@ -24,6 +26,14 @@ import {
 	DialogTitle,
 	ScrollArea,
 	Button,
+	AlertDialog,
+	AlertDialogAction,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
 } from "@/components/ui"
 
 import { TaskStatus } from "./task-status"
@@ -51,19 +61,80 @@ type HighlightPattern = {
 }
 
 const HIGHLIGHT_PATTERNS: HighlightPattern[] = [
-	// Timestamps [YYYY-MM-DDTHH:MM:SS.sssZ]
-	{ pattern: /\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)\]/g, className: "text-blue-400" },
-	// Log levels
+	// Log levels - styled as badges
 	{ pattern: /\|\s*(INFO)\s*\|/g, className: "text-green-400", wrapGroup: 1 },
 	{ pattern: /\|\s*(WARN|WARNING)\s*\|/g, className: "text-yellow-400", wrapGroup: 1 },
-	{ pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400", wrapGroup: 1 },
+	{ pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400 font-semibold", wrapGroup: 1 },
 	{ pattern: /\|\s*(DEBUG)\s*\|/g, className: "text-gray-400", wrapGroup: 1 },
-	// Task identifiers
-	{ pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|EvalPass|EvalFail)/g, className: "text-purple-400" },
+	// Task identifiers - important events
+	{
+		pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|taskAborted|taskResumable)/g,
+		className: "text-purple-400 font-medium",
+	},
+	// Tool failures - highlight in red
+	{ pattern: /(taskToolFailed)/g, className: "text-red-400 font-bold" },
+	{ pattern: /(Tool execution failed|tool.*failed|failed.*tool)/gi, className: "text-red-400" },
+	{ pattern: /(EvalPass)/g, className: "text-green-400 font-bold" },
+	{ pattern: /(EvalFail)/g, className: "text-red-400 font-bold" },
 	// Message arrows
 	{ pattern: /→/g, className: "text-cyan-400" },
+	// Tool names in quotes
+	{ pattern: /"(tool)":\s*"([^"]+)"/g, className: "text-orange-400" },
+	// JSON keys
+	{ pattern: /"([^"]+)":/g, className: "text-sky-300" },
+	// Boolean values
+	{ pattern: /:\s*(true|false)/g, className: "text-amber-400", wrapGroup: 1 },
+	// Numbers
+	{ pattern: /:\s*(-?\d+\.?\d*)/g, className: "text-emerald-400", wrapGroup: 1 },
 ]
 
+/**
+ * Format the time between a baseline and a timestamp as zero-padded "MM:SS".
+ *
+ * The timestamp is parsed with `new Date(timestamp)`; nothing is extracted from a
+ * log line here. Minutes are not wrapped into hours, so 61 minutes renders as "61:00".
+ * There is no clamping or validation: a timestamp earlier than the baseline yields
+ * negative components, and an unparseable timestamp yields "NaN:NaN".
+ *
+ * @param timestamp - date string to measure (simplifyLogLine passes an ISO-8601 UTC string)
+ * @param baselineMs - baseline in epoch milliseconds
+ * @returns elapsed whole seconds rendered as "MM:SS"
+ */
+function formatElapsedTime(timestamp: string, baselineMs: number): string {
+	const currentMs = new Date(timestamp).getTime()
+	const elapsedMs = currentMs - baselineMs
+	const totalSeconds = Math.floor(elapsedMs / 1000)
+	const minutes = Math.floor(totalSeconds / 60)
+	const seconds = totalSeconds % 60
+	return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`
+}
+
+/**
+ * Find the first bracketed ISO-8601 UTC timestamp in the log text, for use as a baseline.
+ *
+ * The regex is not anchored, so the first occurrence anywhere in the string is used,
+ * whether or not it sits at the start of a line.
+ *
+ * @param log - full log text
+ * @returns that timestamp in epoch milliseconds, or null when no timestamp is found
+ */
+function extractFirstTimestamp(log: string): number | null {
+	// Match "[" + timestamp followed by whitespace, "|" or "]": [2025-11-28T09:35:23.187Z | ... or [2025-11-28T09:35:23.187Z]
+	const match = log.match(/\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/)
+	const isoString = match?.[1]
+	if (!isoString) return null
+	return new Date(isoString).getTime()
+}
+
+/**
+ * Split a log line into a compact display timestamp and the remaining text, with
+ * metadata that repeats on every line removed.
+ *
+ * Lines that contain no bracketed `[YYYY-MM-DDTHH:MM:SS.sssZ` timestamp are returned
+ * unchanged with an empty timestamp.
+ *
+ * @param line - raw log line
+ * @param baselineMs - epoch milliseconds to measure elapsed time from; when null the
+ *   HH:MM:SS portion of the line's own timestamp is shown instead
+ * @returns `timestamp` is the elapsed "MM:SS" (or HH:MM:SS without a baseline);
+ *   `simplified` is the line minus the timestamp, the pid/run/task fields and "runTask |"
+ */
+function simplifyLogLine(line: string, baselineMs: number | null): { timestamp: string; simplified: string } {
+	// Extract timestamp - matches both [2025-11-28T09:35:23.187Z | ... and [2025-11-28T09:35:23.187Z]
+	const timestampMatch = line.match(/\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/)
+	const isoTimestamp = timestampMatch?.[1]
+	if (!isoTimestamp) {
+		return { timestamp: "", simplified: line }
+	}
+
+	const timestamp = baselineMs !== null ? formatElapsedTime(isoTimestamp, baselineMs) : isoTimestamp.slice(11, 19)
+
+	// Remove the timestamp from the line. The optional separator is either the "|" of the
+	// [timestamp | ... format or the closing "]" of the [timestamp] format; without the "]"
+	// alternative a stray bracket would be left at the start of the simplified text.
+	let simplified = line.replace(/\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s*[|\]]?\s*/, "")
+
+	// Remove redundant metadata: pid, run, task IDs (they're same for entire log)
+	simplified = simplified.replace(/\|\s*pid:\d+\s*/g, "")
+	simplified = simplified.replace(/\|\s*run:\d+\s*/g, "")
+	simplified = simplified.replace(/\|\s*task:\d+\s*/g, "")
+	simplified = simplified.replace(/runTask\s*\|\s*/g, "")
+
+	// Clean up extra pipes, spaces, and trailing brackets
+	simplified = simplified.replace(/\|\s*\|/g, "|")
+	simplified = simplified.replace(/^\s*\|\s*/, "")
+	simplified = simplified.replace(/\]\s*$/, "") // Remove trailing bracket if present
+
+	return { timestamp, simplified }
+}
+
 // Format a single line with syntax highlighting using React elements (XSS-safe)
 function formatLine(line: string): React.ReactNode[] {
 	// Find all matches with their positions
@@ -125,24 +196,83 @@ function formatLine(line: string): React.ReactNode[] {
 	return result.length > 0 ? result : [line]
 }
 
+/**
+ * Pick the background/border utility classes for a log row from the line's content.
+ *
+ * The rules are plain substring tests evaluated top to bottom and the first hit wins,
+ * so a line containing "ERROR" gets the error style even if it also mentions "EvalPass".
+ * The `includes("WARNING")` test is redundant: any line containing "WARNING" already
+ * contains "WARN".
+ *
+ * @param line - log line to classify
+ * @returns the class string for the first matching rule, or "" when none matches
+ */
+function getLineStyle(line: string): string {
+	if (line.includes("ERROR")) return "bg-red-950/30 border-l-2 border-red-500"
+	if (line.includes("WARN") || line.includes("WARNING")) return "bg-yellow-950/20 border-l-2 border-yellow-500"
+	if (line.includes("taskToolFailed")) return "bg-red-950/30 border-l-2 border-red-500"
+	if (line.includes("taskStarted") || line.includes("taskCreated")) return "bg-purple-950/20"
+	if (line.includes("EvalPass")) return "bg-green-950/30 border-l-2 border-green-500"
+	if (line.includes("EvalFail")) return "bg-red-950/30 border-l-2 border-red-500"
+	if (line.includes("taskCompleted") || line.includes("taskAborted")) return "bg-blue-950/20"
+	return ""
+}
+
 // Format log content with basic highlighting (XSS-safe - no dangerouslySetInnerHTML)
 function formatLogContent(log: string): React.ReactNode[] {
 	const lines = log.split("\n")
-	return lines.map((line, index) => (
-		<div key={index} className="hover:bg-white/5">
-			{line ? formatLine(line) : " "}
-		</div>
-	))
+	const baselineMs = extractFirstTimestamp(log)
+
+	return lines.map((line, index) => {
+		if (!line.trim()) {
+			return (
+				<div key={index} className="h-2">
+					{" "}
+				</div>
+			)
+		}
+
+		const parsed = simplifyLogLine(line, baselineMs)
+		const lineStyle = getLineStyle(line)
+
+		return (
+			<div key={index} className={`flex hover:bg-white/10 py-0.5 rounded-sm transition-colors ${lineStyle}`}>
+				{/* Elapsed time */}
+				<span className="text-blue-400 font-mono w-12 flex-shrink-0 tabular-nums text-right pr-2">
+					{parsed.timestamp}
+				</span>
+				{/* Log content - hanging indent (textIndent -0.5rem with paddingLeft 0.5rem) shifts wrapped lines 0.5rem right of the first line */}
+				<span className="flex-1 break-words" style={{ textIndent: "-0.5rem", paddingLeft: "0.5rem" }}>
+					{formatLine(parsed.simplified)}
+				</span>
+			</div>
+		)
+	})
 }
 
 export function Run({ run }: { run: Run }) {
 	const runStatus = useRunStatus(run)
-	const { tasks, tokenUsage, usageUpdatedAt } = runStatus
+	const { tasks, tokenUsage, usageUpdatedAt, heartbeat, runners } = runStatus
 
 	const [selectedTask, setSelectedTask] = useState<Task | null>(null)
 	const [taskLog, setTaskLog] = useState<string | null>(null)
 	const [isLoadingLog, setIsLoadingLog] = useState(false)
 	const [copied, setCopied] = useState(false)
+	const [showKillDialog, setShowKillDialog] = useState(false)
+	const [isKilling, setIsKilling] = useState(false)
+
+	// Determine if run is still active (has heartbeat or runners)
+	const isRunActive = !run.taskMetricsId && (!!heartbeat || (runners && runners.length > 0))
+
+	const onKillRun = useCallback(async () => {
+		setIsKilling(true)
+		try {
+			const result = await killRun(run.id)
+			if (result.killedContainers.length > 0) {
+				toast.success(`Killed ${result.killedContainers.length} container(s)`)
+			} else if (result.errors.length === 0) {
+				toast.info("No running containers found")
+			} else {
+				toast.error(result.errors.join(", "))
+			}
+		} catch (error) {
+			console.error("Failed to kill run:", error)
+			toast.error("Failed to kill run")
+		} finally {
+			setIsKilling(false)
+			setShowKillDialog(false)
+		}
+	}, [run.id])
 
 	const onCopyLog = useCallback(async () => {
 		if (!taskLog) return
@@ -172,9 +302,9 @@ export function Run({ run }: { run: Run }) {
 
 	const onViewTaskLog = useCallback(
 		async (task: Task) => {
-			// Only allow viewing logs for completed tasks
-			if (task.passed === null || task.passed === undefined) {
-				toast.error("Task is still running")
+			// Only allow viewing logs for tasks that have started
+			if (!task.startedAt && !tokenUsage.get(task.id)) {
+				toast.error("Task has not started yet")
 				return
 			}
 
@@ -202,7 +332,7 @@ export function Run({ run }: { run: Run }) {
 				setIsLoadingLog(false)
 			}
 		},
-		[run.id],
+		[run.id, tokenUsage],
 	)
 
 	const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
@@ -228,22 +358,34 @@ export function Run({ run }: { run: Run }) {
 		// eslint-disable-next-line react-hooks/exhaustive-deps
 	}, [tasks, tokenUsage, usageUpdatedAt])
 
+	// Collect all unique tool names from all tasks and sort by total attempts
+	const toolColumns = useMemo<ToolName[]>(() => {
+		if (!tasks) return []
+
+		const toolTotals = new Map<ToolName, number>()
+
+		for (const task of tasks) {
+			if (task.taskMetrics?.toolUsage) {
+				for (const [toolName, usage] of Object.entries(task.taskMetrics.toolUsage)) {
+					const tool = toolName as ToolName
+					const current = toolTotals.get(tool) ?? 0
+					toolTotals.set(tool, current + usage.attempts)
+				}
+			}
+		}
+
+		// Sort by total attempts descending
+		return Array.from(toolTotals.entries())
+			.sort((a, b) => b[1] - a[1])
+			.map(([name]): ToolName => name)
+	}, [tasks])
+
 	// Compute aggregate stats
 	const stats = useMemo(() => {
 		if (!tasks) return null
 
 		const passed = tasks.filter((t) => t.passed === true).length
 		const failed = tasks.filter((t) => t.passed === false).length
-		// Count running tasks exactly like TaskStatus shows spinner:
-		// - passed is not true and not false (null/undefined)
-		// - AND has activity (startedAt or tokenUsage)
-		const running = tasks.filter(
-			(t) => t.passed !== true && t.passed !== false && (t.startedAt || tokenUsage.get(t.id)),
-		).length
-		const pending = tasks.filter(
-			(t) => t.passed !== true && t.passed !== false && !t.startedAt && !tokenUsage.get(t.id),
-		).length
-		const total = tasks.length
 		const completed = passed + failed
 
 		let totalTokensIn = 0
@@ -279,9 +421,6 @@ export function Run({ run }: { run: Run }) {
 		return {
 			passed,
 			failed,
-			running,
-			pending,
-			total,
 			completed,
 			passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null,
 			totalTokensIn,
@@ -293,42 +432,96 @@ export function Run({ run }: { run: Run }) {
 		// eslint-disable-next-line react-hooks/exhaustive-deps
 	}, [tasks, taskMetrics, tokenUsage, usageUpdatedAt])
 
+	// Calculate elapsed time (wall-clock time from run creation to completion or now)
+	const elapsedTime = useMemo(() => {
+		if (!tasks || tasks.length === 0) return null
+
+		const startTime = new Date(run.createdAt).getTime()
+
+		// If run is complete, find the latest finishedAt from tasks
+		if (run.taskMetricsId) {
+			const latestFinish = tasks.reduce((latest, task) => {
+				if (task.finishedAt) {
+					const finishTime = new Date(task.finishedAt).getTime()
+					return finishTime > latest ? finishTime : latest
+				}
+				return latest
+			}, startTime)
+			return latestFinish - startTime
+		}
+
+		// If still running, use current time
+		return Date.now() - startTime
+		// eslint-disable-next-line react-hooks/exhaustive-deps
+	}, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt])
+
 	return (
 		<>
 			<div>
-				<div className="mb-4">
-					<div>
-						<div className="font-mono">{run.model}</div>
-						{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
-					</div>
-					{!run.taskMetricsId && <RunStatus runStatus={runStatus} />}
-				</div>
-
 				{stats && (
-					<div className="mb-4 p-4 border rounded-lg bg-muted/50">
+					<div className="mb-4 p-4 border rounded-lg bg-muted sticky top-0 z-10">
+						{/* Provider, Model title and status */}
+						<div className="flex items-center justify-center gap-3 mb-3 relative">
+							{run.settings?.apiProvider && (
+								<span className="text-sm text-muted-foreground">{run.settings.apiProvider}</span>
+							)}
+							<div className="font-mono">{run.model}</div>
+							<RunStatus runStatus={runStatus} isComplete={!!run.taskMetricsId} />
+							{run.description && (
+								<span className="text-sm text-muted-foreground">- {run.description}</span>
+							)}
+							{isRunActive && (
+								<Tooltip>
+									<TooltipTrigger asChild>
+										<Button
+											variant="ghost"
+											size="sm"
+											onClick={() => setShowKillDialog(true)}
+											disabled={isKilling}
+											className="absolute right-0 flex items-center gap-1 text-muted-foreground hover:text-destructive">
+											{isKilling ? (
+												<LoaderCircle className="size-4 animate-spin" />
+											) : (
+												<StopCircle className="size-4" />
+											)}
+											Kill
+										</Button>
+									</TooltipTrigger>
+									<TooltipContent>Stop all containers for this run</TooltipContent>
+								</Tooltip>
+							)}
+						</div>
 						{/* Main Stats Row */}
-						<div className="flex flex-wrap items-start justify-between gap-x-6 gap-y-3">
+						<div className="flex items-start justify-center gap-x-8 gap-y-3">
 							{/* Passed/Failed */}
-							<div className="text-center">
+							<div className="text-center min-w-[80px]">
 								<div className="text-2xl font-bold whitespace-nowrap">
 									<span className="text-green-600">{stats.passed}</span>
 									<span className="text-muted-foreground mx-1">/</span>
 									<span className="text-red-600">{stats.failed}</span>
-									{stats.running > 0 && (
-										<span className="text-yellow-600 text-sm ml-2">({stats.running})</span>
-									)}
 								</div>
 								<div className="text-xs text-muted-foreground">Passed / Failed</div>
 							</div>
 
 							{/* Pass Rate */}
-							<div className="text-center">
-								<div className="text-2xl font-bold">{stats.passRate ? `${stats.passRate}%` : "-"}</div>
+							<div className="text-center min-w-[80px]">
+								<div
+									className={`text-2xl font-bold ${
+										stats.passRate === null
+											? ""
+											: parseFloat(stats.passRate) === 100
+												? ""
+												: parseFloat(stats.passRate) >= 80
+													? "text-yellow-500"
+													: "text-red-500"
+									}`}>
+									{stats.passRate ? `${stats.passRate}%` : "-"}
+								</div>
 								<div className="text-xs text-muted-foreground">Pass Rate</div>
 							</div>
 
 							{/* Tokens */}
-							<div className="text-center">
+							<div className="text-center min-w-[140px]">
 								<div className="text-xl font-bold font-mono whitespace-nowrap">
 									{formatTokens(stats.totalTokensIn)}
 									<span className="text-muted-foreground mx-1">/</span>
@@ -338,58 +531,64 @@ export function Run({ run }: { run: Run }) {
 							</div>
 
 							{/* Cost */}
-							<div className="text-center">
+							<div className="text-center min-w-[70px]">
 								<div className="text-2xl font-bold font-mono">{formatCurrency(stats.totalCost)}</div>
 								<div className="text-xs text-muted-foreground">Cost</div>
 							</div>
 
 							{/* Duration */}
-							<div className="text-center">
+							<div className="text-center min-w-[90px]">
 								<div className="text-2xl font-bold font-mono whitespace-nowrap">
 									{stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"}
 								</div>
 								<div className="text-xs text-muted-foreground">Duration</div>
 							</div>
 
-							{/* Tool Usage - Inline */}
-							{Object.keys(stats.toolUsage).length > 0 && (
-								<div className="flex items-center gap-2 flex-wrap">
-									{Object.entries(stats.toolUsage)
-										.sort(([, a], [, b]) => b.attempts - a.attempts)
-										.map(([toolName, usage]) => {
-											const abbr = getToolAbbreviation(toolName)
-											const successRate =
-												usage.attempts > 0
-													? ((usage.attempts - usage.failures) / usage.attempts) * 100
-													: 100
-											const rateColor =
-												successRate === 100
-													? "text-green-500"
-													: successRate >= 80
-														? "text-yellow-500"
-														: "text-red-500"
-											return (
-												<Tooltip key={toolName}>
-													<TooltipTrigger asChild>
-														<div className="flex items-center gap-1 px-2 py-1 rounded bg-background/50 border border-border/50 hover:border-border transition-colors cursor-default text-xs">
-															<span className="font-medium text-muted-foreground">
-																{abbr}
-															</span>
-															<span className="font-bold tabular-nums">
-																{usage.attempts}
-															</span>
-															<span className={`${rateColor}`}>
-																{formatToolUsageSuccessRate(usage)}
-															</span>
-														</div>
-													</TooltipTrigger>
-													<TooltipContent side="bottom">{toolName}</TooltipContent>
-												</Tooltip>
-											)
-										})}
+							{/* Elapsed Time */}
+							<div className="text-center min-w-[90px]">
+								<div className="text-2xl font-bold font-mono whitespace-nowrap">
+									{elapsedTime !== null ? formatDuration(elapsedTime) : "-"}
 								</div>
-							)}
+								<div className="text-xs text-muted-foreground">Elapsed</div>
+							</div>
 						</div>
+
+						{/* Tool Usage Row */}
+						{Object.keys(stats.toolUsage).length > 0 && (
+							<div className="flex items-center justify-center gap-2 flex-wrap mt-3">
+								{Object.entries(stats.toolUsage)
+									.sort(([, a], [, b]) => b.attempts - a.attempts)
+									.map(([toolName, usage]) => {
+										const abbr = getToolAbbreviation(toolName)
+										const successRate =
+											usage.attempts > 0
+												? ((usage.attempts - usage.failures) / usage.attempts) * 100
+												: 100
+										const rateColor =
+											successRate === 100
+												? "text-green-500"
+												: successRate >= 80
+													? "text-yellow-500"
+													: "text-red-500"
+										return (
+											<Tooltip key={toolName}>
+												<TooltipTrigger asChild>
+													<div className="flex items-center gap-1 px-2 py-1 rounded bg-background/50 border border-border/50 hover:border-border transition-colors cursor-default text-xs">
+														<span className="font-medium text-muted-foreground">
+															{abbr}
+														</span>
+														<span className="font-bold tabular-nums">{usage.attempts}</span>
+														<span className={`${rateColor}`}>
+															{formatToolUsageSuccessRate(usage)}
+														</span>
+													</div>
+												</TooltipTrigger>
+												<TooltipContent side="bottom">{toolName}</TooltipContent>
+											</Tooltip>
+										)
+									})}
+							</div>
+						)}
 					</div>
 				)}
 				{!tasks ? (
@@ -401,67 +600,104 @@ export function Run({ run }: { run: Run }) {
 								<TableHead>Exercise</TableHead>
 								<TableHead className="text-center">Tokens In / Out</TableHead>
 								<TableHead>Context</TableHead>
+								{toolColumns.map((toolName) => (
+									<TableHead key={toolName} className="text-xs text-center">
+										<Tooltip>
+											<TooltipTrigger>{getToolAbbreviation(toolName)}</TooltipTrigger>
+											<TooltipContent>{toolName}</TooltipContent>
+										</Tooltip>
+									</TableHead>
+								))}
 								<TableHead>Duration</TableHead>
 								<TableHead>Cost</TableHead>
 							</TableRow>
 						</TableHeader>
 						<TableBody>
-							{tasks.map((task) => (
-								<TableRow
-									key={task.id}
-									className={task.finishedAt ? "cursor-pointer hover:bg-muted/50" : ""}
-									onClick={() => task.finishedAt && onViewTaskLog(task)}>
-									<TableCell>
-										<div className="flex items-center gap-2">
-											<TaskStatus
-												task={task}
-												running={!!task.startedAt || !!tokenUsage.get(task.id)}
-											/>
+							{tasks.map((task) => {
+								const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id)
+								return (
+									<TableRow
+										key={task.id}
+										className={`${hasStarted ? "cursor-pointer hover:bg-muted/50" : ""} ${task.passed === false ? "bg-red-950/30 border-l-2 border-l-red-500" : ""}`}
+										onClick={() => hasStarted && onViewTaskLog(task)}>
+										<TableCell>
 											<div className="flex items-center gap-2">
-												<span>
-													{task.language}/{task.exercise}
-													{task.iteration > 1 && (
-														<span className="text-muted-foreground ml-1">
-															(#{task.iteration})
-														</span>
+												<TaskStatus task={task} running={hasStarted} />
+												<div className="flex items-center gap-2">
+													<span>
+														{task.language}/{task.exercise}
+														{task.iteration > 1 && (
+															<span className="text-muted-foreground ml-1">
+																(#{task.iteration})
+															</span>
+														)}
+													</span>
+													{hasStarted && (
+														<Tooltip>
+															<TooltipTrigger asChild>
+																<FileText className="size-3 text-muted-foreground" />
+															</TooltipTrigger>
+															<TooltipContent>Click to view log</TooltipContent>
+														</Tooltip>
 													)}
-												</span>
-												{task.finishedAt && (
-													<Tooltip>
-														<TooltipTrigger asChild>
-															<FileText className="size-3 text-muted-foreground" />
-														</TooltipTrigger>
-														<TooltipContent>Click to view log</TooltipContent>
-													</Tooltip>
-												)}
-											</div>
-										</div>
-									</TableCell>
-									{taskMetrics[task.id] ? (
-										<>
-											<TableCell className="font-mono text-xs">
-												<div className="flex items-center justify-evenly">
-													<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
-													<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
 												</div>
-											</TableCell>
-											<TableCell className="font-mono text-xs">
-												{formatTokens(taskMetrics[task.id]!.tokensContext)}
-											</TableCell>
-											<TableCell className="font-mono text-xs">
-												{taskMetrics[task.id]!.duration
-													? formatDuration(taskMetrics[task.id]!.duration)
-													: "-"}
-											</TableCell>
-											<TableCell className="font-mono text-xs">
-												{formatCurrency(taskMetrics[task.id]!.cost)}
-											</TableCell>
-										</>
-									) : (
-										<TableCell colSpan={4} />
-									)}
-								</TableRow>
-							))}
+											</div>
+										</TableCell>
+										{taskMetrics[task.id] ? (
+											<>
+												<TableCell className="font-mono text-xs">
+													<div className="flex items-center justify-evenly">
+														<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
+														<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
+													</div>
+												</TableCell>
+												<TableCell className="font-mono text-xs">
+													{formatTokens(taskMetrics[task.id]!.tokensContext)}
+												</TableCell>
+												{toolColumns.map((toolName) => {
+													const usage = task.taskMetrics?.toolUsage?.[toolName]
+													const successRate =
+														usage && usage.attempts > 0
+															? ((usage.attempts - usage.failures) / usage.attempts) * 100
+															: 100
+													const rateColor =
+														successRate === 100
+															? "text-muted-foreground"
+															: successRate >= 80
+																? "text-yellow-500"
+																: "text-red-500"
+													return (
+														<TableCell key={toolName} className="text-xs text-center">
+															{usage ? (
+																<div className="flex flex-col items-center">
+																	<span className="font-medium">
+																		{usage.attempts}
+																	</span>
+																	<span className={rateColor}>
+																		{formatToolUsageSuccessRate(usage)}
+																	</span>
+																</div>
+															) : (
+																<span className="text-muted-foreground">-</span>
+															)}
+														</TableCell>
+													)
+												})}
+												<TableCell className="font-mono text-xs">
+													{taskMetrics[task.id]!.duration
+														? formatDuration(taskMetrics[task.id]!.duration)
+														: "-"}
+												</TableCell>
+												<TableCell className="font-mono text-xs">
+													{formatCurrency(taskMetrics[task.id]!.cost)}
+												</TableCell>
+											</>
+										) : (
+											<TableCell colSpan={4 + toolColumns.length} />
+										)}
+									</TableRow>
+								)
+							})}
 						</TableBody>
 					</Table>
 				)}
@@ -479,8 +715,20 @@ export function Run({ run }: { run: Run }) {
 									<span className="text-muted-foreground">(#{selectedTask.iteration})</span>
 								)}
 								<span
-									className={`ml-2 text-sm ${selectedTask?.passed ? "text-green-600" : "text-red-600"}`}>
-									({selectedTask?.passed ? "Passed" : "Failed"})
+									className={`ml-2 text-sm ${
+										selectedTask?.passed === true
+											? "text-green-600"
+											: selectedTask?.passed === false
+												? "text-red-600"
+												: "text-yellow-500"
+									}`}>
+									(
+									{selectedTask?.passed === true
+										? "Passed"
+										: selectedTask?.passed === false
+											? "Failed"
+											: "Running"}
+									)
 								</span>
 							</DialogTitle>
 							{taskLog && (
@@ -523,6 +771,35 @@ export function Run({ run }: { run: Run }) {
 					</div>
 				</DialogContent>
 			</Dialog>
+
+			{/* Kill Run Confirmation Dialog */}
+			<AlertDialog open={showKillDialog} onOpenChange={setShowKillDialog}>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Kill Run?</AlertDialogTitle>
+						<AlertDialogDescription>
+							This will stop the controller and all task runner containers for this run. Any running tasks
+							will be terminated immediately. This action cannot be undone.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel disabled={isKilling}>Cancel</AlertDialogCancel>
+						<AlertDialogAction
+							onClick={onKillRun}
+							disabled={isKilling}
+							className="bg-destructive text-destructive-foreground hover:bg-destructive/90">
+							{isKilling ? (
+								<>
+									<LoaderCircle className="size-4 animate-spin mr-2" />
+									Killing...
+								</>
+							) : (
+								"Kill Run"
+							)}
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
 		</>
 	)
 }

+ 130 - 14
apps/web-evals/src/app/runs/new/new-run.tsx

@@ -87,11 +87,13 @@ type ImportedSettings = {
 export function NewRun() {
 	const router = useRouter()
 
-	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo")
+	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other")
 	const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
 	const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true)
-	const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(true)
+	const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(false)
 	const [reasoningEffort, setReasoningEffort] = useState<ReasoningEffort | "">("")
+	const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20)
+	const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds
 
 	// State for imported settings with config selection
 	const [importedSettings, setImportedSettings] = useState<ImportedSettings | null>(null)
@@ -134,7 +136,7 @@ export function NewRun() {
 
 	const [model, suite, settings] = watch(["model", "suite", "settings", "concurrency"])
 
-	// Load concurrency and timeout from localStorage on mount
+	// Load settings from localStorage on mount
 	useEffect(() => {
 		const savedConcurrency = localStorage.getItem("evals-concurrency")
 		if (savedConcurrency) {
@@ -150,6 +152,37 @@ export function NewRun() {
 				setValue("timeout", parsed)
 			}
 		}
+		const savedCommandTimeout = localStorage.getItem("evals-command-execution-timeout")
+		if (savedCommandTimeout) {
+			const parsed = parseInt(savedCommandTimeout, 10)
+			if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) {
+				setCommandExecutionTimeout(parsed)
+			}
+		}
+		const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout")
+		if (savedShellTimeout) {
+			const parsed = parseInt(savedShellTimeout, 10)
+			if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) {
+				setTerminalShellIntegrationTimeout(parsed)
+			}
+		}
+		// Load saved exercises selection
+		const savedSuite = localStorage.getItem("evals-suite")
+		if (savedSuite === "partial") {
+			setValue("suite", "partial")
+			const savedExercises = localStorage.getItem("evals-exercises")
+			if (savedExercises) {
+				try {
+					const parsed = JSON.parse(savedExercises) as string[]
+					if (Array.isArray(parsed)) {
+						setSelectedExercises(parsed)
+						setValue("exercises", parsed)
+					}
+				} catch {
+					// Invalid JSON, ignore
+				}
+			}
+		}
 	}, [setValue])
 
 	// Extract unique languages from exercises
@@ -193,6 +226,7 @@ export function NewRun() {
 
 			setSelectedExercises(newSelected)
 			setValue("exercises", newSelected)
+			localStorage.setItem("evals-exercises", JSON.stringify(newSelected))
 		},
 		[getExercisesForLanguage, selectedExercises, setValue],
 	)
@@ -236,6 +270,8 @@ export function NewRun() {
 						apiProvider: "openrouter",
 						openRouterModelId: model,
 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+						commandExecutionTimeout,
+						terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms
 						...experimentsSettings,
 					}
 				} else if (provider === "roo") {
@@ -244,6 +280,8 @@ export function NewRun() {
 						apiProvider: "roo",
 						apiModelId: model,
 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+						commandExecutionTimeout,
+						terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms
 						...experimentsSettings,
 						...(reasoningEffort
 							? {
@@ -257,6 +295,8 @@ export function NewRun() {
 					values.settings = {
 						...values.settings,
 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+						commandExecutionTimeout,
+						terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms
 						...experimentsSettings,
 					}
 				}
@@ -267,7 +307,16 @@ export function NewRun() {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[provider, model, router, useNativeToolProtocol, useMultipleNativeToolCalls, reasoningEffort],
+		[
+			provider,
+			model,
+			router,
+			useNativeToolProtocol,
+			useMultipleNativeToolCalls,
+			reasoningEffort,
+			commandExecutionTimeout,
+			terminalShellIntegrationTimeout,
+		],
 	)
 
 	const onSelectModel = useCallback(
@@ -355,9 +404,9 @@ export function NewRun() {
 									value={provider}
 									onValueChange={(value) => setModelSource(value as "roo" | "openrouter" | "other")}>
 									<TabsList className="mb-2">
+										<TabsTrigger value="other">Import</TabsTrigger>
 										<TabsTrigger value="roo">Roo Code Cloud</TabsTrigger>
 										<TabsTrigger value="openrouter">OpenRouter</TabsTrigger>
-										<TabsTrigger value="other">Other</TabsTrigger>
 									</TabsList>
 								</Tabs>
 
@@ -446,8 +495,8 @@ export function NewRun() {
 													<Checkbox
 														id="native-other"
 														checked={useNativeToolProtocol}
-														onCheckedChange={(checked) =>
-															setUseNativeToolProtocol(checked === true)
+														onCheckedChange={(checked: boolean) =>
+															setUseNativeToolProtocol(checked)
 														}
 													/>
 													<span className="text-sm">Use Native Tool Calls</span>
@@ -458,8 +507,8 @@ export function NewRun() {
 													<Checkbox
 														id="multipleNativeToolCalls-other"
 														checked={useMultipleNativeToolCalls}
-														onCheckedChange={(checked) =>
-															setUseMultipleNativeToolCalls(checked === true)
+														onCheckedChange={(checked: boolean) =>
+															setUseMultipleNativeToolCalls(checked)
 														}
 													/>
 													<span className="text-sm">Use Multiple Native Tool Calls</span>
@@ -529,8 +578,8 @@ export function NewRun() {
 														<Checkbox
 															id="native"
 															checked={useNativeToolProtocol}
-															onCheckedChange={(checked) =>
-																setUseNativeToolProtocol(checked === true)
+															onCheckedChange={(checked: boolean) =>
+																setUseNativeToolProtocol(checked)
 															}
 														/>
 														<span className="text-sm">Use Native Tool Calls</span>
@@ -541,8 +590,8 @@ export function NewRun() {
 														<Checkbox
 															id="multipleNativeToolCalls"
 															checked={useMultipleNativeToolCalls}
-															onCheckedChange={(checked) =>
-																setUseMultipleNativeToolCalls(checked === true)
+															onCheckedChange={(checked: boolean) =>
+																setUseMultipleNativeToolCalls(checked)
 															}
 														/>
 														<span className="text-sm">Use Multiple Native Tool Calls</span>
@@ -627,12 +676,14 @@ export function NewRun() {
 								<FormLabel>Exercises</FormLabel>
 								<div className="flex items-center gap-2 flex-wrap">
 									<Tabs
-										defaultValue="full"
+										value={suite}
 										onValueChange={(value) => {
 											setValue("suite", value as "full" | "partial")
+											localStorage.setItem("evals-suite", value)
 											if (value === "full") {
 												setSelectedExercises([])
 												setValue("exercises", [])
+												localStorage.removeItem("evals-exercises")
 											}
 										}}>
 										<TabsList>
@@ -669,6 +720,7 @@ export function NewRun() {
 										onValueChange={(value) => {
 											setSelectedExercises(value)
 											setValue("exercises", value)
+											localStorage.setItem("evals-exercises", JSON.stringify(value))
 										}}
 										placeholder="Select"
 										variant="inverted"
@@ -758,6 +810,70 @@ export function NewRun() {
 						)}
 					/>
 
+					<FormItem className="py-5">
+						<div className="flex items-center gap-1">
+							<Label>Terminal Command Timeout (Seconds)</Label>
+							<Tooltip>
+								<TooltipTrigger asChild>
+									<Info className="size-4 text-muted-foreground cursor-help" />
+								</TooltipTrigger>
+								<TooltipContent side="right" className="max-w-xs">
+									<p>
+										Maximum time in seconds to wait for terminal command execution to complete
+										before timing out. This applies to commands run via the execute_command tool.
+									</p>
+								</TooltipContent>
+							</Tooltip>
+						</div>
+						<div className="flex flex-row items-center gap-2">
+							<Slider
+								value={[commandExecutionTimeout]}
+								min={20}
+								max={60}
+								step={1}
+								onValueChange={([value]) => {
+									if (value !== undefined) {
+										setCommandExecutionTimeout(value)
+										localStorage.setItem("evals-command-execution-timeout", String(value))
+									}
+								}}
+							/>
+							<div className="w-8 text-right">{commandExecutionTimeout}</div>
+						</div>
+					</FormItem>
+
+					<FormItem className="py-5">
+						<div className="flex items-center gap-1">
+							<Label>Shell Integration Timeout (Seconds)</Label>
+							<Tooltip>
+								<TooltipTrigger asChild>
+									<Info className="size-4 text-muted-foreground cursor-help" />
+								</TooltipTrigger>
+								<TooltipContent side="right" className="max-w-xs">
+									<p>
+										Maximum time in seconds to wait for shell integration to initialize when opening
+										a new terminal.
+									</p>
+								</TooltipContent>
+							</Tooltip>
+						</div>
+						<div className="flex flex-row items-center gap-2">
+							<Slider
+								value={[terminalShellIntegrationTimeout]}
+								min={30}
+								max={60}
+								step={1}
+								onValueChange={([value]) => {
+									if (value !== undefined) {
+										setTerminalShellIntegrationTimeout(value)
+										localStorage.setItem("evals-shell-integration-timeout", String(value))
+									}
+								}}
+							/>
+							<div className="w-8 text-right">{terminalShellIntegrationTimeout}</div>
+						</div>
+					</FormItem>
+
 					<FormField
 						control={form.control}
 						name="description"

+ 3 - 1
apps/web-evals/src/app/runs/new/settings-diff.tsx

@@ -2,7 +2,9 @@ import { type Keys, type RooCodeSettings, GLOBAL_SETTINGS_KEYS, PROVIDER_SETTING
 
 import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
 
-export const ROO_CODE_SETTINGS_KEYS = [...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS] as Keys<RooCodeSettings>[]
+// Merge the global and provider key lists into one array; routing the merge through a Set
+// collapses any key that is listed more than once while keeping first-seen order.
+export const ROO_CODE_SETTINGS_KEYS = Array.from(
+	new Set([...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS]),
+) as Keys<RooCodeSettings>[]
 
 type SettingsDiffProps = {
 	defaultSettings: RooCodeSettings

+ 16 - 4
apps/web-evals/src/components/home/run.tsx

@@ -124,9 +124,13 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 				<TableCell>{run.passed}</TableCell>
 				<TableCell>{run.failed}</TableCell>
 				<TableCell>
-					{run.passed + run.failed > 0 && (
-						<span>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</span>
-					)}
+					{run.passed + run.failed > 0 &&
+						(() => {
+							const percent = (run.passed / (run.passed + run.failed)) * 100
+							const colorClass =
+								percent === 100 ? "text-green-500" : percent >= 80 ? "text-yellow-500" : "text-red-500"
+							return <span className={colorClass}>{percent.toFixed(1)}%</span>
+						})()}
 				</TableCell>
 				<TableCell>
 					{taskMetrics && (
@@ -138,12 +142,20 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 				</TableCell>
 				{toolColumns.map((toolName) => {
 					const usage = taskMetrics?.toolUsage?.[toolName]
+					const successRate =
+						usage && usage.attempts > 0 ? ((usage.attempts - usage.failures) / usage.attempts) * 100 : 100
+					const rateColor =
+						successRate === 100
+							? "text-muted-foreground"
+							: successRate >= 80
+								? "text-yellow-500"
+								: "text-red-500"
 					return (
 						<TableCell key={toolName} className="text-xs text-center">
 							{usage ? (
 								<div className="flex flex-col items-center">
 									<span className="font-medium">{usage.attempts}</span>
-									<span className="text-muted-foreground">{formatToolUsageSuccessRate(usage)}</span>
+									<span className={rateColor}>{formatToolUsageSuccessRate(usage)}</span>
 								</div>
 							) : (
 								<span className="text-muted-foreground">-</span>

+ 26 - 15
packages/evals/src/cli/runEvals.ts

@@ -37,22 +37,33 @@ export const runEvals = async (runId: number) => {
 	const heartbeat = await startHeartbeat(run.id)
 	const queue = new PQueue({ concurrency: run.concurrency })
 
+	const STAGGER_DELAY_MS = 5000
+	const filteredTasks = tasks.filter((task) => task.finishedAt === null)
+
+	// Builds the zero-argument async job that the queue executes for one task. The job picks
+	// the container or in-process processor based on `containerized`, and logs (rather than
+	// rethrows) any failure so the returned promise never rejects.
+	const createTaskRunner = (task: (typeof filteredTasks)[number]) => async () => {
+		const processSelectedTask = containerized ? processTaskInContainer : processTask
+		try {
+			await processSelectedTask({ taskId: task.id, jobToken: run.jobToken, logger })
+		} catch (error) {
+			logger.error("error processing task", error)
+		}
+	}
+
 	try {
-		await queue.addAll(
-			tasks
-				.filter((task) => task.finishedAt === null)
-				.map((task) => async () => {
-					try {
-						if (containerized) {
-							await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger })
-						} else {
-							await processTask({ taskId: task.id, jobToken: run.jobToken, logger })
-						}
-					} catch (error) {
-						logger.error("error processing task", error)
-					}
-				}),
-		)
+		// Add tasks with staggered start times when concurrency > 1
+		for (let i = 0; i < filteredTasks.length; i++) {
+			const task = filteredTasks[i]
+			if (!task) continue
+			if (run.concurrency > 1 && i > 0) {
+				await new Promise((resolve) => setTimeout(resolve, STAGGER_DELAY_MS))
+			}
+			queue.add(createTaskRunner(task))
+		}
+
+		await queue.onIdle()
 
 		logger.info("finishRun")
 		const result = await finishRun(run.id)

+ 89 - 3
packages/evals/src/cli/runTask.ts

@@ -1,4 +1,5 @@
 import * as fs from "fs"
+import * as fsp from "fs/promises"
 import * as path from "path"
 import * as os from "node:os"
 
@@ -38,6 +39,58 @@ class SubprocessTimeoutError extends Error {
 	}
 }
 
+/**
+ * Copy conversation history files from VS Code extension storage to the log directory.
+ * This allows us to preserve the api_conversation_history.json and ui_messages.json
+ * files for post-mortem analysis alongside the log files.
+ *
+ * The copy is best-effort: filesystem failures are reported through `logger` instead of
+ * being thrown. A missing source file is logged at info level and skipped; any other
+ * failure (including a failed copy) is logged as an error.
+ *
+ * Destination files are named `<language>-<exercise>.<iteration>_<filename>` inside
+ * `logDir`, with every `/` in `exercise` replaced by `-`.
+ *
+ * @param rooTaskId - Name of the task directory under the extension's `tasks` folder.
+ * @param logDir - Directory that receives the copied files.
+ * @param language - Exercise language; used as the destination filename prefix.
+ * @param exercise - Exercise name; slashes are replaced with dashes in the filename.
+ * @param iteration - Iteration number, included so repeated attempts do not collide.
+ * @param logger - Receives one info/error message per file.
+ */
+async function copyConversationHistory({
+	rooTaskId,
+	logDir,
+	language,
+	exercise,
+	iteration,
+	logger,
+}: {
+	rooTaskId: string
+	logDir: string
+	language: string
+	exercise: string
+	iteration: number
+	logger: Logger
+}): Promise<void> {
+	// VS Code extension global storage path within the container
+	const extensionStoragePath = "/roo/.vscode/User/globalStorage/rooveterinaryinc.roo-cline"
+	const taskStoragePath = path.join(extensionStoragePath, "tasks", rooTaskId)
+
+	const filesToCopy = ["api_conversation_history.json", "ui_messages.json"]
+
+	// The filename prefix is the same for every file, so build it once outside the loop.
+	// Slashes are replaced with dashes; the iteration number distinguishes repeated attempts.
+	const sanitizedExercise = exercise.replace(/\//g, "-")
+	const destPrefix = `${language}-${sanitizedExercise}.${iteration}`
+
+	for (const filename of filesToCopy) {
+		const sourcePath = path.join(taskStoragePath, filename)
+		const destPath = path.join(logDir, `${destPrefix}_${filename}`)
+
+		// Probe the source in its own try/catch. copyFile can also raise ENOENT (for example
+		// when the destination directory is missing), and that must not be reported as
+		// "source not found".
+		try {
+			await fsp.access(sourcePath)
+		} catch (error) {
+			// File may not exist if task didn't complete properly - this is not fatal
+			if ((error as NodeJS.ErrnoException).code === "ENOENT") {
+				logger.info(`${filename} not found at ${sourcePath} - skipping`)
+			} else {
+				logger.error(`failed to copy ${filename}:`, error)
+			}
+			continue
+		}
+
+		try {
+			await fsp.copyFile(sourcePath, destPath)
+			logger.info(`copied ${filename} to ${destPath}`)
+		} catch (error) {
+			// The source was just confirmed to exist, so any failure here is a real copy error.
+			logger.error(`failed to copy ${filename}:`, error)
+		}
+	}
+}
+
 export const processTask = async ({
 	taskId,
 	jobToken,
@@ -114,7 +167,7 @@ export const processTaskInContainer = async ({
 
 	for (let attempt = 0; attempt <= maxRetries; attempt++) {
 		const containerName = `evals-task-${taskId}.${attempt}`
-		const args = [`--name ${containerName}`, ...baseArgs]
+		const args = [`--name ${containerName}`, `-e EVALS_ATTEMPT=${attempt}`, ...baseArgs]
 		const isRetry = attempt > 0
 
 		if (isRetry) {
@@ -172,6 +225,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 	const controller = new AbortController()
 	const cancelSignal = controller.signal
 	const containerized = isDockerContainer()
+	const logDir = containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`
 
 	let codeCommand = containerized
 		? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}`
@@ -266,7 +320,23 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 				(payload[0].message.say && loggableSays.includes(payload[0].message.say)) ||
 				payload[0].message.partial !== true)
 		) {
-			logger.info(`${eventName} ->`, payload)
+			// Extract tool name for tool-related messages for clearer logging
+			let logEventName: string = eventName
+			if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "tool") {
+				try {
+					const textJson = JSON.parse(payload[0].message.text ?? "{}")
+					if (textJson.tool) {
+						logEventName = `${eventName} (tool: ${textJson.tool})`
+					}
+				} catch {
+					// If parsing fails, use the default event name
+				}
+			} else if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "command") {
+				logEventName = `${eventName} (command)`
+			} else if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "completion_result") {
+				logEventName = `${eventName} (completion_result)`
+			}
+			logger.info(`${logEventName} ->`, payload)
 		}
 
 		if (eventName === RooCodeEventName.TaskStarted) {
@@ -418,9 +488,25 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 		}
 	}
 
+	// Copy conversation history files from VS Code extension storage to the log directory
+	// for post-mortem analysis. Only do this in containerized mode where we have a known path.
+	if (containerized && rooTaskId) {
+		await copyConversationHistory({
+			rooTaskId,
+			logDir,
+			language,
+			exercise,
+			iteration: task.iteration,
+			logger,
+		})
+	}
+
 	logger.close()
 
-	if (isApiUnstable) {
+	// Only throw for API instability if the task didn't complete successfully.
+	// If taskFinishedAt is set via TaskCompleted event, the task succeeded despite
+	// API retries, so re-running from scratch would waste resources.
+	if (isApiUnstable && !taskFinishedAt) {
 		throw new Error("API is unstable, throwing to trigger a retry.")
 	}
 }