3 months ago · d2017c80cf
--- a/apps/web-evals/src/actions/__tests__/killRun.spec.ts
+++ b/apps/web-evals/src/actions/__tests__/killRun.spec.ts
@@ -0,0 +1,207 @@
 
				// npx vitest run src/actions/__tests__/killRun.spec.ts

				import { execFileSync } from "child_process"

				// Mock child_process so that no real docker command is ever executed.
				vi.mock("child_process", () => ({
					execFileSync: vi.fn(),
					spawn: vi.fn(),
				}))

				// Mock next/cache
				vi.mock("next/cache", () => ({
					revalidatePath: vi.fn(),
				}))

				// Mock redis client
				vi.mock("@/lib/server/redis", () => ({
					redisClient: vi.fn().mockResolvedValue({
						del: vi.fn().mockResolvedValue(1),
					}),
				}))

				// Mock @roo-code/evals
				vi.mock("@roo-code/evals", () => ({
					createRun: vi.fn(),
					deleteRun: vi.fn(),
					createTask: vi.fn(),
					exerciseLanguages: [],
					getExercisesForLanguage: vi.fn().mockResolvedValue([]),
				}))

				// Mock timers to speed up tests
				vi.useFakeTimers()

				// Module under test. vitest hoists the vi.mock() calls above every import,
				// so the mocks are in place regardless of where this line sits.
				import { killRun } from "../runs"

				const mockExecFileSync = execFileSync as ReturnType<typeof vi.fn>

				// Delay killRun waits between killing the controller and killing the runners.
				const KILL_DELAY_MS = 10000

				/**
				 * Starts killRun, fast-forwards the fake clock past its sleep and returns the result.
				 */
				async function runKill(runId: number) {
					const pending = killRun(runId)
					await vi.advanceTimersByTimeAsync(KILL_DELAY_MS)
					return pending
				}

				describe("killRun", () => {
					beforeEach(() => {
						vi.clearAllMocks()
					})

					afterEach(() => {
						vi.clearAllTimers()
					})

					it("should kill controller first, wait, then kill task containers", async () => {
						const runId = 123

						// execFileSync is used for all docker commands
						mockExecFileSync
							.mockReturnValueOnce("") // docker kill controller
							.mockReturnValueOnce("evals-task-123-456.0\nevals-task-123-789.1\n") // docker ps
							.mockReturnValueOnce("") // docker kill evals-task-123-456.0
							.mockReturnValueOnce("") // docker kill evals-task-123-789.1

						const resultPromise = killRun(runId)

						// Only the controller kill may have happened before the wait elapses
						expect(mockExecFileSync).toHaveBeenCalledTimes(1)
						await vi.advanceTimersByTimeAsync(KILL_DELAY_MS - 1)
						expect(mockExecFileSync).toHaveBeenCalledTimes(1)

						// Cross the 10 second mark so the runners get processed
						await vi.advanceTimersByTimeAsync(1)

						const result = await resultPromise

						expect(result.success).toBe(true)
						expect(result.killedContainers).toContain("evals-controller-123")
						expect(result.killedContainers).toContain("evals-task-123-456.0")
						expect(result.killedContainers).toContain("evals-task-123-789.1")
						expect(result.errors).toHaveLength(0)

						// Verify execFileSync was called for docker kill
						expect(mockExecFileSync).toHaveBeenNthCalledWith(
							1,
							"docker",
							["kill", "evals-controller-123"],
							expect.any(Object),
						)
						// Verify execFileSync was called for docker ps with run-specific filter
						expect(mockExecFileSync).toHaveBeenNthCalledWith(
							2,
							"docker",
							["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-123-"],
							expect.any(Object),
						)
						// Verify each listed task container was killed, in listing order
						expect(mockExecFileSync).toHaveBeenNthCalledWith(
							3,
							"docker",
							["kill", "evals-task-123-456.0"],
							expect.any(Object),
						)
						expect(mockExecFileSync).toHaveBeenNthCalledWith(
							4,
							"docker",
							["kill", "evals-task-123-789.1"],
							expect.any(Object),
						)
						expect(mockExecFileSync).toHaveBeenCalledTimes(4)
					})

					it("should continue killing runners even if controller is not running", async () => {
						const runId = 456

						mockExecFileSync
							.mockImplementationOnce(() => {
								throw new Error("No such container")
							}) // controller kill fails
							.mockReturnValueOnce("evals-task-456-100.0\n") // docker ps
							.mockReturnValueOnce("") // docker kill task

						const result = await runKill(runId)

						expect(result.success).toBe(true)
						expect(result.killedContainers).toContain("evals-task-456-100.0")
						// Controller not in list since it failed
						expect(result.killedContainers).not.toContain("evals-controller-456")
					})

					it("should clear Redis state after killing containers", async () => {
						const runId = 789

						const mockDel = vi.fn().mockResolvedValue(1)
						const { redisClient } = await import("@/lib/server/redis")
						vi.mocked(redisClient).mockResolvedValue({ del: mockDel } as never)

						mockExecFileSync
							.mockReturnValueOnce("") // controller kill
							.mockReturnValueOnce("") // docker ps (no tasks)

						await runKill(runId)

						expect(mockDel).toHaveBeenCalledWith("heartbeat:789")
						expect(mockDel).toHaveBeenCalledWith("runners:789")
					})

					it("should handle docker ps failure gracefully", async () => {
						const runId = 111

						mockExecFileSync
							.mockReturnValueOnce("") // controller kill succeeds
							.mockImplementationOnce(() => {
								throw new Error("Docker error")
							}) // docker ps fails

						const result = await runKill(runId)

						// Should still be successful because controller was killed
						expect(result.success).toBe(true)
						expect(result.killedContainers).toContain("evals-controller-111")
						expect(result.errors).toContain("Failed to list Docker task containers")
					})

					it("should handle individual task kill failures", async () => {
						const runId = 222

						mockExecFileSync
							.mockReturnValueOnce("") // controller kill
							.mockReturnValueOnce("evals-task-222-300.0\nevals-task-222-400.0\n") // docker ps
							.mockImplementationOnce(() => {
								throw new Error("Kill failed")
							}) // first task kill fails
							.mockReturnValueOnce("") // second task kill succeeds

						const result = await runKill(runId)

						expect(result.success).toBe(true)
						expect(result.killedContainers).toContain("evals-controller-222")
						expect(result.killedContainers).toContain("evals-task-222-400.0")
						expect(result.errors.length).toBe(1)
						expect(result.errors[0]).toContain("evals-task-222-300.0")
					})

					it("should return success with no containers when nothing is running", async () => {
						const runId = 333

						mockExecFileSync
							.mockImplementationOnce(() => {
								throw new Error("No such container")
							}) // controller not running
							.mockReturnValueOnce("") // no task containers

						const result = await runKill(runId)

						expect(result.success).toBe(true)
						expect(result.killedContainers).toHaveLength(0)
						expect(result.errors).toHaveLength(0)
					})

					it("should only kill containers belonging to the specific run", async () => {
						const runId = 555

						mockExecFileSync
							.mockReturnValueOnce("") // controller kill
							// docker ps also reports a container that merely contains the pattern;
							// killRun must ignore it because the name does not start with it
							.mockReturnValueOnce("evals-task-555-100.0\nother-evals-task-555-200.0\n")
							.mockReturnValueOnce("") // docker kill task

						const result = await runKill(runId)

						expect(result.success).toBe(true)
						expect(result.errors).toHaveLength(0)
						expect(result.killedContainers).toEqual(["evals-controller-555", "evals-task-555-100.0"])
						// controller kill + docker ps + exactly one task kill
						expect(mockExecFileSync).toHaveBeenCalledTimes(3)
						// Verify execFileSync was called for docker ps with run-specific filter
						expect(mockExecFileSync).toHaveBeenNthCalledWith(
							2,
							"docker",
							["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-555-"],
							expect.any(Object),
						)
					})
				})
			
--- a/apps/web-evals/src/actions/runs.ts
+++ b/apps/web-evals/src/actions/runs.ts
@@ -3,7 +3,7 @@
 
				 import * as path from "path"
			
 
				 import fs from "fs"
			
 
				 import { fileURLToPath } from "url"
			
 
				-import { spawn } from "child_process"
			
 
				+import { spawn, execFileSync } from "child_process"
			
 
				 
			
 
				 import { revalidatePath } from "next/cache"
			
 
				 import pMap from "p-map"
			
@@ -18,6 +18,7 @@ import {
 
				 } from "@roo-code/evals"
			
 
				 
			
 
				 import { CreateRun } from "@/lib/schemas"
			
 
				+import { redisClient } from "@/lib/server/redis"
			
 
				 
			
 
				 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
			
 
				 
			
@@ -116,3 +117,100 @@ export async function deleteRun(runId: number) {
 
				 	await _deleteRun(runId)
			
 
				 	revalidatePath("/runs")
			
 
				 }
			
 
				+
			
 
				/** Outcome of a killRun call. */
				export type KillRunResult = {
					/** True when at least one container was killed, or when no step reported an error. */
					success: boolean
					/** Names of the containers for which `docker kill` succeeded. */
					killedContainers: string[]
					/** Short descriptions of the steps that failed. */
					errors: string[]
				}

				const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))

				/**
				 * Kill all Docker containers associated with a run (controller and task runners).
				 * Kills the controller first, waits 10 seconds, then kills runners.
				 * Also clears Redis state for heartbeat and runners, and revalidates the run pages.
				 *
				 * Container naming conventions:
				 * - Controller: evals-controller-{runId}
				 * - Task runners: evals-task-{runId}-{taskId}.{attempt}
				 *
				 * NOTE(review): the docker commands use execFileSync, which blocks the calling
				 * thread for up to 10 seconds each; the tests mock that function, so switching to
				 * an async variant needs a coordinated test change.
				 *
				 * @param runId - Id of the run; must be a non-negative safe integer.
				 * @returns Which containers were killed and which steps failed. Failures are
				 * reported through `errors` rather than thrown.
				 */
				export async function killRun(runId: number): Promise<KillRunResult> {
					const killedContainers: string[] = []
					const errors: string[] = []

					// runId is interpolated into container names, a docker filter and Redis keys.
					// The static type is not enforced at runtime, and this function is imported by
					// a client component, so reject anything that is not a plain integer id.
					if (!Number.isSafeInteger(runId) || runId < 0) {
						return { success: false, killedContainers, errors: [`Invalid run id: ${String(runId)}`] }
					}

					const controllerPattern = `evals-controller-${runId}`
					const taskPattern = `evals-task-${runId}-`

					try {
						// Step 1: Kill the controller first
						console.log(`Killing controller: ${controllerPattern}`)
						try {
							execFileSync("docker", ["kill", controllerPattern], { encoding: "utf-8", timeout: 10000 })
							killedContainers.push(controllerPattern)
							console.log(`Killed controller container: ${controllerPattern}`)
						} catch (_error) {
							// Controller might not be running - that's ok, continue to kill runners
							console.log(`Controller ${controllerPattern} not running or already stopped`)
						}

						// Step 2: Wait 10 seconds before killing runners
						console.log("Waiting 10 seconds before killing runners...")
						await sleep(10000)

						// Step 3: Find and kill all task runner containers for THIS run only
						let taskContainerNames: string[] = []

						try {
							const output = execFileSync("docker", ["ps", "--format", "{{.Names}}", "--filter", `name=${taskPattern}`], {
								encoding: "utf-8",
								timeout: 10000,
							})
							// Re-check the prefix locally: the docker name filter is not an exact
							// prefix match, so other containers could be listed as well.
							taskContainerNames = output
								.split("\n")
								.map((name) => name.trim())
								.filter((name) => name.length > 0 && name.startsWith(taskPattern))
						} catch (error) {
							console.error("Failed to list task containers:", error)
							errors.push("Failed to list Docker task containers")
						}

						// Kill each task runner container; one failure does not stop the others
						for (const containerName of taskContainerNames) {
							try {
								execFileSync("docker", ["kill", containerName], { encoding: "utf-8", timeout: 10000 })
								killedContainers.push(containerName)
								console.log(`Killed task container: ${containerName}`)
							} catch (error) {
								// Container might have already stopped
								console.error(`Failed to kill container ${containerName}:`, error)
								errors.push(`Failed to kill container: ${containerName}`)
							}
						}

						// Step 4: Clear Redis state
						try {
							const redis = await redisClient()
							const heartbeatKey = `heartbeat:${runId}`
							const runnersKey = `runners:${runId}`

							await redis.del(heartbeatKey)
							await redis.del(runnersKey)
							console.log(`Cleared Redis keys: ${heartbeatKey}, ${runnersKey}`)
						} catch (error) {
							console.error("Failed to clear Redis state:", error)
							errors.push("Failed to clear Redis state")
						}
					} catch (error) {
						// Safety net: every step above already handles its own failures
						console.error("Error in killRun:", error)
						errors.push("Unexpected error while killing containers")
					}

					revalidatePath(`/runs/${runId}`)
					revalidatePath("/runs")

					return {
						success: killedContainers.length > 0 || errors.length === 0,
						killedContainers,
						errors,
					}
				}
			
--- a/apps/web-evals/src/app/runs/[id]/page.tsx
+++ b/apps/web-evals/src/app/runs/[id]/page.tsx
@@ -7,7 +7,7 @@ export default async function Page({ params }: { params: Promise<{ id: string }>
 
				 	const run = await findRun(Number(id))
			
 
				 
			
 
				 	return (
			
 
				-		<div className="max-w-3xl mx-auto px-12 p-12">
			
 
				+		<div className="w-full px-6 py-12">
			
 
				 			<Run run={run} />
			
 
				 		</div>
			
 
				 	)
			
--- a/apps/web-evals/src/app/runs/[id]/run-status.tsx
+++ b/apps/web-evals/src/app/runs/[id]/run-status.tsx
@@ -1,55 +1,79 @@
 
				 "use client"
			
 
				 
			
 
				+import { Link2, Link2Off, CheckCircle2 } from "lucide-react"
			
 
				 import type { RunStatus as _RunStatus } from "@/hooks/use-run-status"
			
 
				 import { cn } from "@/lib/utils"
			
 
				+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui"
			
 
				 
			
 
				-export const RunStatus = ({ runStatus: { sseStatus, heartbeat, runners = [] } }: { runStatus: _RunStatus }) => (
			
 
				-	<div>
			
 
				-		<div className="flex items-center gap-2">
			
 
				-			<div className="flex items-center gap-2">
			
 
				-				<div>Task Stream:</div>
			
 
				-				<div className="font-mono text-sm text-muted-foreground">{sseStatus}</div>
			
 
				-			</div>
			
 
				-			<div className="relative">
			
 
				-				<div
			
 
				-					className={cn("absolute size-2.5 rounded-full opacity-50 animate-ping", {
			
 
				-						"bg-green-500": sseStatus === "connected",
			
 
				-						"bg-amber-500": sseStatus === "waiting",
			
 
				-						"bg-rose-500": sseStatus === "error",
			
 
				-					})}
			
 
				-				/>
			
 
				-				<div
			
 
				-					className={cn("size-2.5 rounded-full", {
			
 
				-						"bg-green-500": sseStatus === "connected",
			
 
				-						"bg-amber-500": sseStatus === "waiting",
			
 
				-						"bg-rose-500": sseStatus === "error",
			
 
				-					})}
			
 
				-				/>
			
 
				-			</div>
			
 
				-		</div>
			
 
				-		<div className="flex items-center gap-2">
			
 
				-			<div className="flex items-center gap-2">
			
 
				-				<div>Task Controller:</div>
			
 
				-				<div className="font-mono text-sm text-muted-foreground">{heartbeat ?? "dead"}</div>
			
 
				-			</div>
			
 
				-			<div className="relative">
			
 
				-				<div
			
 
				-					className={cn("absolute size-2.5 rounded-full opacity-50 animate-ping", {
			
 
				-						"bg-green-500": !!heartbeat,
			
 
				-						"bg-rose-500": !heartbeat,
			
 
				-					})}
			
 
				-				/>
			
 
				-				<div
			
 
				-					className={cn("size-2.5 rounded-full", {
			
 
				-						"bg-green-500": !!heartbeat,
			
 
				-						"bg-rose-500": !heartbeat,
			
 
				-					})}
			
 
				-				/>
			
 
				-			</div>
			
 
				-		</div>
			
 
				-		<div className="flex items-center gap-2">
			
 
				-			<div>Task Runners:</div>
			
 
				-			{runners.length > 0 && <div className="font-mono text-sm text-muted-foreground">{runners?.join(", ")}</div>}
			
 
				-		</div>
			
 
				-	</div>
			
 
				-)
			
 
				/**
				 * Icon for the task stream state: a green link when connected, otherwise a
				 * broken link tinted amber while waiting and rose for anything else.
				 */
				function StreamIcon({ status }: { status: "connected" | "waiting" | "error" }) {
					const isConnected = status === "connected"

					if (!isConnected) {
						const tone = status === "waiting" ? "text-amber-500" : "text-rose-500"
						return <Link2Off className={cn("size-4", tone)} />
					}

					return <Link2 className="size-4 text-green-500" />
				}
			
 
				+
			
 
				/**
				 * Compact status indicator for a run.
				 *
				 * When `isComplete` is set, only a check icon with a "Run complete" tooltip is
				 * rendered. Otherwise the trigger shows the stream icon, the heartbeat value and
				 * the runner count, and the tooltip repeats them in long form followed by the
				 * list of runners.
				 */
				export const RunStatus = ({
					runStatus: { sseStatus, heartbeat, runners = [] },
					isComplete = false,
				}: {
					runStatus: _RunStatus
					isComplete?: boolean
				}) => {
					// Completed runs get a single static badge
					if (isComplete) {
						return (
							<Tooltip>
								<TooltipTrigger asChild>
									<div className="flex items-center gap-1 cursor-default text-muted-foreground">
										<CheckCircle2 className="size-4" />
									</div>
								</TooltipTrigger>
								<TooltipContent side="bottom" className="font-mono text-xs">
									Run complete
								</TooltipContent>
							</Tooltip>
						)
					}

					// Green when present, rose when missing - shared by the trigger and the tooltip
					const controllerTone = heartbeat ? "text-green-500" : "text-rose-500"
					const runnerCount = runners.length
					const hasRunners = runnerCount > 0
					const runnersTone = hasRunners ? "text-green-500" : "text-rose-500"

					return (
						<Tooltip>
							<TooltipTrigger asChild>
								<div className="flex items-center gap-2 cursor-default text-xs font-mono">
									{/* Stream state */}
									<StreamIcon status={sseStatus} />

									{/* Controller heartbeat value, or a dash when there is none */}
									<span className={controllerTone}>{heartbeat ?? "-"}</span>

									{/* Number of runners, e.g. "3r" */}
									<span className={runnersTone}>{`${runnerCount}r`}</span>
								</div>
							</TooltipTrigger>
							<TooltipContent side="bottom" className="font-mono text-xs max-w-md">
								<div className="space-y-1">
									<div className="flex items-center gap-2">
										<StreamIcon status={sseStatus} />
										<span>Task Stream: {sseStatus}</span>
									</div>
									<div className="flex items-center gap-2">
										<span className={controllerTone}>●</span>
										<span>Task Controller: {heartbeat ?? "dead"}</span>
									</div>
									<div className="flex items-center gap-2">
										<span className={runnersTone}>●</span>
										<span>Task Runners: {hasRunners ? runnerCount : "none"}</span>
									</div>
									{hasRunners && (
										<div className="mt-2 pt-2 border-t border-border text-muted-foreground space-y-0.5">
											{runners.map((runner) => (
												<div key={runner}>{runner}</div>
											))}
										</div>
									)}
								</div>
							</TooltipContent>
						</Tooltip>
					)
				}
			
--- a/apps/web-evals/src/app/runs/[id]/run.tsx
+++ b/apps/web-evals/src/app/runs/[id]/run.tsx
@@ -2,12 +2,14 @@
 
				 
			
 
				 import { useMemo, useState, useCallback, useEffect } from "react"
			
 
				 import { toast } from "sonner"
			
 
				-import { LoaderCircle, FileText, Copy, Check } from "lucide-react"
			
 
				+import { LoaderCircle, FileText, Copy, Check, StopCircle } from "lucide-react"
			
 
				 
			
 
				 import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals"
			
 
				+import type { ToolName } from "@roo-code/types"
			
 
				 
			
 
				 import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
			
 
				 import { useRunStatus } from "@/hooks/use-run-status"
			
 
				+import { killRun } from "@/actions/runs"
			
 
				 import {
			
 
				 	Table,
			
 
				 	TableBody,
			
@@ -24,6 +26,14 @@ import {
 
				 	DialogTitle,
			
 
				 	ScrollArea,
			
 
				 	Button,
			
 
				+	AlertDialog,
			
 
				+	AlertDialogAction,
			
 
				+	AlertDialogCancel,
			
 
				+	AlertDialogContent,
			
 
				+	AlertDialogDescription,
			
 
				+	AlertDialogFooter,
			
 
				+	AlertDialogHeader,
			
 
				+	AlertDialogTitle,
			
 
				 } from "@/components/ui"
			
 
				 
			
 
				 import { TaskStatus } from "./task-status"
			
@@ -51,19 +61,80 @@ type HighlightPattern = {
 
				 }
			
 
				 
			
 
				 // Highlight rules for log lines, in the order they are declared.
				 // NOTE(review): all patterns carry the `g` flag and live at module scope; confirm the
				 // consumer does not depend on `lastIndex` left over from a previous line.
				 const HIGHLIGHT_PATTERNS: HighlightPattern[] = [
				 	// Log level between pipes; only the level word (group 1) is wrapped
				 	{ pattern: /\|\s*(INFO)\s*\|/g, className: "text-green-400", wrapGroup: 1 },
				 	{ pattern: /\|\s*(WARN|WARNING)\s*\|/g, className: "text-yellow-400", wrapGroup: 1 },
				 	{ pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400 font-semibold", wrapGroup: 1 },
				 	{ pattern: /\|\s*(DEBUG)\s*\|/g, className: "text-gray-400", wrapGroup: 1 },

				 	// Task lifecycle event names
				 	{
				 		pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|taskAborted|taskResumable)/g,
				 		className: "text-purple-400 font-medium",
				 	},

				 	// Tool failures, shown in red
				 	{ pattern: /(taskToolFailed)/g, className: "text-red-400 font-bold" },
				 	{ pattern: /(Tool execution failed|tool.*failed|failed.*tool)/gi, className: "text-red-400" },

				 	// Eval verdicts
				 	{ pattern: /(EvalPass)/g, className: "text-green-400 font-bold" },
				 	{ pattern: /(EvalFail)/g, className: "text-red-400 font-bold" },

				 	// Message arrows
				 	{ pattern: /→/g, className: "text-cyan-400" },

				 	// "tool": "<name>" pairs
				 	{ pattern: /"(tool)":\s*"([^"]+)"/g, className: "text-orange-400" },

				 	// JSON keys
				 	{ pattern: /"([^"]+)":/g, className: "text-sky-300" },

				 	// Boolean values following a colon
				 	{ pattern: /:\s*(true|false)/g, className: "text-amber-400", wrapGroup: 1 },

				 	// Numeric values following a colon
				 	{ pattern: /:\s*(-?\d+\.?\d*)/g, className: "text-emerald-400", wrapGroup: 1 },
				 ]
			
 
				 
			
 
				// Bracketed ISO timestamp as written by the logger: "[2025-11-28T09:35:23.187Z" followed
				// by whitespace, "|" or "]". Group 1 is the ISO string.
				const LOG_TIMESTAMP_PATTERN = /\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/

				// Same timestamp together with the separator that follows it ("|" or the closing "]").
				const LOG_TIMESTAMP_PREFIX_PATTERN = /\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s*[|\]]?\s*/

				/**
				 * Format the time between `baselineMs` and `timestamp` as "MM:SS".
				 *
				 * Minutes are not wrapped at 60, so long logs render as e.g. "75:03". A timestamp
				 * earlier than the baseline is rendered with a leading "-" (sub-second differences
				 * collapse to "00:00"), and an unparsable timestamp or baseline yields "--:--".
				 */
				function formatElapsedTime(timestamp: string, baselineMs: number): string {
					const elapsedMs = new Date(timestamp).getTime() - baselineMs
					if (!Number.isFinite(elapsedMs)) return "--:--"

					// Work on the magnitude so that negative values do not produce "-1:-5"
					const totalSeconds = Math.floor(Math.abs(elapsedMs) / 1000)
					const minutes = Math.floor(totalSeconds / 60)
					const seconds = totalSeconds % 60
					const sign = elapsedMs < 0 && totalSeconds > 0 ? "-" : ""

					return `${sign}${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`
				}

				/**
				 * Return the first bracketed timestamp found anywhere in the log as epoch
				 * milliseconds, or null when there is none or it is not a valid date.
				 */
				function extractFirstTimestamp(log: string): number | null {
					const isoString = log.match(LOG_TIMESTAMP_PATTERN)?.[1]
					if (!isoString) return null

					// The pattern only checks the digit layout; "2025-13-45T..." still parses to NaN
					const ms = new Date(isoString).getTime()
					return Number.isNaN(ms) ? null : ms
				}

				/**
				 * Split a log line into a short timestamp and the line without redundant metadata.
				 *
				 * @param line - Raw log line.
				 * @param baselineMs - Epoch milliseconds the elapsed time is measured from, or null
				 * to show the wall-clock "HH:MM:SS" part of the timestamp instead.
				 * @returns `timestamp` is empty and `simplified` is the untouched line when the line
				 * carries no timestamp.
				 */
				function simplifyLogLine(line: string, baselineMs: number | null): { timestamp: string; simplified: string } {
					const isoTimestamp = line.match(LOG_TIMESTAMP_PATTERN)?.[1]
					if (!isoTimestamp) {
						return { timestamp: "", simplified: line }
					}

					const timestamp = baselineMs !== null ? formatElapsedTime(isoTimestamp, baselineMs) : isoTimestamp.slice(11, 19)

					// Remove the timestamp together with its separator, so that both "[timestamp | ..."
					// and "[timestamp] ..." leave no stray "|" or "]" at the start of the line
					let simplified = line.replace(LOG_TIMESTAMP_PREFIX_PATTERN, "")

					// Remove redundant metadata: pid, run, task IDs (they're same for entire log)
					simplified = simplified.replace(/\|\s*pid:\d+\s*/g, "")
					simplified = simplified.replace(/\|\s*run:\d+\s*/g, "")
					simplified = simplified.replace(/\|\s*task:\d+\s*/g, "")
					simplified = simplified.replace(/runTask\s*\|\s*/g, "")

					// Clean up extra pipes, spaces, and trailing brackets
					simplified = simplified.replace(/\|\s*\|/g, "|")
					simplified = simplified.replace(/^\s*\|\s*/, "")
					simplified = simplified.replace(/\]\s*$/, "") // Remove trailing bracket if present

					return { timestamp, simplified }
				}
			
 
				+
			
 
				 // Format a single line with syntax highlighting using React elements (XSS-safe)
			
 
				 function formatLine(line: string): React.ReactNode[] {
			
 
				 	// Find all matches with their positions
			
@@ -125,24 +196,83 @@ function formatLine(line: string): React.ReactNode[] {
 
				 	return result.length > 0 ? result : [line]
			
 
				 }
			
 
				 
			
 
				// Ordered styling rules: the first rule with a marker contained in the line wins.
				// "WARN" also covers "WARNING", so no separate marker is needed for it.
				const LINE_STYLE_RULES: ReadonlyArray<{ markers: readonly string[]; style: string }> = [
					{ markers: ["ERROR"], style: "bg-red-950/30 border-l-2 border-red-500" },
					{ markers: ["WARN"], style: "bg-yellow-950/20 border-l-2 border-yellow-500" },
					{ markers: ["taskToolFailed"], style: "bg-red-950/30 border-l-2 border-red-500" },
					{ markers: ["taskStarted", "taskCreated"], style: "bg-purple-950/20" },
					{ markers: ["EvalPass"], style: "bg-green-950/30 border-l-2 border-green-500" },
					{ markers: ["EvalFail"], style: "bg-red-950/30 border-l-2 border-red-500" },
					{ markers: ["taskCompleted", "taskAborted"], style: "bg-blue-950/20" },
				]

				/**
				 * Determine the visual style for a log line based on its content.
				 *
				 * Matching is a case-sensitive substring test anywhere in the line.
				 *
				 * @returns The class names of the first matching rule, or "" when none matches.
				 */
				function getLineStyle(line: string): string {
					const rule = LINE_STYLE_RULES.find(({ markers }) => markers.some((marker) => line.includes(marker)))
					return rule?.style ?? ""
				}
			
 
				+
			
 
				 // Format log content with basic highlighting (XSS-safe - no dangerouslySetInnerHTML).
				 // Each non-blank line becomes a row with a timestamp column followed by the
				 // simplified, highlighted text; blank lines become thin spacer rows.
				 function formatLogContent(log: string): React.ReactNode[] {
				 	// Accept both LF and CRLF line endings so no stray "\r" ends up in the rows
				 	const lines = log.split(/\r?\n/)
				 	// The first timestamp in the log is the zero point for the elapsed-time column
				 	const baselineMs = extractFirstTimestamp(log)

				 	return lines.map((line, index) => {
				 		if (!line.trim()) {
				 			return (
				 				<div key={index} className="h-2">
				 					{" "}
				 				</div>
				 			)
				 		}

				 		const parsed = simplifyLogLine(line, baselineMs)
				 		// Styling is decided on the raw line, before metadata is stripped
				 		const lineStyle = getLineStyle(line)

				 		return (
				 			<div key={index} className={`flex hover:bg-white/10 py-0.5 rounded-sm transition-colors ${lineStyle}`}>
				 				{/* Elapsed time */}
				 				<span className="text-blue-400 font-mono w-12 flex-shrink-0 tabular-nums text-right pr-2">
				 					{parsed.timestamp}
				 				</span>
				 				{/* Log content - the negative text indent plus equal padding gives wrapped lines a hanging indent */}
				 				<span className="flex-1 break-words" style={{ textIndent: "-0.5rem", paddingLeft: "0.5rem" }}>
				 					{formatLine(parsed.simplified)}
				 				</span>
				 			</div>
				 		)
				 	})
				 }
			
 
				 
			
 
				 export function Run({ run }: { run: Run }) {
			
 
				 	const runStatus = useRunStatus(run)
			
 
				-	const { tasks, tokenUsage, usageUpdatedAt } = runStatus
			
 
				+	const { tasks, tokenUsage, usageUpdatedAt, heartbeat, runners } = runStatus
			
 
				 
			
 
				 	const [selectedTask, setSelectedTask] = useState<Task | null>(null)
			
 
				 	const [taskLog, setTaskLog] = useState<string | null>(null)
			
 
				 	const [isLoadingLog, setIsLoadingLog] = useState(false)
			
 
				 	const [copied, setCopied] = useState(false)
			
 
				+	const [showKillDialog, setShowKillDialog] = useState(false)
			
 
				+	const [isKilling, setIsKilling] = useState(false)
			
 
				+
			
 
				+	// Determine if run is still active (has heartbeat or runners)
			
 
				+	const isRunActive = !run.taskMetricsId && (!!heartbeat || (runners && runners.length > 0))
			
 
				+
			
 
				+	const onKillRun = useCallback(async () => {
			
 
				+		setIsKilling(true)
			
 
				+		try {
			
 
				+			const result = await killRun(run.id)
			
 
				+			if (result.killedContainers.length > 0) {
			
 
				+				toast.success(`Killed ${result.killedContainers.length} container(s)`)
			
 
				+			} else if (result.errors.length === 0) {
			
 
				+				toast.info("No running containers found")
			
 
				+			} else {
			
 
				+				toast.error(result.errors.join(", "))
			
 
				+			}
			
 
				+		} catch (error) {
			
 
				+			console.error("Failed to kill run:", error)
			
 
				+			toast.error("Failed to kill run")
			
 
				+		} finally {
			
 
				+			setIsKilling(false)
			
 
				+			setShowKillDialog(false)
			
 
				+		}
			
 
				+	}, [run.id])
			
 
				 
			
 
				 	const onCopyLog = useCallback(async () => {
			
 
				 		if (!taskLog) return
			
@@ -172,9 +302,9 @@ export function Run({ run }: { run: Run }) {
 
				 
			
 
				 	const onViewTaskLog = useCallback(
			
 
				 		async (task: Task) => {
			
 
				-			// Only allow viewing logs for completed tasks
			
 
				-			if (task.passed === null || task.passed === undefined) {
			
 
				-				toast.error("Task is still running")
			
 
				+			// Only allow viewing logs for tasks that have started
			
 
				+			if (!task.startedAt && !tokenUsage.get(task.id)) {
			
 
				+				toast.error("Task has not started yet")
			
 
				 				return
			
 
				 			}
			
 
				 
			
@@ -202,7 +332,7 @@ export function Run({ run }: { run: Run }) {
 
				 				setIsLoadingLog(false)
			
 
				 			}
			
 
				 		},
			
 
				-		[run.id],
			
 
				+		[run.id, tokenUsage],
			
 
				 	)
			
 
				 
			
 
				 	const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
			
@@ -228,22 +358,34 @@ export function Run({ run }: { run: Run }) {
 
				 		// eslint-disable-next-line react-hooks/exhaustive-deps
			
 
				 	}, [tasks, tokenUsage, usageUpdatedAt])
			
 
				 
			
 
				+	// Table columns: every tool used by any task, most-attempted tool first
			
 
				+	const toolColumns = useMemo<ToolName[]>(() => {
			
 
				+		if (!tasks) return []
			
 
				+
			
 
				+		// Accumulate attempts per tool across all tasks
			
 
				+		const attemptsByTool = new Map<ToolName, number>()
			
 
				+
			
 
				+		for (const { taskMetrics: metrics } of tasks) {
			
 
				+			const toolUsage = metrics?.toolUsage
			
 
				+			if (!toolUsage) continue
			
 
				+
			
 
				+			for (const [key, { attempts }] of Object.entries(toolUsage)) {
			
 
				+				const tool = key as ToolName
			
 
				+				attemptsByTool.set(tool, (attemptsByTool.get(tool) ?? 0) + attempts)
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		// Highest attempt count first
			
 
				+		const ranked = [...attemptsByTool].sort(([, x], [, y]) => y - x)
			
 
				+		return ranked.map(([tool]): ToolName => tool)
			
 
				+	}, [tasks])
			
 
				+
			
 
				 	// Compute aggregate stats
			
 
				 	const stats = useMemo(() => {
			
 
				 		if (!tasks) return null
			
 
				 
			
 
				 		const passed = tasks.filter((t) => t.passed === true).length
			
 
				 		const failed = tasks.filter((t) => t.passed === false).length
			
 
				-		// Count running tasks exactly like TaskStatus shows spinner:
			
 
				-		// - passed is not true and not false (null/undefined)
			
 
				-		// - AND has activity (startedAt or tokenUsage)
			
 
				-		const running = tasks.filter(
			
 
				-			(t) => t.passed !== true && t.passed !== false && (t.startedAt || tokenUsage.get(t.id)),
			
 
				-		).length
			
 
				-		const pending = tasks.filter(
			
 
				-			(t) => t.passed !== true && t.passed !== false && !t.startedAt && !tokenUsage.get(t.id),
			
 
				-		).length
			
 
				-		const total = tasks.length
			
 
				 		const completed = passed + failed
			
 
				 
			
 
				 		let totalTokensIn = 0
			
@@ -279,9 +421,6 @@ export function Run({ run }: { run: Run }) {
 
				 		return {
			
 
				 			passed,
			
 
				 			failed,
			
 
				-			running,
			
 
				-			pending,
			
 
				-			total,
			
 
				 			completed,
			
 
				 			passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null,
			
 
				 			totalTokensIn,
			
@@ -293,42 +432,96 @@ export function Run({ run }: { run: Run }) {
 
				 		// eslint-disable-next-line react-hooks/exhaustive-deps
			
 
				 	}, [tasks, taskMetrics, tokenUsage, usageUpdatedAt])
			
 
				 
			
 
				+	// Wall-clock time since run creation: up to the last task finish once complete, otherwise up to now
			
 
				+	const elapsedTime = useMemo(() => {
			
 
				+		if (!tasks?.length) return null
			
 
				+
			
 
				+		const createdMs = new Date(run.createdAt).getTime()
			
 
				+
			
 
				+		// Still running: measure against the current clock
			
 
				+		if (!run.taskMetricsId) {
			
 
				+			return Date.now() - createdMs
			
 
				+		}
			
 
				+
			
 
				+		// Complete: the run ends when its last task finished (never earlier than creation)
			
 
				+		let endMs = createdMs
			
 
				+		for (const { finishedAt } of tasks) {
			
 
				+			if (!finishedAt) continue
			
 
				+			const finishedMs = new Date(finishedAt).getTime()
			
 
				+			if (finishedMs > endMs) endMs = finishedMs
			
 
				+		}
			
 
				+
			
 
				+		return endMs - createdMs
			
 
				+		// eslint-disable-next-line react-hooks/exhaustive-deps
			
 
				+	}, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt])
			
 
				+
			
 
				 	return (
			
 
				 		<>
			
 
				 			<div>
			
 
				-				<div className="mb-4">
			
 
				-					<div>
			
 
				-						<div className="font-mono">{run.model}</div>
			
 
				-						{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
			
 
				-					</div>
			
 
				-					{!run.taskMetricsId && <RunStatus runStatus={runStatus} />}
			
 
				-				</div>
			
 
				-
			
 
				 				{stats && (
			
 
				-					<div className="mb-4 p-4 border rounded-lg bg-muted/50">
			
 
				+					<div className="mb-4 p-4 border rounded-lg bg-muted sticky top-0 z-10">
			
 
				+						{/* Provider, Model title and status */}
			
 
				+						<div className="flex items-center justify-center gap-3 mb-3 relative">
			
 
				+							{run.settings?.apiProvider && (
			
 
				+								<span className="text-sm text-muted-foreground">{run.settings.apiProvider}</span>
			
 
				+							)}
			
 
				+							<div className="font-mono">{run.model}</div>
			
 
				+							<RunStatus runStatus={runStatus} isComplete={!!run.taskMetricsId} />
			
 
				+							{run.description && (
			
 
				+								<span className="text-sm text-muted-foreground">- {run.description}</span>
			
 
				+							)}
			
 
				+							{isRunActive && (
			
 
				+								<Tooltip>
			
 
				+									<TooltipTrigger asChild>
			
 
				+										<Button
			
 
				+											variant="ghost"
			
 
				+											size="sm"
			
 
				+											onClick={() => setShowKillDialog(true)}
			
 
				+											disabled={isKilling}
			
 
				+											className="absolute right-0 flex items-center gap-1 text-muted-foreground hover:text-destructive">
			
 
				+											{isKilling ? (
			
 
				+												<LoaderCircle className="size-4 animate-spin" />
			
 
				+											) : (
			
 
				+												<StopCircle className="size-4" />
			
 
				+											)}
			
 
				+											Kill
			
 
				+										</Button>
			
 
				+									</TooltipTrigger>
			
 
				+									<TooltipContent>Stop all containers for this run</TooltipContent>
			
 
				+								</Tooltip>
			
 
				+							)}
			
 
				+						</div>
			
 
				 						{/* Main Stats Row */}
			
 
				-						<div className="flex flex-wrap items-start justify-between gap-x-6 gap-y-3">
			
 
				+						<div className="flex items-start justify-center gap-x-8 gap-y-3">
			
 
				 							{/* Passed/Failed */}
			
 
				-							<div className="text-center">
			
 
				+							<div className="text-center min-w-[80px]">
			
 
				 								<div className="text-2xl font-bold whitespace-nowrap">
			
 
				 									<span className="text-green-600">{stats.passed}</span>
			
 
				 									<span className="text-muted-foreground mx-1">/</span>
			
 
				 									<span className="text-red-600">{stats.failed}</span>
			
 
				-									{stats.running > 0 && (
			
 
				-										<span className="text-yellow-600 text-sm ml-2">({stats.running})</span>
			
 
				-									)}
			
 
				 								</div>
			
 
				 								<div className="text-xs text-muted-foreground">Passed / Failed</div>
			
 
				 							</div>
			
 
				 
			
 
				 							{/* Pass Rate */}
			
 
				-							<div className="text-center">
			
 
				-								<div className="text-2xl font-bold">{stats.passRate ? `${stats.passRate}%` : "-"}</div>
			
 
				+							<div className="text-center min-w-[80px]">
			
 
				+								<div
			
 
				+									className={`text-2xl font-bold ${
			
 
				+										stats.passRate === null
			
 
				+											? ""
			
 
				+											: parseFloat(stats.passRate) === 100
			
 
				+												? ""
			
 
				+												: parseFloat(stats.passRate) >= 80
			
 
				+													? "text-yellow-500"
			
 
				+													: "text-red-500"
			
 
				+									}`}>
			
 
				+									{stats.passRate ? `${stats.passRate}%` : "-"}
			
 
				+								</div>
			
 
				 								<div className="text-xs text-muted-foreground">Pass Rate</div>
			
 
				 							</div>
			
 
				 
			
 
				 							{/* Tokens */}
			
 
				-							<div className="text-center">
			
 
				+							<div className="text-center min-w-[140px]">
			
 
				 								<div className="text-xl font-bold font-mono whitespace-nowrap">
			
 
				 									{formatTokens(stats.totalTokensIn)}
			
 
				 									<span className="text-muted-foreground mx-1">/</span>
			
@@ -338,58 +531,64 @@ export function Run({ run }: { run: Run }) {
 
				 							</div>
			
 
				 
			
 
				 							{/* Cost */}
			
 
				-							<div className="text-center">
			
 
				+							<div className="text-center min-w-[70px]">
			
 
				 								<div className="text-2xl font-bold font-mono">{formatCurrency(stats.totalCost)}</div>
			
 
				 								<div className="text-xs text-muted-foreground">Cost</div>
			
 
				 							</div>
			
 
				 
			
 
				 							{/* Duration */}
			
 
				-							<div className="text-center">
			
 
				+							<div className="text-center min-w-[90px]">
			
 
				 								<div className="text-2xl font-bold font-mono whitespace-nowrap">
			
 
				 									{stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"}
			
 
				 								</div>
			
 
				 								<div className="text-xs text-muted-foreground">Duration</div>
			
 
				 							</div>
			
 
				 
			
 
				-							{/* Tool Usage - Inline */}
			
 
				-							{Object.keys(stats.toolUsage).length > 0 && (
			
 
				-								<div className="flex items-center gap-2 flex-wrap">
			
 
				-									{Object.entries(stats.toolUsage)
			
 
				-										.sort(([, a], [, b]) => b.attempts - a.attempts)
			
 
				-										.map(([toolName, usage]) => {
			
 
				-											const abbr = getToolAbbreviation(toolName)
			
 
				-											const successRate =
			
 
				-												usage.attempts > 0
			
 
				-													? ((usage.attempts - usage.failures) / usage.attempts) * 100
			
 
				-													: 100
			
 
				-											const rateColor =
			
 
				-												successRate === 100
			
 
				-													? "text-green-500"
			
 
				-													: successRate >= 80
			
 
				-														? "text-yellow-500"
			
 
				-														: "text-red-500"
			
 
				-											return (
			
 
				-												<Tooltip key={toolName}>
			
 
				-													<TooltipTrigger asChild>
			
 
				-														<div className="flex items-center gap-1 px-2 py-1 rounded bg-background/50 border border-border/50 hover:border-border transition-colors cursor-default text-xs">
			
 
				-															<span className="font-medium text-muted-foreground">
			
 
				-																{abbr}
			
 
				-															</span>
			
 
				-															<span className="font-bold tabular-nums">
			
 
				-																{usage.attempts}
			
 
				-															</span>
			
 
				-															<span className={`${rateColor}`}>
			
 
				-																{formatToolUsageSuccessRate(usage)}
			
 
				-															</span>
			
 
				-														</div>
			
 
				-													</TooltipTrigger>
			
 
				-													<TooltipContent side="bottom">{toolName}</TooltipContent>
			
 
				-												</Tooltip>
			
 
				-											)
			
 
				-										})}
			
 
				+							{/* Elapsed Time */}
			
 
				+							<div className="text-center min-w-[90px]">
			
 
				+								<div className="text-2xl font-bold font-mono whitespace-nowrap">
			
 
				+									{elapsedTime !== null ? formatDuration(elapsedTime) : "-"}
			
 
				 								</div>
			
 
				-							)}
			
 
				+								<div className="text-xs text-muted-foreground">Elapsed</div>
			
 
				+							</div>
			
 
				 						</div>
			
 
				+
			
 
				+						{/* Tool Usage Row */}
			
 
				+						{Object.keys(stats.toolUsage).length > 0 && (
			
 
				+							<div className="flex items-center justify-center gap-2 flex-wrap mt-3">
			
 
				+								{Object.entries(stats.toolUsage)
			
 
				+									.sort(([, a], [, b]) => b.attempts - a.attempts)
			
 
				+									.map(([toolName, usage]) => {
			
 
				+										const abbr = getToolAbbreviation(toolName)
			
 
				+										const successRate =
			
 
				+											usage.attempts > 0
			
 
				+												? ((usage.attempts - usage.failures) / usage.attempts) * 100
			
 
				+												: 100
			
 
				+										const rateColor =
			
 
				+											successRate === 100
			
 
				+												? "text-green-500"
			
 
				+												: successRate >= 80
			
 
				+													? "text-yellow-500"
			
 
				+													: "text-red-500"
			
 
				+										return (
			
 
				+											<Tooltip key={toolName}>
			
 
				+												<TooltipTrigger asChild>
			
 
				+													<div className="flex items-center gap-1 px-2 py-1 rounded bg-background/50 border border-border/50 hover:border-border transition-colors cursor-default text-xs">
			
 
				+														<span className="font-medium text-muted-foreground">
			
 
				+															{abbr}
			
 
				+														</span>
			
 
				+														<span className="font-bold tabular-nums">{usage.attempts}</span>
			
 
				+														<span className={`${rateColor}`}>
			
 
				+															{formatToolUsageSuccessRate(usage)}
			
 
				+														</span>
			
 
				+													</div>
			
 
				+												</TooltipTrigger>
			
 
				+												<TooltipContent side="bottom">{toolName}</TooltipContent>
			
 
				+											</Tooltip>
			
 
				+										)
			
 
				+									})}
			
 
				+							</div>
			
 
				+						)}
			
 
				 					</div>
			
 
				 				)}
			
 
				 				{!tasks ? (
			
@@ -401,67 +600,104 @@ export function Run({ run }: { run: Run }) {
 
				 								<TableHead>Exercise</TableHead>
			
 
				 								<TableHead className="text-center">Tokens In / Out</TableHead>
			
 
				 								<TableHead>Context</TableHead>
			
 
				+								{toolColumns.map((toolName) => (
			
 
				+									<TableHead key={toolName} className="text-xs text-center">
			
 
				+										<Tooltip>
			
 
				+											<TooltipTrigger>{getToolAbbreviation(toolName)}</TooltipTrigger>
			
 
				+											<TooltipContent>{toolName}</TooltipContent>
			
 
				+										</Tooltip>
			
 
				+									</TableHead>
			
 
				+								))}
			
 
				 								<TableHead>Duration</TableHead>
			
 
				 								<TableHead>Cost</TableHead>
			
 
				 							</TableRow>
			
 
				 						</TableHeader>
			
 
				 						<TableBody>
			
 
				-							{tasks.map((task) => (
			
 
				-								<TableRow
			
 
				-									key={task.id}
			
 
				-									className={task.finishedAt ? "cursor-pointer hover:bg-muted/50" : ""}
			
 
				-									onClick={() => task.finishedAt && onViewTaskLog(task)}>
			
 
				-									<TableCell>
			
 
				-										<div className="flex items-center gap-2">
			
 
				-											<TaskStatus
			
 
				-												task={task}
			
 
				-												running={!!task.startedAt || !!tokenUsage.get(task.id)}
			
 
				-											/>
			
 
				+							{tasks.map((task) => {
			
 
				+								const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id)
			
 
				+								return (
			
 
				+									<TableRow
			
 
				+										key={task.id}
			
 
				+										className={`${hasStarted ? "cursor-pointer hover:bg-muted/50" : ""} ${task.passed === false ? "bg-red-950/30 border-l-2 border-l-red-500" : ""}`}
			
 
				+										onClick={() => hasStarted && onViewTaskLog(task)}>
			
 
				+										<TableCell>
			
 
				 											<div className="flex items-center gap-2">
			
 
				-												<span>
			
 
				-													{task.language}/{task.exercise}
			
 
				-													{task.iteration > 1 && (
			
 
				-														<span className="text-muted-foreground ml-1">
			
 
				-															(#{task.iteration})
			
 
				-														</span>
			
 
				+												<TaskStatus task={task} running={hasStarted} />
			
 
				+												<div className="flex items-center gap-2">
			
 
				+													<span>
			
 
				+														{task.language}/{task.exercise}
			
 
				+														{task.iteration > 1 && (
			
 
				+															<span className="text-muted-foreground ml-1">
			
 
				+																(#{task.iteration})
			
 
				+															</span>
			
 
				+														)}
			
 
				+													</span>
			
 
				+													{hasStarted && (
			
 
				+														<Tooltip>
			
 
				+															<TooltipTrigger asChild>
			
 
				+																<FileText className="size-3 text-muted-foreground" />
			
 
				+															</TooltipTrigger>
			
 
				+															<TooltipContent>Click to view log</TooltipContent>
			
 
				+														</Tooltip>
			
 
				 													)}
			
 
				-												</span>
			
 
				-												{task.finishedAt && (
			
 
				-													<Tooltip>
			
 
				-														<TooltipTrigger asChild>
			
 
				-															<FileText className="size-3 text-muted-foreground" />
			
 
				-														</TooltipTrigger>
			
 
				-														<TooltipContent>Click to view log</TooltipContent>
			
 
				-													</Tooltip>
			
 
				-												)}
			
 
				-											</div>
			
 
				-										</div>
			
 
				-									</TableCell>
			
 
				-									{taskMetrics[task.id] ? (
			
 
				-										<>
			
 
				-											<TableCell className="font-mono text-xs">
			
 
				-												<div className="flex items-center justify-evenly">
			
 
				-													<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
			
 
				-													<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
			
 
				 												</div>
			
 
				-											</TableCell>
			
 
				-											<TableCell className="font-mono text-xs">
			
 
				-												{formatTokens(taskMetrics[task.id]!.tokensContext)}
			
 
				-											</TableCell>
			
 
				-											<TableCell className="font-mono text-xs">
			
 
				-												{taskMetrics[task.id]!.duration
			
 
				-													? formatDuration(taskMetrics[task.id]!.duration)
			
 
				-													: "-"}
			
 
				-											</TableCell>
			
 
				-											<TableCell className="font-mono text-xs">
			
 
				-												{formatCurrency(taskMetrics[task.id]!.cost)}
			
 
				-											</TableCell>
			
 
				-										</>
			
 
				-									) : (
			
 
				-										<TableCell colSpan={4} />
			
 
				-									)}
			
 
				-								</TableRow>
			
 
				-							))}
			
 
				+											</div>
			
 
				+										</TableCell>
			
 
				+										{taskMetrics[task.id] ? (
			
 
				+											<>
			
 
				+												<TableCell className="font-mono text-xs">
			
 
				+													<div className="flex items-center justify-evenly">
			
 
				+														<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
			
 
				+														<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
			
 
				+													</div>
			
 
				+												</TableCell>
			
 
				+												<TableCell className="font-mono text-xs">
			
 
				+													{formatTokens(taskMetrics[task.id]!.tokensContext)}
			
 
				+												</TableCell>
			
 
				+												{toolColumns.map((toolName) => {
			
 
				+													const usage = task.taskMetrics?.toolUsage?.[toolName]
			
 
				+													const successRate =
			
 
				+														usage && usage.attempts > 0
			
 
				+															? ((usage.attempts - usage.failures) / usage.attempts) * 100
			
 
				+															: 100
			
 
				+													const rateColor =
			
 
				+														successRate === 100
			
 
				+															? "text-muted-foreground"
			
 
				+															: successRate >= 80
			
 
				+																? "text-yellow-500"
			
 
				+																: "text-red-500"
			
 
				+													return (
			
 
				+														<TableCell key={toolName} className="text-xs text-center">
			
 
				+															{usage ? (
			
 
				+																<div className="flex flex-col items-center">
			
 
				+																	<span className="font-medium">
			
 
				+																		{usage.attempts}
			
 
				+																	</span>
			
 
				+																	<span className={rateColor}>
			
 
				+																		{formatToolUsageSuccessRate(usage)}
			
 
				+																	</span>
			
 
				+																</div>
			
 
				+															) : (
			
 
				+																<span className="text-muted-foreground">-</span>
			
 
				+															)}
			
 
				+														</TableCell>
			
 
				+													)
			
 
				+												})}
			
 
				+												<TableCell className="font-mono text-xs">
			
 
				+													{taskMetrics[task.id]!.duration
			
 
				+														? formatDuration(taskMetrics[task.id]!.duration)
			
 
				+														: "-"}
			
 
				+												</TableCell>
			
 
				+												<TableCell className="font-mono text-xs">
			
 
				+													{formatCurrency(taskMetrics[task.id]!.cost)}
			
 
				+												</TableCell>
			
 
				+											</>
			
 
				+										) : (
			
 
				+											<TableCell colSpan={4 + toolColumns.length} />
			
 
				+										)}
			
 
				+									</TableRow>
			
 
				+								)
			
 
				+							})}
			
 
				 						</TableBody>
			
 
				 					</Table>
			
 
				 				)}
			
@@ -479,8 +715,20 @@ export function Run({ run }: { run: Run }) {
 
				 									<span className="text-muted-foreground">(#{selectedTask.iteration})</span>
			
 
				 								)}
			
 
				 								<span
			
 
				-									className={`ml-2 text-sm ${selectedTask?.passed ? "text-green-600" : "text-red-600"}`}>
			
 
				-									({selectedTask?.passed ? "Passed" : "Failed"})
			
 
				+									className={`ml-2 text-sm ${
			
 
				+										selectedTask?.passed === true
			
 
				+											? "text-green-600"
			
 
				+											: selectedTask?.passed === false
			
 
				+												? "text-red-600"
			
 
				+												: "text-yellow-500"
			
 
				+									}`}>
			
 
				+									(
			
 
				+									{selectedTask?.passed === true
			
 
				+										? "Passed"
			
 
				+										: selectedTask?.passed === false
			
 
				+											? "Failed"
			
 
				+											: "Running"}
			
 
				+									)
			
 
				 								</span>
			
 
				 							</DialogTitle>
			
 
				 							{taskLog && (
			
@@ -523,6 +771,35 @@ export function Run({ run }: { run: Run }) {
 
				 					</div>
			
 
				 				</DialogContent>
			
 
				 			</Dialog>
			
 
				+
			
 
				+			{/* Kill Run Confirmation Dialog */}
			
 
				+			<AlertDialog open={showKillDialog} onOpenChange={setShowKillDialog}>
			
 
				+				<AlertDialogContent>
			
 
				+					<AlertDialogHeader>
			
 
				+						<AlertDialogTitle>Kill Run?</AlertDialogTitle>
			
 
				+						<AlertDialogDescription>
			
 
				+							This will stop the controller and all task runner containers for this run. Any running tasks
			
 
				+							will be terminated immediately. This action cannot be undone.
			
 
				+						</AlertDialogDescription>
			
 
				+					</AlertDialogHeader>
			
 
				+					<AlertDialogFooter>
			
 
				+						<AlertDialogCancel disabled={isKilling}>Cancel</AlertDialogCancel>
			
 
				+						<AlertDialogAction
			
 
				+							onClick={onKillRun}
			
 
				+							disabled={isKilling}
			
 
				+							className="bg-destructive text-destructive-foreground hover:bg-destructive/90">
			
 
				+							{isKilling ? (
			
 
				+								<>
			
 
				+									<LoaderCircle className="size-4 animate-spin mr-2" />
			
 
				+									Killing...
			
 
				+								</>
			
 
				+							) : (
			
 
				+								"Kill Run"
			
 
				+							)}
			
 
				+						</AlertDialogAction>
			
 
				+					</AlertDialogFooter>
			
 
				+				</AlertDialogContent>
			
 
				+			</AlertDialog>
			
 
				 		</>
			
 
				 	)
			
 
				 }
			
--- a/apps/web-evals/src/app/runs/new/new-run.tsx
+++ b/apps/web-evals/src/app/runs/new/new-run.tsx
@@ -87,11 +87,13 @@ type ImportedSettings = {
 
				 export function NewRun() {
			
 
				 	const router = useRouter()
			
 
				 
			
 
				-	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo")
			
 
				+	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other")
			
 
				 	const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
			
 
				 	const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true)
			
 
				-	const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(true)
			
 
				+	const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(false)
			
 
				 	const [reasoningEffort, setReasoningEffort] = useState<ReasoningEffort | "">("")
			
 
				+	const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20) // seconds; passed to settings as-is
			
 
				+	const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds; converted to ms on submit
			
 
				 
			
 
				 	// State for imported settings with config selection
			
 
				 	const [importedSettings, setImportedSettings] = useState<ImportedSettings | null>(null)
			
@@ -134,7 +136,7 @@ export function NewRun() {
 
				 
			
 
				 	const [model, suite, settings] = watch(["model", "suite", "settings", "concurrency"])
			
 
				 
			
 
				-	// Load concurrency and timeout from localStorage on mount
			
 
				+	// Load settings from localStorage on mount
			
 
				 	useEffect(() => {
			
 
				 		const savedConcurrency = localStorage.getItem("evals-concurrency")
			
 
				 		if (savedConcurrency) {
			
@@ -150,6 +152,37 @@ export function NewRun() {
 
				 				setValue("timeout", parsed)
			
 
				 			}
			
 
				 		}
			
 
				+		const savedCommandTimeout = localStorage.getItem("evals-command-execution-timeout")
			
 
				+		if (savedCommandTimeout) {
			
 
				+			const parsed = parseInt(savedCommandTimeout, 10)
			
 
				+			if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) {
			
 
				+				setCommandExecutionTimeout(parsed)
			
 
				+			}
			
 
				+		}
			
 
				+		const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout")
			
 
				+		if (savedShellTimeout) {
			
 
				+			const parsed = parseInt(savedShellTimeout, 10)
			
 
				+			if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) {
			
 
				+				setTerminalShellIntegrationTimeout(parsed)
			
 
				+			}
			
 
				+		}
			
 
				+		// Restore the saved exercise selection; localStorage content is untrusted, so validate its shape first
			
 
				+		const savedSuite = localStorage.getItem("evals-suite")
			
 
				+		if (savedSuite === "partial") {
			
 
				+			setValue("suite", "partial")
			
 
				+			const savedExercises = localStorage.getItem("evals-exercises")
			
 
				+			if (savedExercises) {
			
 
				+				try {
			
 
				+					const parsed: unknown = JSON.parse(savedExercises)
			
 
				+					if (Array.isArray(parsed) && parsed.every((e): e is string => typeof e === "string")) {
			
 
				+						setSelectedExercises(parsed)
			
 
				+						setValue("exercises", parsed)
			
 
				+					}
			
 
				+				} catch {
			
 
				+					// Invalid JSON, ignore
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				 	}, [setValue])
			
 
				 
			
 
				 	// Extract unique languages from exercises
			
@@ -193,6 +226,7 @@ export function NewRun() {
 
				 
			
 
				 			setSelectedExercises(newSelected)
			
 
				 			setValue("exercises", newSelected)
			
 
				+			localStorage.setItem("evals-exercises", JSON.stringify(newSelected))
			
 
				 		},
			
 
				 		[getExercisesForLanguage, selectedExercises, setValue],
			
 
				 	)
			
@@ -236,6 +270,8 @@ export function NewRun() {
 
				 						apiProvider: "openrouter",
			
 
				 						openRouterModelId: model,
			
 
				 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
			
 
				+						commandExecutionTimeout,
			
 
				+						terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms
			
 
				 						...experimentsSettings,
			
 
				 					}
			
 
				 				} else if (provider === "roo") {
			
@@ -244,6 +280,8 @@ export function NewRun() {
 
				 						apiProvider: "roo",
			
 
				 						apiModelId: model,
			
 
				 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
			
 
				+						commandExecutionTimeout,
			
 
				+						terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms
			
 
				 						...experimentsSettings,
			
 
				 						...(reasoningEffort
			
 
				 							? {
			
@@ -257,6 +295,8 @@ export function NewRun() {
 
				 					values.settings = {
			
 
				 						...values.settings,
			
 
				 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
			
 
				+						commandExecutionTimeout,
			
 
				+						terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms
			
 
				 						...experimentsSettings,
			
 
				 					}
			
 
				 				}
			
@@ -267,7 +307,16 @@ export function NewRun() {
 
				 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
			
 
				 			}
			
 
				 		},
			
 
				-		[provider, model, router, useNativeToolProtocol, useMultipleNativeToolCalls, reasoningEffort],
			
 
				+		[
			
 
				+			provider,
			
 
				+			model,
			
 
				+			router,
			
 
				+			useNativeToolProtocol,
			
 
				+			useMultipleNativeToolCalls,
			
 
				+			reasoningEffort,
			
 
				+			commandExecutionTimeout,
			
 
				+			terminalShellIntegrationTimeout,
			
 
				+		],
			
 
				 	)
			
 
				 
			
 
				 	const onSelectModel = useCallback(
			
@@ -355,9 +404,9 @@ export function NewRun() {
 
				 									value={provider}
			
 
				 									onValueChange={(value) => setModelSource(value as "roo" | "openrouter" | "other")}>
			
 
				 									<TabsList className="mb-2">
			
 
				+										<TabsTrigger value="other">Import</TabsTrigger>
			
 
				 										<TabsTrigger value="roo">Roo Code Cloud</TabsTrigger>
			
 
				 										<TabsTrigger value="openrouter">OpenRouter</TabsTrigger>
			
 
				-										<TabsTrigger value="other">Other</TabsTrigger>
			
 
				 									</TabsList>
			
 
				 								</Tabs>
			
 
				 
			
@@ -446,8 +495,8 @@ export function NewRun() {
 
				 													<Checkbox
			
 
				 														id="native-other"
			
 
				 														checked={useNativeToolProtocol}
			
 
				-														onCheckedChange={(checked) =>
			
 
				-															setUseNativeToolProtocol(checked === true)
			
 
				+														onCheckedChange={(checked: boolean) =>
			
 
				+															setUseNativeToolProtocol(checked)
			
 
				 														}
			
 
				 													/>
			
 
				 													<span className="text-sm">Use Native Tool Calls</span>
			
@@ -458,8 +507,8 @@ export function NewRun() {
 
				 													<Checkbox
			
 
				 														id="multipleNativeToolCalls-other"
			
 
				 														checked={useMultipleNativeToolCalls}
			
 
				-														onCheckedChange={(checked) =>
			
 
				-															setUseMultipleNativeToolCalls(checked === true)
			
 
				+														onCheckedChange={(checked: boolean) =>
			
 
				+															setUseMultipleNativeToolCalls(checked)
			
 
				 														}
			
 
				 													/>
			
 
				 													<span className="text-sm">Use Multiple Native Tool Calls</span>
			
@@ -529,8 +578,8 @@ export function NewRun() {
 
				 														<Checkbox
			
 
				 															id="native"
			
 
				 															checked={useNativeToolProtocol}
			
 
				-															onCheckedChange={(checked) =>
			
 
				-																setUseNativeToolProtocol(checked === true)
			
 
				+															onCheckedChange={(checked: boolean) =>
			
 
				+																setUseNativeToolProtocol(checked)
			
 
				 															}
			
 
				 														/>
			
 
				 														<span className="text-sm">Use Native Tool Calls</span>
			
@@ -541,8 +590,8 @@ export function NewRun() {
 
				 														<Checkbox
			
 
				 															id="multipleNativeToolCalls"
			
 
				 															checked={useMultipleNativeToolCalls}
			
 
				-															onCheckedChange={(checked) =>
			
 
				-																setUseMultipleNativeToolCalls(checked === true)
			
 
				+															onCheckedChange={(checked: boolean) =>
			
 
				+																setUseMultipleNativeToolCalls(checked)
			
 
				 															}
			
 
				 														/>
			
 
				 														<span className="text-sm">Use Multiple Native Tool Calls</span>
			
@@ -627,12 +676,14 @@ export function NewRun() {
 
				 								<FormLabel>Exercises</FormLabel>
			
 
				 								<div className="flex items-center gap-2 flex-wrap">
			
 
				 									<Tabs
			
 
				-										defaultValue="full"
			
 
				+										value={suite}
			
 
				 										onValueChange={(value) => {
			
 
				 											setValue("suite", value as "full" | "partial")
			
 
				+											localStorage.setItem("evals-suite", value)
			
 
				 											if (value === "full") {
			
 
				 												setSelectedExercises([])
			
 
				 												setValue("exercises", [])
			
 
				+												localStorage.removeItem("evals-exercises")
			
 
				 											}
			
 
				 										}}>
			
 
				 										<TabsList>
			
@@ -669,6 +720,7 @@ export function NewRun() {
 
				 										onValueChange={(value) => {
			
 
				 											setSelectedExercises(value)
			
 
				 											setValue("exercises", value)
			
 
				+											localStorage.setItem("evals-exercises", JSON.stringify(value))
			
 
				 										}}
			
 
				 										placeholder="Select"
			
 
				 										variant="inverted"
			
@@ -758,6 +810,70 @@ export function NewRun() {
 
				 						)}
			
 
				 					/>
			
 
				 
			
 
				+					<FormItem className="py-5">
			
 
				+						<div className="flex items-center gap-1">
			
 
				+							<Label>Terminal Command Timeout (Seconds)</Label>
			
 
				+							<Tooltip>
			
 
				+								<TooltipTrigger asChild>
			
 
				+									<Info className="size-4 text-muted-foreground cursor-help" />
			
 
				+								</TooltipTrigger>
			
 
				+								<TooltipContent side="right" className="max-w-xs">
			
 
				+									<p>
			
 
				+										Maximum time in seconds to wait for terminal command execution to complete
			
 
				+										before timing out. This applies to commands run via the execute_command tool.
			
 
				+									</p>
			
 
				+								</TooltipContent>
			
 
				+							</Tooltip>
			
 
				+						</div>
			
 
				+						<div className="flex flex-row items-center gap-2">
			
 
				+							<Slider
			
 
				+								value={[commandExecutionTimeout]}
			
 
				+								min={20}
			
 
				+								max={60}
			
 
				+								step={1}
			
 
				+								onValueChange={([value]) => {
			
 
				+									if (value !== undefined) {
			
 
				+										setCommandExecutionTimeout(value)
			
 
				+										localStorage.setItem("evals-command-execution-timeout", String(value))
			
 
				+									}
			
 
				+								}}
			
 
				+							/>
			
 
				+							<div className="w-8 text-right">{commandExecutionTimeout}</div>
			
 
				+						</div>
			
 
				+					</FormItem>
			
 
				+
			
 
				+					<FormItem className="py-5">
			
 
				+						<div className="flex items-center gap-1">
			
 
				+							<Label>Shell Integration Timeout (Seconds)</Label>
			
 
				+							<Tooltip>
			
 
				+								<TooltipTrigger asChild>
			
 
				+									<Info className="size-4 text-muted-foreground cursor-help" />
			
 
				+								</TooltipTrigger>
			
 
				+								<TooltipContent side="right" className="max-w-xs">
			
 
				+									<p>
			
 
				+										Maximum time in seconds to wait for shell integration to initialize when opening
			
 
				+										a new terminal.
			
 
				+									</p>
			
 
				+								</TooltipContent>
			
 
				+							</Tooltip>
			
 
				+						</div>
			
 
				+						<div className="flex flex-row items-center gap-2">
			
 
				+							<Slider
			
 
				+								value={[terminalShellIntegrationTimeout]}
			
 
				+								min={30}
			
 
				+								max={60}
			
 
				+								step={1}
			
 
				+								onValueChange={([value]) => {
			
 
				+									if (value !== undefined) {
			
 
				+										setTerminalShellIntegrationTimeout(value)
			
 
				+										localStorage.setItem("evals-shell-integration-timeout", String(value))
			
 
				+									}
			
 
				+								}}
			
 
				+							/>
			
 
				+							<div className="w-8 text-right">{terminalShellIntegrationTimeout}</div>
			
 
				+						</div>
			
 
				+					</FormItem>
			
 
				+
			
 
				 					<FormField
			
 
				 						control={form.control}
			
 
				 						name="description"
			
--- a/apps/web-evals/src/app/runs/new/settings-diff.tsx
+++ b/apps/web-evals/src/app/runs/new/settings-diff.tsx
@@ -2,7 +2,9 @@ import { type Keys, type RooCodeSettings, GLOBAL_SETTINGS_KEYS, PROVIDER_SETTING
 
				 
			
 
				 import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
			
 
				 
			
 
				-export const ROO_CODE_SETTINGS_KEYS = [...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS] as Keys<RooCodeSettings>[]
			
 
				+export const ROO_CODE_SETTINGS_KEYS = [
			
 
				+	...new Set([...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS]),
			
 
				+] as Keys<RooCodeSettings>[]
			
 
				 
			
 
				 type SettingsDiffProps = {
			
 
				 	defaultSettings: RooCodeSettings
			
--- a/apps/web-evals/src/components/home/run.tsx
+++ b/apps/web-evals/src/components/home/run.tsx
@@ -124,9 +124,13 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 
				 				<TableCell>{run.passed}</TableCell>
			
 
				 				<TableCell>{run.failed}</TableCell>
			
 
				 				<TableCell>
			
 
				-					{run.passed + run.failed > 0 && (
			
 
				-						<span>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</span>
			
 
				-					)}
			
 
				+					{run.passed + run.failed > 0 &&
			
 
				+						(() => {
			
 
				+							const percent = (run.passed / (run.passed + run.failed)) * 100
			
 
				+							const colorClass =
			
 
				+								percent === 100 ? "text-green-500" : percent >= 80 ? "text-yellow-500" : "text-red-500"
			
 
				+							return <span className={colorClass}>{percent.toFixed(1)}%</span>
			
 
				+						})()}
			
 
				 				</TableCell>
			
 
				 				<TableCell>
			
 
				 					{taskMetrics && (
			
@@ -138,12 +142,20 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 
				 				</TableCell>
			
 
				 				{toolColumns.map((toolName) => {
			
 
				 					const usage = taskMetrics?.toolUsage?.[toolName]
			
 
				+					const successRate =
			
 
				+						usage && usage.attempts > 0 ? ((usage.attempts - usage.failures) / usage.attempts) * 100 : 100
			
 
				+					const rateColor =
			
 
				+						successRate === 100
			
 
				+							? "text-muted-foreground"
			
 
				+							: successRate >= 80
			
 
				+								? "text-yellow-500"
			
 
				+								: "text-red-500"
			
 
				 					return (
			
 
				 						<TableCell key={toolName} className="text-xs text-center">
			
 
				 							{usage ? (
			
 
				 								<div className="flex flex-col items-center">
			
 
				 									<span className="font-medium">{usage.attempts}</span>
			
 
				-									<span className="text-muted-foreground">{formatToolUsageSuccessRate(usage)}</span>
			
 
				+									<span className={rateColor}>{formatToolUsageSuccessRate(usage)}</span>
			
 
				 								</div>
			
 
				 							) : (
			
 
				 								<span className="text-muted-foreground">-</span>
			
--- a/packages/evals/src/cli/runEvals.ts
+++ b/packages/evals/src/cli/runEvals.ts
@@ -37,22 +37,33 @@ export const runEvals = async (runId: number) => {
 
				 	const heartbeat = await startHeartbeat(run.id)
			
 
				 	const queue = new PQueue({ concurrency: run.concurrency })
			
 
				 
			
 
				+	const STAGGER_DELAY_MS = 5000
			
 
				+	const filteredTasks = tasks.filter((task) => task.finishedAt === null)
			
 
				+
			
 
				+	const createTaskRunner = (task: (typeof filteredTasks)[number]) => async () => {
			
 
				+		try {
			
 
				+			if (containerized) {
			
 
				+				await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger })
			
 
				+			} else {
			
 
				+				await processTask({ taskId: task.id, jobToken: run.jobToken, logger })
			
 
				+			}
			
 
				+		} catch (error) {
			
 
				+			logger.error("error processing task", error)
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	try {
			
 
				-		await queue.addAll(
			
 
				-			tasks
			
 
				-				.filter((task) => task.finishedAt === null)
			
 
				-				.map((task) => async () => {
			
 
				-					try {
			
 
				-						if (containerized) {
			
 
				-							await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger })
			
 
				-						} else {
			
 
				-							await processTask({ taskId: task.id, jobToken: run.jobToken, logger })
			
 
				-						}
			
 
				-					} catch (error) {
			
 
				-						logger.error("error processing task", error)
			
 
				-					}
			
 
				-				}),
			
 
				-		)
			
 
				+		// Add tasks with staggered start times when concurrency > 1
			
 
				+		for (let i = 0; i < filteredTasks.length; i++) {
			
 
				+			const task = filteredTasks[i]
			
 
				+			if (!task) continue
			
 
				+			if (run.concurrency > 1 && i > 0) {
			
 
				+				await new Promise((resolve) => setTimeout(resolve, STAGGER_DELAY_MS))
			
 
				+			}
			
 
				+			queue.add(createTaskRunner(task))
			
 
				+		}
			
 
				+
			
 
				+		await queue.onIdle()
			
 
				 
			
 
				 		logger.info("finishRun")
			
 
				 		const result = await finishRun(run.id)
			
--- a/packages/evals/src/cli/runTask.ts
+++ b/packages/evals/src/cli/runTask.ts
@@ -1,4 +1,5 @@
 
				 import * as fs from "fs"
			
 
				+import * as fsp from "fs/promises"
			
 
				 import * as path from "path"
			
 
				 import * as os from "node:os"
			
 
				 
			
@@ -38,6 +39,58 @@ class SubprocessTimeoutError extends Error {
 
				 	}
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+ * Copy conversation history files from VS Code extension storage to the log directory.
			
 
				+ * This allows us to preserve the api_conversation_history.json and ui_messages.json
			
 
				+ * files for post-mortem analysis alongside the log files.
			
 
				+ */
			
 
				+async function copyConversationHistory({
			
 
				+	rooTaskId,
			
 
				+	logDir,
			
 
				+	language,
			
 
				+	exercise,
			
 
				+	iteration,
			
 
				+	logger,
			
 
				+}: {
			
 
				+	rooTaskId: string
			
 
				+	logDir: string
			
 
				+	language: string
			
 
				+	exercise: string
			
 
				+	iteration: number
			
 
				+	logger: Logger
			
 
				+}): Promise<void> {
			
 
				+	// VS Code extension global storage path within the container
			
 
				+	const extensionStoragePath = "/roo/.vscode/User/globalStorage/rooveterinaryinc.roo-cline"
			
 
				+	const taskStoragePath = path.join(extensionStoragePath, "tasks", rooTaskId)
			
 
				+
			
 
				+	const filesToCopy = ["api_conversation_history.json", "ui_messages.json"]
			
 
				+
			
 
				+	for (const filename of filesToCopy) {
			
 
				+		const sourcePath = path.join(taskStoragePath, filename)
			
 
				+		// Use sanitized exercise name (replace slashes with dashes) for the destination filename
			
 
				+		// Include iteration number to handle multiple attempts at the same exercise
			
 
				+		const sanitizedExercise = exercise.replace(/\//g, "-")
			
 
				+		const destFilename = `${language}-${sanitizedExercise}.${iteration}_${filename}`
			
 
				+		const destPath = path.join(logDir, destFilename)
			
 
				+
			
 
				+		try {
			
 
				+			// Check if source file exists
			
 
				+			await fsp.access(sourcePath)
			
 
				+
			
 
				+			// Copy the file
			
 
				+			await fsp.copyFile(sourcePath, destPath)
			
 
				+			logger.info(`copied ${filename} to ${destPath}`)
			
 
				+		} catch (error) {
			
 
				+			// File may not exist if task didn't complete properly - this is not fatal
			
 
				+			if ((error as NodeJS.ErrnoException).code === "ENOENT") {
			
 
				+				logger.info(`${filename} not found at ${sourcePath} - skipping`)
			
 
				+			} else {
			
 
				+				logger.error(`failed to copy ${filename}:`, error)
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 export const processTask = async ({
			
 
				 	taskId,
			
 
				 	jobToken,
			
@@ -114,7 +167,7 @@ export const processTaskInContainer = async ({
 
				 
			
 
				 	for (let attempt = 0; attempt <= maxRetries; attempt++) {
			
 
				 		const containerName = `evals-task-${taskId}.${attempt}`
			
 
				-		const args = [`--name ${containerName}`, ...baseArgs]
			
 
				+		const args = [`--name ${containerName}`, `-e EVALS_ATTEMPT=${attempt}`, ...baseArgs]
			
 
				 		const isRetry = attempt > 0
			
 
				 
			
 
				 		if (isRetry) {
			
@@ -172,6 +225,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 
				 	const controller = new AbortController()
			
 
				 	const cancelSignal = controller.signal
			
 
				 	const containerized = isDockerContainer()
			
 
				+	const logDir = containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`
			
 
				 
			
 
				 	let codeCommand = containerized
			
 
				 		? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}`
			
@@ -266,7 +320,23 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 
				 				(payload[0].message.say && loggableSays.includes(payload[0].message.say)) ||
			
 
				 				payload[0].message.partial !== true)
			
 
				 		) {
			
 
				-			logger.info(`${eventName} ->`, payload)
			
 
				+			// Extract tool name for tool-related messages for clearer logging
			
 
				+			let logEventName: string = eventName
			
 
				+			if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "tool") {
			
 
				+				try {
			
 
				+					const textJson = JSON.parse(payload[0].message.text ?? "{}")
			
 
				+					if (textJson.tool) {
			
 
				+						logEventName = `${eventName} (tool: ${textJson.tool})`
			
 
				+					}
			
 
				+				} catch {
			
 
				+					// If parsing fails, use the default event name
			
 
				+				}
			
 
				+			} else if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "command") {
			
 
				+				logEventName = `${eventName} (command)`
			
 
				+			} else if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "completion_result") {
			
 
				+				logEventName = `${eventName} (completion_result)`
			
 
				+			}
			
 
				+			logger.info(`${logEventName} ->`, payload)
			
 
				 		}
			
 
				 
			
 
				 		if (eventName === RooCodeEventName.TaskStarted) {
			
@@ -418,9 +488,25 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	// Copy conversation history files from VS Code extension storage to the log directory
			
 
				+	// for post-mortem analysis. Only do this in containerized mode where we have a known path.
			
 
				+	if (containerized && rooTaskId) {
			
 
				+		await copyConversationHistory({
			
 
				+			rooTaskId,
			
 
				+			logDir,
			
 
				+			language,
			
 
				+			exercise,
			
 
				+			iteration: task.iteration,
			
 
				+			logger,
			
 
				+		})
			
 
				+	}
			
 
				+
			
 
				 	logger.close()
			
 
				 
			
 
				-	if (isApiUnstable) {
			
 
				+	// Only throw for API instability if the task didn't complete successfully.
			
 
				+	// If taskFinishedAt is set via TaskCompleted event, the task succeeded despite
			
 
				+	// API retries, so re-running from scratch would waste resources.
			
 
				+	if (isApiUnstable && !taskFinishedAt) {
			
 
				 		throw new Error("API is unstable, throwing to trigger a retry.")
			
 
				 	}
			
 
				 }