Przeglądaj źródła

feat(web-evals): add task log viewing, export failed logs, and new run options (#9637)

Co-authored-by: roomote[bot] <219738659+roomote[bot]@users.noreply.github.com>
Hannes Rudolph 1 miesiąc temu
rodzic
commit
3f0a6971ca

+ 2 - 0
apps/web-evals/package.json

@@ -29,6 +29,7 @@
 		"@roo-code/evals": "workspace:^",
 		"@roo-code/types": "workspace:^",
 		"@tanstack/react-query": "^5.69.0",
+		"archiver": "^7.0.1",
 		"class-variance-authority": "^0.7.1",
 		"clsx": "^2.1.1",
 		"cmdk": "^1.1.0",
@@ -52,6 +53,7 @@
 		"@roo-code/config-eslint": "workspace:^",
 		"@roo-code/config-typescript": "workspace:^",
 		"@tailwindcss/postcss": "^4",
+		"@types/archiver": "^7.0.0",
 		"@types/ps-tree": "^1.1.6",
 		"@types/react": "^18.3.23",
 		"@types/react-dom": "^18.3.5",

+ 25 - 6
apps/web-evals/src/actions/runs.ts

@@ -21,7 +21,7 @@ import { CreateRun } from "@/lib/schemas"
 
 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
-export async function createRun({ suite, exercises = [], timeout, ...values }: CreateRun) {
+export async function createRun({ suite, exercises = [], timeout, iterations = 1, ...values }: CreateRun) {
 	const run = await _createRun({
 		...values,
 		timeout,
@@ -36,15 +36,34 @@ export async function createRun({ suite, exercises = [], timeout, ...values }: C
 				throw new Error("Invalid exercise path: " + path)
 			}
 
-			await createTask({ ...values, runId: run.id, language: language as ExerciseLanguage, exercise })
+			// Create multiple tasks for each iteration
+			for (let iteration = 1; iteration <= iterations; iteration++) {
+				await createTask({
+					...values,
+					runId: run.id,
+					language: language as ExerciseLanguage,
+					exercise,
+					iteration,
+				})
+			}
 		}
 	} else {
 		for (const language of exerciseLanguages) {
-			const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
+			const languageExercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
+
+			// Create tasks for all iterations of each exercise
+			const tasksToCreate: Array<{ language: ExerciseLanguage; exercise: string; iteration: number }> = []
+			for (const exercise of languageExercises) {
+				for (let iteration = 1; iteration <= iterations; iteration++) {
+					tasksToCreate.push({ language, exercise, iteration })
+				}
+			}
 
-			await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
-				concurrency: 10,
-			})
+			await pMap(
+				tasksToCreate,
+				({ language, exercise, iteration }) => createTask({ runId: run.id, language, exercise, iteration }),
+				{ concurrency: 10 },
+			)
 		}
 	}
 

+ 74 - 0
apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts

@@ -0,0 +1,74 @@
+import { NextResponse } from "next/server"
+import type { NextRequest } from "next/server"
+import * as fs from "node:fs/promises"
+import * as path from "node:path"
+
+import { findTask, findRun } from "@roo-code/evals"
+
+export const dynamic = "force-dynamic"
+
+const LOG_BASE_PATH = "/tmp/evals/runs"
+
+/**
+ * Makes one path segment safe to embed in a log file name.
+ *
+ * Path separators (forward slash and backslash), colons, NUL bytes and the
+ * characters `*`, `?`, `"`, `<`, `>` and `|` are each replaced with an
+ * underscore; every other character is returned unchanged.
+ */
+function sanitizePathComponent(component: string): string {
+	const unsafeCharacters = /[/\\:\0*?"<>|]/g
+	const sanitized = component.replace(unsafeCharacters, "_")
+	return sanitized
+}
+
+/**
+ * GET handler for a single task's log file.
+ *
+ * Looks up the run and task, derives the log file name from the task's
+ * sanitized language and exercise, and returns the file content as JSON.
+ *
+ * @param request - Incoming request (unused; the IDs come from the route params).
+ * @param params - Route params promise resolving to the run `id` and `taskId`.
+ * @returns
+ * - 200 `{ logContent }` on success.
+ * - 200 `{ error, logContent: null }` when the log file does not exist on disk.
+ * - 400 when an ID is not an integer or the computed path leaves the run's log directory.
+ * - 404 when the task belongs to a different run, or a lookup fails with `RecordNotFoundError`.
+ * - 500 for any other failure.
+ */
+export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string; taskId: string }> }) {
+	const { id, taskId } = await params
+
+	try {
+		const runId = Number(id)
+		const taskIdNum = Number(taskId)
+
+		// Number.isInteger rejects NaN as well as Infinity and fractional values such as "1.5",
+		// which would otherwise reach the lookups below.
+		if (!Number.isInteger(runId) || !Number.isInteger(taskIdNum)) {
+			return NextResponse.json({ error: "Invalid run ID or task ID" }, { status: 400 })
+		}
+
+		// Verify the run exists; a missing record is expected to surface as
+		// RecordNotFoundError, which the outer catch maps to 404.
+		await findRun(runId)
+
+		// Get the task to find its language and exercise
+		const task = await findTask(taskIdNum)
+
+		// Verify the task belongs to this run
+		if (task.runId !== runId) {
+			return NextResponse.json({ error: "Task does not belong to this run" }, { status: 404 })
+		}
+
+		// Sanitize language and exercise to prevent path traversal
+		const safeLanguage = sanitizePathComponent(task.language)
+		const safeExercise = sanitizePathComponent(task.exercise)
+
+		// NOTE(review): the file name is built from language and exercise only, so tasks that
+		// differ only by iteration resolve to the same file - confirm against the log writer.
+		const logFileName = `${safeLanguage}-${safeExercise}.log`
+		const runLogDir = path.resolve(LOG_BASE_PATH, String(runId))
+		const resolvedPath = path.resolve(runLogDir, logFileName)
+
+		// Defense in depth: the file must sit inside this run's directory. The trailing
+		// separator matters - a bare prefix test would also accept sibling directories
+		// whose names merely start with the same characters.
+		if (!resolvedPath.startsWith(runLogDir + path.sep)) {
+			return NextResponse.json({ error: "Invalid log path" }, { status: 400 })
+		}
+
+		// Read the log; a missing file is reported in the payload rather than as an HTTP error
+		try {
+			const logContent = await fs.readFile(resolvedPath, "utf-8")
+			return NextResponse.json({ logContent })
+		} catch (err) {
+			if ((err as NodeJS.ErrnoException).code === "ENOENT") {
+				return NextResponse.json({ error: "Log file not found", logContent: null }, { status: 200 })
+			}
+			throw err
+		}
+	} catch (error) {
+		console.error("Error reading task log:", error)
+
+		if (error instanceof Error && error.name === "RecordNotFoundError") {
+			return NextResponse.json({ error: "Task or run not found" }, { status: 404 })
+		}
+
+		return NextResponse.json({ error: "Failed to read log file" }, { status: 500 })
+	}
+}

+ 129 - 0
apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts

@@ -0,0 +1,129 @@
+import { NextResponse } from "next/server"
+import type { NextRequest } from "next/server"
+import * as fs from "node:fs"
+import * as path from "node:path"
+import archiver from "archiver"
+
+import { findRun, getTasks } from "@roo-code/evals"
+
+export const dynamic = "force-dynamic"
+
+const LOG_BASE_PATH = "/tmp/evals/runs"
+
+/**
+ * Neutralizes characters that could change the meaning of a path segment.
+ *
+ * Each forward slash, backslash, colon, NUL byte, `*`, `?`, `"`, `<`, `>`
+ * or `|` in the input becomes an underscore. Nothing is removed, so the
+ * result always has the same length as the input.
+ */
+function sanitizePathComponent(component: string): string {
+	const forbidden = /[/\\:\0*?"<>|]/g
+	const cleaned = component.replace(forbidden, "_")
+	return cleaned
+}
+
+/**
+ * GET handler that bundles the log files of a run's failed tasks into a zip download.
+ *
+ * @param request - Incoming request (unused; the run ID comes from the route params).
+ * @param params - Route params promise resolving to the run `id`.
+ * @returns
+ * - 200 with an `application/zip` attachment named `run-<id>-failed-logs.zip`.
+ * - 400 when the ID is not an integer or the run has no failed tasks.
+ * - 404 when no log file exists on disk, or the run lookup fails with `RecordNotFoundError`.
+ * - 500 for any other failure, including archive errors.
+ */
+export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string }> }) {
+	const { id } = await params
+
+	try {
+		const runId = Number(id)
+
+		// Number.isInteger rejects NaN as well as Infinity and fractional values.
+		if (!Number.isInteger(runId)) {
+			return NextResponse.json({ error: "Invalid run ID" }, { status: 400 })
+		}
+
+		// Verify the run exists; a missing record is expected to surface as
+		// RecordNotFoundError, which the outer catch maps to 404.
+		await findRun(runId)
+
+		// Get all tasks for this run and keep only those explicitly marked as failed
+		const tasks = await getTasks(runId)
+		const failedTasks = tasks.filter((task) => task.passed === false)
+
+		if (failedTasks.length === 0) {
+			return NextResponse.json({ error: "No failed tasks to export" }, { status: 400 })
+		}
+
+		// Resolve the set of files first so the archive is only created when there is
+		// something to put in it. Keyed by entry name: the name is built from language
+		// and exercise only, so several failed tasks for the same exercise map to one
+		// file, which must be added to the zip once rather than as duplicate entries.
+		const logDir = path.resolve(LOG_BASE_PATH, String(runId))
+		const logFiles = new Map<string, string>()
+
+		for (const task of failedTasks) {
+			// Sanitize language and exercise to prevent path traversal
+			const safeLanguage = sanitizePathComponent(task.language)
+			const safeExercise = sanitizePathComponent(task.exercise)
+			const logFileName = `${safeLanguage}-${safeExercise}.log`
+
+			if (logFiles.has(logFileName)) {
+				continue
+			}
+
+			// Defense in depth: the file must sit inside this run's directory. The trailing
+			// separator keeps sibling directories with a shared name prefix from passing.
+			const resolvedPath = path.resolve(logDir, logFileName)
+			if (!resolvedPath.startsWith(logDir + path.sep)) {
+				continue // Skip files with suspicious paths
+			}
+
+			if (fs.existsSync(resolvedPath)) {
+				logFiles.set(logFileName, resolvedPath)
+			}
+		}
+
+		if (logFiles.size === 0) {
+			return NextResponse.json(
+				{ error: "No log files found - they may have been cleared from disk" },
+				{ status: 404 },
+			)
+		}
+
+		// Create the zip archive and buffer its output in memory
+		const archive = archiver("zip", { zlib: { level: 9 } })
+		const chunks: Buffer[] = []
+
+		// Listeners are attached before any entry is queued or the archive is finalized,
+		// so no data or error event can be missed.
+		const archiveDone = new Promise<Buffer>((resolve, reject) => {
+			archive.on("data", (chunk: Buffer) => {
+				chunks.push(chunk)
+			})
+			archive.on("end", () => resolve(Buffer.concat(chunks)))
+			archive.on("error", reject)
+		})
+
+		logFiles.forEach((filePath, name) => {
+			archive.file(filePath, { name })
+		})
+
+		// Await both together so that a rejection from either side is observed
+		// (no unhandled rejection if finalize() fails before the stream ends).
+		const [zipBuffer] = await Promise.all([archiveDone, archive.finalize()])
+
+		// Return the zip file
+		return new NextResponse(zipBuffer, {
+			status: 200,
+			headers: {
+				"Content-Type": "application/zip",
+				"Content-Disposition": `attachment; filename="run-${runId}-failed-logs.zip"`,
+				"Content-Length": String(zipBuffer.length),
+			},
+		})
+	} catch (error) {
+		console.error("Error exporting failed logs:", error)
+
+		if (error instanceof Error && error.name === "RecordNotFoundError") {
+			return NextResponse.json({ error: "Run not found" }, { status: 404 })
+		}
+
+		return NextResponse.json({ error: "Failed to export logs" }, { status: 500 })
+	}
+}

+ 247 - 6
apps/web-evals/src/app/runs/[id]/run.tsx

@@ -1,9 +1,10 @@
 "use client"
 
-import { useMemo } from "react"
-import { LoaderCircle } from "lucide-react"
+import { useMemo, useState, useCallback, useEffect } from "react"
+import { toast } from "sonner"
+import { LoaderCircle, FileText, Copy, Check } from "lucide-react"
 
-import type { Run, TaskMetrics as _TaskMetrics } from "@roo-code/evals"
+import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals"
 
 import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
 import { useRunStatus } from "@/hooks/use-run-status"
@@ -17,6 +18,12 @@ import {
 	Tooltip,
 	TooltipContent,
 	TooltipTrigger,
+	Dialog,
+	DialogContent,
+	DialogHeader,
+	DialogTitle,
+	ScrollArea,
+	Button,
 } from "@/components/ui"
 
 import { TaskStatus } from "./task-status"
@@ -35,10 +42,169 @@ function getToolAbbreviation(toolName: string): string {
 		.join("")
 }
 
+// Pattern definitions for syntax highlighting of log lines.
+// Each entry pairs a global regex with the CSS class applied to what it matches.
+type HighlightPattern = {
+	pattern: RegExp
+	className: string
+	// Index of the capture group to highlight. When omitted, the entire match
+	// (regexMatch[0]) is highlighted instead.
+	wrapGroup?: number
+}
+
+// NOTE: these regex objects are shared at module level and carry the `g` flag, so they
+// hold `lastIndex` state between uses; formatLine resets it before scanning each line.
+const HIGHLIGHT_PATTERNS: HighlightPattern[] = [
+	// Timestamps [YYYY-MM-DDTHH:MM:SS.sssZ] - no wrapGroup, so the brackets are highlighted too
+	{ pattern: /\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)\]/g, className: "text-blue-400" },
+	// Log levels - only the level word between the pipes is highlighted
+	{ pattern: /\|\s*(INFO)\s*\|/g, className: "text-green-400", wrapGroup: 1 },
+	{ pattern: /\|\s*(WARN|WARNING)\s*\|/g, className: "text-yellow-400", wrapGroup: 1 },
+	{ pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400", wrapGroup: 1 },
+	{ pattern: /\|\s*(DEBUG)\s*\|/g, className: "text-gray-400", wrapGroup: 1 },
+	// Task identifiers
+	{ pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|EvalPass|EvalFail)/g, className: "text-purple-400" },
+	// Message arrows
+	{ pattern: /→/g, className: "text-cyan-400" },
+]
+
+/**
+ * Formats a single log line with syntax highlighting using React elements
+ * (XSS-safe: text is only ever inserted as React children, never as HTML).
+ *
+ * Every pattern in HIGHLIGHT_PATTERNS is scanned over the line. Matches are sorted
+ * by start offset, and a match that begins before the previously kept match ends is
+ * dropped. The line is then rebuilt as plain strings interleaved with <span>
+ * elements carrying the pattern's class name.
+ *
+ * @param line - One line of log text (without the trailing newline).
+ * @returns The line's pieces in order; `[line]` when nothing was produced (e.g. an empty line).
+ */
+function formatLine(line: string): React.ReactNode[] {
+	// Find all matches with their positions
+	type Match = { start: number; end: number; text: string; className: string }
+	const matches: Match[] = []
+
+	for (const { pattern, className, wrapGroup } of HIGHLIGHT_PATTERNS) {
+		// matchAll requires a global regex and always advances past empty matches, so a
+		// pattern that can match "" or that lacks the `g` flag can no longer hang the loop.
+		const scanner = pattern.global ? pattern : new RegExp(pattern.source, `${pattern.flags}g`)
+		// matchAll scans a private copy that starts at the source regex's lastIndex
+		scanner.lastIndex = 0
+
+		for (const regexMatch of line.matchAll(scanner)) {
+			const capturedText = wrapGroup !== undefined ? regexMatch[wrapGroup] : regexMatch[0]
+			// Skip if capture group didn't match
+			if (!capturedText) continue
+			const matchStart = regexMatch.index ?? 0
+			// The group's offset is taken as the first occurrence of its text inside the full match
+			const start = wrapGroup !== undefined ? matchStart + regexMatch[0].indexOf(capturedText) : matchStart
+			matches.push({
+				start,
+				end: start + capturedText.length,
+				text: capturedText,
+				className,
+			})
+		}
+	}
+
+	// Sort matches by position and filter overlapping ones (the earlier match wins)
+	matches.sort((a, b) => a.start - b.start)
+	const filteredMatches: Match[] = []
+	for (const m of matches) {
+		const lastMatch = filteredMatches[filteredMatches.length - 1]
+		if (!lastMatch || m.start >= lastMatch.end) {
+			filteredMatches.push(m)
+		}
+	}
+
+	// Build result with highlighted spans
+	const result: React.ReactNode[] = []
+	let currentPos = 0
+
+	for (const [i, m] of filteredMatches.entries()) {
+		// Add text before this match
+		if (m.start > currentPos) {
+			result.push(line.slice(currentPos, m.start))
+		}
+		// Add highlighted match
+		result.push(
+			<span key={`${i}-${m.start}`} className={m.className}>
+				{m.text}
+			</span>,
+		)
+		currentPos = m.end
+	}
+
+	// Add remaining text
+	if (currentPos < line.length) {
+		result.push(line.slice(currentPos))
+	}
+
+	return result.length > 0 ? result : [line]
+}
+
+/**
+ * Renders a complete log as one <div> row per line (XSS-safe - no dangerouslySetInnerHTML).
+ *
+ * The log is split on "\n". Non-empty lines are passed through formatLine for
+ * highlighting; empty lines render a single space so the row keeps its height.
+ *
+ * @param log - Full log text.
+ * @returns One keyed row element per line, in order.
+ */
+function formatLogContent(log: string): React.ReactNode[] {
+	const rows: React.ReactNode[] = []
+
+	log.split("\n").forEach((text, rowNumber) => {
+		const content = text ? formatLine(text) : " "
+		rows.push(
+			<div key={rowNumber} className="hover:bg-white/5">
+				{content}
+			</div>,
+		)
+	})
+
+	return rows
+}
+
 export function Run({ run }: { run: Run }) {
 	const runStatus = useRunStatus(run)
 	const { tasks, tokenUsage, usageUpdatedAt } = runStatus
 
+	const [selectedTask, setSelectedTask] = useState<Task | null>(null)
+	const [taskLog, setTaskLog] = useState<string | null>(null)
+	const [isLoadingLog, setIsLoadingLog] = useState(false)
+	const [copied, setCopied] = useState(false)
+
+	const onCopyLog = useCallback(async () => {
+		if (!taskLog) return
+
+		try {
+			await navigator.clipboard.writeText(taskLog)
+			setCopied(true)
+			toast.success("Log copied to clipboard")
+			setTimeout(() => setCopied(false), 2000)
+		} catch (error) {
+			console.error("Failed to copy log:", error)
+			toast.error("Failed to copy log")
+		}
+	}, [taskLog])
+
+	// Handle ESC key to close the dialog
+	useEffect(() => {
+		const handleKeyDown = (e: KeyboardEvent) => {
+			if (e.key === "Escape" && selectedTask) {
+				setSelectedTask(null)
+			}
+		}
+
+		document.addEventListener("keydown", handleKeyDown)
+		return () => document.removeEventListener("keydown", handleKeyDown)
+	}, [selectedTask])
+
+	const onViewTaskLog = useCallback(
+		async (task: Task) => {
+			// Only allow viewing logs for completed tasks
+			if (task.passed === null || task.passed === undefined) {
+				toast.error("Task is still running")
+				return
+			}
+
+			setSelectedTask(task)
+			setIsLoadingLog(true)
+			setTaskLog(null)
+
+			try {
+				const response = await fetch(`/api/runs/${run.id}/logs/${task.id}`)
+
+				if (!response.ok) {
+					const error = await response.json()
+					toast.error(error.error || "Failed to load log")
+					setSelectedTask(null)
+					return
+				}
+
+				const data = await response.json()
+				setTaskLog(data.logContent)
+			} catch (error) {
+				console.error("Error loading task log:", error)
+				toast.error("Failed to load log")
+				setSelectedTask(null)
+			} finally {
+				setIsLoadingLog(false)
+			}
+		},
+		[run.id],
+	)
+
 	const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
 		const metrics: Record<number, TaskMetrics> = {}
 
@@ -241,15 +407,33 @@ export function Run({ run }: { run: Run }) {
 						</TableHeader>
 						<TableBody>
 							{tasks.map((task) => (
-								<TableRow key={task.id}>
+								<TableRow
+									key={task.id}
+									className={task.finishedAt ? "cursor-pointer hover:bg-muted/50" : ""}
+									onClick={() => task.finishedAt && onViewTaskLog(task)}>
 									<TableCell>
 										<div className="flex items-center gap-2">
 											<TaskStatus
 												task={task}
 												running={!!task.startedAt || !!tokenUsage.get(task.id)}
 											/>
-											<div>
-												{task.language}/{task.exercise}
+											<div className="flex items-center gap-2">
+												<span>
+													{task.language}/{task.exercise}
+													{task.iteration > 1 && (
+														<span className="text-muted-foreground ml-1">
+															(#{task.iteration})
+														</span>
+													)}
+												</span>
+												{task.finishedAt && (
+													<Tooltip>
+														<TooltipTrigger asChild>
+															<FileText className="size-3 text-muted-foreground" />
+														</TooltipTrigger>
+														<TooltipContent>Click to view log</TooltipContent>
+													</Tooltip>
+												)}
 											</div>
 										</div>
 									</TableCell>
@@ -282,6 +466,63 @@ export function Run({ run }: { run: Run }) {
 					</Table>
 				)}
 			</div>
+
+			{/* Task Log Dialog - Full Screen */}
+			<Dialog open={!!selectedTask} onOpenChange={() => setSelectedTask(null)}>
+				<DialogContent className="w-[95vw] !max-w-[95vw] h-[90vh] flex flex-col">
+					<DialogHeader className="flex-shrink-0">
+						<div className="flex items-center justify-between pr-8">
+							<DialogTitle className="flex items-center gap-2">
+								<FileText className="size-4" />
+								{selectedTask?.language}/{selectedTask?.exercise}
+								{selectedTask?.iteration && selectedTask.iteration > 1 && (
+									<span className="text-muted-foreground">(#{selectedTask.iteration})</span>
+								)}
+								<span
+									className={`ml-2 text-sm ${selectedTask?.passed ? "text-green-600" : "text-red-600"}`}>
+									({selectedTask?.passed ? "Passed" : "Failed"})
+								</span>
+							</DialogTitle>
+							{taskLog && (
+								<Button
+									variant="outline"
+									size="sm"
+									onClick={onCopyLog}
+									className="flex items-center gap-1">
+									{copied ? (
+										<>
+											<Check className="size-4" />
+											Copied!
+										</>
+									) : (
+										<>
+											<Copy className="size-4" />
+											Copy Log
+										</>
+									)}
+								</Button>
+							)}
+						</div>
+					</DialogHeader>
+					<div className="flex-1 min-h-0 overflow-hidden">
+						{isLoadingLog ? (
+							<div className="flex items-center justify-center h-full">
+								<LoaderCircle className="size-6 animate-spin" />
+							</div>
+						) : taskLog ? (
+							<ScrollArea className="h-full w-full">
+								<div className="text-xs font-mono bg-muted p-4 rounded-md overflow-x-auto">
+									{formatLogContent(taskLog)}
+								</div>
+							</ScrollArea>
+						) : (
+							<div className="flex items-center justify-center h-full text-muted-foreground">
+								Log file not available (may have been cleared)
+							</div>
+						)}
+					</div>
+				</DialogContent>
+			</Dialog>
 		</>
 	)
 }

+ 181 - 23
apps/web-evals/src/app/runs/new/new-run.tsx

@@ -7,7 +7,7 @@ import { useQuery } from "@tanstack/react-query"
 import { useForm, FormProvider } from "react-hook-form"
 import { zodResolver } from "@hookform/resolvers/zod"
 import { toast } from "sonner"
-import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal } from "lucide-react"
+import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, Info } from "lucide-react"
 
 import {
 	globalSettingsSchema,
@@ -16,6 +16,7 @@ import {
 	getModelId,
 	type ProviderSettings,
 	type GlobalSettings,
+	type ReasoningEffort,
 } from "@roo-code/types"
 
 import { createRun } from "@/actions/runs"
@@ -30,6 +31,9 @@ import {
 	TIMEOUT_MIN,
 	TIMEOUT_MAX,
 	TIMEOUT_DEFAULT,
+	ITERATIONS_MIN,
+	ITERATIONS_MAX,
+	ITERATIONS_DEFAULT,
 } from "@/lib/schemas"
 import { cn } from "@/lib/utils"
 
@@ -40,6 +44,7 @@ import {
 	Button,
 	Checkbox,
 	FormControl,
+	FormDescription,
 	FormField,
 	FormItem,
 	FormLabel,
@@ -61,7 +66,14 @@ import {
 	PopoverTrigger,
 	Slider,
 	Label,
-	FormDescription,
+	Select,
+	SelectContent,
+	SelectItem,
+	SelectTrigger,
+	SelectValue,
+	Tooltip,
+	TooltipContent,
+	TooltipTrigger,
 } from "@/components/ui"
 
 import { SettingsDiff } from "./settings-diff"
@@ -78,6 +90,8 @@ export function NewRun() {
 	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo")
 	const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
 	const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true)
+	const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(true)
+	const [reasoningEffort, setReasoningEffort] = useState<ReasoningEffort | "">("")
 
 	// State for imported settings with config selection
 	const [importedSettings, setImportedSettings] = useState<ImportedSettings | null>(null)
@@ -106,6 +120,7 @@ export function NewRun() {
 			settings: undefined,
 			concurrency: CONCURRENCY_DEFAULT,
 			timeout: TIMEOUT_DEFAULT,
+			iterations: ITERATIONS_DEFAULT,
 			jobToken: "",
 		},
 	})
@@ -204,12 +219,24 @@ export function NewRun() {
 	const onSubmit = useCallback(
 		async (values: CreateRun) => {
 			try {
+				// Validate jobToken for Roo Code Cloud provider
+				if (provider === "roo" && !values.jobToken?.trim()) {
+					toast.error("Roo Code Cloud Token is required")
+					return
+				}
+
+				// Build experiments settings
+				const experimentsSettings = useMultipleNativeToolCalls
+					? { experiments: { multipleNativeToolCalls: true } }
+					: {}
+
 				if (provider === "openrouter") {
 					values.settings = {
 						...(values.settings || {}),
 						apiProvider: "openrouter",
 						openRouterModelId: model,
 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+						...experimentsSettings,
 					}
 				} else if (provider === "roo") {
 					values.settings = {
@@ -217,6 +244,20 @@ export function NewRun() {
 						apiProvider: "roo",
 						apiModelId: model,
 						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+						...experimentsSettings,
+						...(reasoningEffort
+							? {
+									enableReasoningEffort: true,
+									reasoningEffort: reasoningEffort as ReasoningEffort,
+								}
+							: {}),
+					}
+				} else if (provider === "other" && values.settings) {
+					// For imported settings, merge in experiments and tool protocol
+					values.settings = {
+						...values.settings,
+						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+						...experimentsSettings,
 					}
 				}
 
@@ -226,7 +267,7 @@ export function NewRun() {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[provider, model, router, useNativeToolProtocol],
+		[provider, model, router, useNativeToolProtocol, useMultipleNativeToolCalls, reasoningEffort],
 	)
 
 	const onSelectModel = useCallback(
@@ -394,6 +435,38 @@ export function NewRun() {
 											</div>
 										)}
 
+										<div className="mt-4 p-4 rounded-md bg-muted/30 border border-border space-y-3">
+											<Label className="text-sm font-medium text-muted-foreground">
+												Tool Protocol Options
+											</Label>
+											<div className="flex flex-col gap-2.5 pl-1">
+												<label
+													htmlFor="native-other"
+													className="flex items-center gap-2 cursor-pointer">
+													<Checkbox
+														id="native-other"
+														checked={useNativeToolProtocol}
+														onCheckedChange={(checked) =>
+															setUseNativeToolProtocol(checked === true)
+														}
+													/>
+													<span className="text-sm">Use Native Tool Calls</span>
+												</label>
+												<label
+													htmlFor="multipleNativeToolCalls-other"
+													className="flex items-center gap-2 cursor-pointer">
+													<Checkbox
+														id="multipleNativeToolCalls-other"
+														checked={useMultipleNativeToolCalls}
+														onCheckedChange={(checked) =>
+															setUseMultipleNativeToolCalls(checked === true)
+														}
+													/>
+													<span className="text-sm">Use Multiple Native Tool Calls</span>
+												</label>
+											</div>
+										</div>
+
 										{settings && (
 											<SettingsDiff defaultSettings={EVALS_SETTINGS} customSettings={settings} />
 										)}
@@ -444,15 +517,66 @@ export function NewRun() {
 											</PopoverContent>
 										</Popover>
 
-										<div className="flex items-center gap-1.5">
-											<Checkbox
-												id="native"
-												checked={useNativeToolProtocol}
-												onCheckedChange={(checked) =>
-													setUseNativeToolProtocol(checked === true)
-												}
-											/>
-											<Label htmlFor="native">Use Native Tool Calls</Label>
+										<div className="mt-4 p-4 rounded-md bg-muted/30 border border-border space-y-4">
+											<div className="space-y-3">
+												<Label className="text-sm font-medium text-muted-foreground">
+													Tool Protocol Options
+												</Label>
+												<div className="flex flex-col gap-2.5 pl-1">
+													<label
+														htmlFor="native"
+														className="flex items-center gap-2 cursor-pointer">
+														<Checkbox
+															id="native"
+															checked={useNativeToolProtocol}
+															onCheckedChange={(checked) =>
+																setUseNativeToolProtocol(checked === true)
+															}
+														/>
+														<span className="text-sm">Use Native Tool Calls</span>
+													</label>
+													<label
+														htmlFor="multipleNativeToolCalls"
+														className="flex items-center gap-2 cursor-pointer">
+														<Checkbox
+															id="multipleNativeToolCalls"
+															checked={useMultipleNativeToolCalls}
+															onCheckedChange={(checked) =>
+																setUseMultipleNativeToolCalls(checked === true)
+															}
+														/>
+														<span className="text-sm">Use Multiple Native Tool Calls</span>
+													</label>
+												</div>
+											</div>
+
+											{provider === "roo" && (
+												<div className="space-y-2 pt-2 border-t border-border">
+													<Label className="text-sm font-medium text-muted-foreground">
+														Reasoning Effort
+													</Label>
+													<Select
+														value={reasoningEffort || "none"}
+														onValueChange={(value) =>
+															setReasoningEffort(
+																value === "none" ? "" : (value as ReasoningEffort),
+															)
+														}>
+														<SelectTrigger className="w-full">
+															<SelectValue placeholder="None (default)" />
+														</SelectTrigger>
+														<SelectContent>
+															<SelectItem value="none">None (default)</SelectItem>
+															<SelectItem value="low">Low</SelectItem>
+															<SelectItem value="medium">Medium</SelectItem>
+															<SelectItem value="high">High</SelectItem>
+														</SelectContent>
+													</Select>
+													<p className="text-xs text-muted-foreground pl-1">
+														When set, enableReasoningEffort will be automatically enabled
+													</p>
+												</div>
+											)}
 										</div>
 									</>
 								)}
@@ -468,20 +592,28 @@ export function NewRun() {
 							name="jobToken"
 							render={({ field }) => (
 								<FormItem>
-									<FormLabel>Roo Code Cloud Token</FormLabel>
+									<div className="flex items-center gap-1">
+										<FormLabel>Roo Code Cloud Token</FormLabel>
+										<Tooltip>
+											<TooltipTrigger asChild>
+												<Info className="size-4 text-muted-foreground cursor-help" />
+											</TooltipTrigger>
+											<TooltipContent side="right" className="max-w-xs">
+												<p>
+													If you have access to the Roo Code Cloud repository, generate a
+													token with:
+												</p>
+												<code className="text-xs block mt-1">
+													pnpm --filter @roo-code-cloud/auth production:create-job-token [org]
+													[timeout]
+												</code>
+											</TooltipContent>
+										</Tooltip>
+									</div>
 									<FormControl>
-										<Input type="password" {...field} />
+										<Input type="password" placeholder="Required" {...field} />
 									</FormControl>
 									<FormMessage />
-									<FormDescription>
-										If you have access to the Roo Code Cloud repository then you can generate a
-										token with:
-										<br />
-										<code className="text-xs">
-											pnpm --filter @roo-code-cloud/auth production:create-job-token [org]
-											[timeout]
-										</code>
-									</FormDescription>
 								</FormItem>
 							)}
 						/>
@@ -600,6 +732,32 @@ export function NewRun() {
 						)}
 					/>
 
+					<FormField
+						control={form.control}
+						name="iterations"
+						render={({ field }) => (
+							<FormItem>
+								<FormLabel>Iterations per Exercise</FormLabel>
+								<FormControl>
+									<div className="flex flex-row items-center gap-2">
+										<Slider
+											value={[field.value]}
+											min={ITERATIONS_MIN}
+											max={ITERATIONS_MAX}
+											step={1}
+											onValueChange={(value) => {
+												field.onChange(value[0])
+											}}
+										/>
+										<div>{field.value}</div>
+									</div>
+								</FormControl>
+								<FormDescription>Run each exercise multiple times to compare results</FormDescription>
+								<FormMessage />
+							</FormItem>
+						)}
+					/>
+
 					<FormField
 						control={form.control}
 						name="description"

+ 56 - 1
apps/web-evals/src/components/home/run.tsx

@@ -1,7 +1,8 @@
 import { useCallback, useState, useRef } from "react"
 import Link from "next/link"
 import { useRouter } from "next/navigation"
-import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings } from "lucide-react"
+import { toast } from "sonner"
+import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings, FileDown } from "lucide-react"
 
 import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals"
 import type { ToolName } from "@roo-code/types"
@@ -48,9 +49,46 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 	const router = useRouter()
 	const [deleteRunId, setDeleteRunId] = useState<number>()
 	const [showSettings, setShowSettings] = useState(false)
+	const [isExportingLogs, setIsExportingLogs] = useState(false)
 	const continueRef = useRef<HTMLButtonElement>(null)
 	const { isPending, copyRun, copied } = useCopyRun(run.id)
 
+	const onExportFailedLogs = useCallback(async () => {
+		if (run.failed === 0) {
+			toast.error("No failed tasks to export")
+			return
+		}
+
+		setIsExportingLogs(true)
+		try {
+			const response = await fetch(`/api/runs/${run.id}/logs/failed`)
+
+			if (!response.ok) {
+				const error = await response.json()
+				toast.error(error.error || "Failed to export logs")
+				return
+			}
+
+			// Download the zip file
+			const blob = await response.blob()
+			const url = window.URL.createObjectURL(blob)
+			const a = document.createElement("a")
+			a.href = url
+			a.download = `run-${run.id}-failed-logs.zip`
+			document.body.appendChild(a)
+			a.click()
+			window.URL.revokeObjectURL(url)
+			document.body.removeChild(a)
+
+			toast.success("Failed logs exported successfully")
+		} catch (error) {
+			console.error("Error exporting logs:", error)
+			toast.error("Failed to export logs")
+		} finally {
+			setIsExportingLogs(false)
+		}
+	}, [run.id, run.failed])
+
 	const onConfirmDelete = useCallback(async () => {
 		if (!deleteRunId) {
 			return
@@ -161,6 +199,23 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 									</div>
 								</DropdownMenuItem>
 							)}
+							{run.failed > 0 && (
+								<DropdownMenuItem onClick={onExportFailedLogs} disabled={isExportingLogs}>
+									<div className="flex items-center gap-1">
+										{isExportingLogs ? (
+											<>
+												<LoaderCircle className="animate-spin" />
+												Exporting...
+											</>
+										) : (
+											<>
+												<FileDown />
+												Export Failed Logs
+											</>
+										)}
+									</div>
+								</DropdownMenuItem>
+							)}
 							<DropdownMenuItem
 								onClick={() => {
 									setDeleteRunId(run.id)

+ 5 - 0
apps/web-evals/src/lib/schemas.ts

@@ -14,6 +14,10 @@ export const TIMEOUT_MIN = 5
 export const TIMEOUT_MAX = 10
 export const TIMEOUT_DEFAULT = 5
 
+export const ITERATIONS_MIN = 1
+export const ITERATIONS_MAX = 10
+export const ITERATIONS_DEFAULT = 1
+
 export const createRunSchema = z
 	.object({
 		model: z.string().min(1, { message: "Model is required." }),
@@ -23,6 +27,7 @@ export const createRunSchema = z
 		settings: rooCodeSettingsSchema.optional(),
 		concurrency: z.number().int().min(CONCURRENCY_MIN).max(CONCURRENCY_MAX),
 		timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX),
+		iterations: z.number().int().min(ITERATIONS_MIN).max(ITERATIONS_MAX),
 		jobToken: z.string().optional(),
 	})
 	.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {

+ 45 - 0
packages/evals/docker-compose.override.yml

@@ -0,0 +1,45 @@
+# Development overrides - automatically loaded by docker compose
+# These settings only apply when running locally for development
+#
+# For production, use: docker compose -f docker-compose.yml up
+# (explicitly exclude override file)
+
+services:
+    web:
+        environment:
+            - NODE_ENV=development
+        volumes:
+            # Mount log files so web can access task logs
+            - /tmp/evals:/tmp/evals:ro
+            # Mount source code for hot reload in development
+            - ../../apps/web-evals:/roo/repo/apps/web-evals:delegated
+            - ../../packages/evals:/roo/repo/packages/evals:delegated
+            - ../../packages/types:/roo/repo/packages/types:delegated
+            - ../../packages/ipc:/roo/repo/packages/ipc:delegated
+            - ../../packages/cloud:/roo/repo/packages/cloud:delegated
+            # Exclude node_modules from being overwritten
+            - /roo/repo/node_modules
+            - /roo/repo/apps/web-evals/node_modules
+            - /roo/repo/packages/evals/node_modules
+            - /roo/repo/packages/types/node_modules
+            - /roo/repo/packages/ipc/node_modules
+            - /roo/repo/packages/cloud/node_modules
+        entrypoint: []
+        command:
+            - sh
+            - -c
+            - |
+                echo '🚀 Starting evals web service in development mode...'
+                wait_for_db() {
+                    echo '⏳ Waiting for database...'
+                    until pg_isready -h db -p 5432 -U postgres -d evals_development > /dev/null 2>&1; do
+                        echo '⏳ Database not ready yet, waiting 2 seconds...'
+                        sleep 2
+                    done
+                    echo '✅ Database is ready'
+                }
+                wait_for_db
+                echo '🔄 Running database migrations...'
+                pnpm --filter @roo-code/evals db:migrate
+                echo '🌐 Starting Next.js dev server...'
+                cd /roo/repo/apps/web-evals && npx next dev -p 3446

+ 3 - 0
packages/evals/docker-compose.yml

@@ -55,8 +55,11 @@ services:
             - "${EVALS_WEB_PORT:-3446}:3446"
         environment:
             - HOST_EXECUTION_METHOD=docker
+            - PRODUCTION_DATABASE_URL
         volumes:
             - /var/run/docker.sock:/var/run/docker.sock
+            # Mount log files so web can access task logs
+            - /tmp/evals:/tmp/evals:ro
         depends_on:
             db:
                 condition: service_healthy

+ 3 - 0
packages/evals/src/db/migrations/0004_sloppy_black_knight.sql

@@ -0,0 +1,3 @@
+DROP INDEX "tasks_language_exercise_idx";--> statement-breakpoint
+ALTER TABLE "tasks" ADD COLUMN "iteration" integer DEFAULT 1 NOT NULL;--> statement-breakpoint
+CREATE UNIQUE INDEX "tasks_language_exercise_iteration_idx" ON "tasks" USING btree ("run_id","language","exercise","iteration");

+ 472 - 0
packages/evals/src/db/migrations/meta/0004_snapshot.json

@@ -0,0 +1,472 @@
+{
+	"id": "9caa4487-e146-4084-907d-fbf9cc3e03b9",
+	"prevId": "853d308a-3946-4ea8-9039-236bfce3c6c0",
+	"version": "7",
+	"dialect": "postgresql",
+	"tables": {
+		"public.runs": {
+			"name": "runs",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "runs_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"task_metrics_id": {
+					"name": "task_metrics_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"model": {
+					"name": "model",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"name": {
+					"name": "name",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"description": {
+					"name": "description",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"contextWindow": {
+					"name": "contextWindow",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"inputPrice": {
+					"name": "inputPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"outputPrice": {
+					"name": "outputPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"cacheWritesPrice": {
+					"name": "cacheWritesPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"cacheReadsPrice": {
+					"name": "cacheReadsPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"settings": {
+					"name": "settings",
+					"type": "jsonb",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"jobToken": {
+					"name": "jobToken",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"pid": {
+					"name": "pid",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"socket_path": {
+					"name": "socket_path",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"concurrency": {
+					"name": "concurrency",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 2
+				},
+				"timeout": {
+					"name": "timeout",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 5
+				},
+				"passed": {
+					"name": "passed",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 0
+				},
+				"failed": {
+					"name": "failed",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 0
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {
+				"runs_task_metrics_id_taskMetrics_id_fk": {
+					"name": "runs_task_metrics_id_taskMetrics_id_fk",
+					"tableFrom": "runs",
+					"tableTo": "taskMetrics",
+					"columnsFrom": ["task_metrics_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.taskMetrics": {
+			"name": "taskMetrics",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "taskMetrics_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"tokens_in": {
+					"name": "tokens_in",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tokens_out": {
+					"name": "tokens_out",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tokens_context": {
+					"name": "tokens_context",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cache_writes": {
+					"name": "cache_writes",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cache_reads": {
+					"name": "cache_reads",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cost": {
+					"name": "cost",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"duration": {
+					"name": "duration",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tool_usage": {
+					"name": "tool_usage",
+					"type": "jsonb",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.tasks": {
+			"name": "tasks",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "tasks_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"run_id": {
+					"name": "run_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"task_metrics_id": {
+					"name": "task_metrics_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"language": {
+					"name": "language",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"exercise": {
+					"name": "exercise",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"iteration": {
+					"name": "iteration",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 1
+				},
+				"passed": {
+					"name": "passed",
+					"type": "boolean",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"started_at": {
+					"name": "started_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"finished_at": {
+					"name": "finished_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {
+				"tasks_language_exercise_iteration_idx": {
+					"name": "tasks_language_exercise_iteration_idx",
+					"columns": [
+						{
+							"expression": "run_id",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "language",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "exercise",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "iteration",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						}
+					],
+					"isUnique": true,
+					"concurrently": false,
+					"method": "btree",
+					"with": {}
+				}
+			},
+			"foreignKeys": {
+				"tasks_run_id_runs_id_fk": {
+					"name": "tasks_run_id_runs_id_fk",
+					"tableFrom": "tasks",
+					"tableTo": "runs",
+					"columnsFrom": ["run_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				},
+				"tasks_task_metrics_id_taskMetrics_id_fk": {
+					"name": "tasks_task_metrics_id_taskMetrics_id_fk",
+					"tableFrom": "tasks",
+					"tableTo": "taskMetrics",
+					"columnsFrom": ["task_metrics_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.toolErrors": {
+			"name": "toolErrors",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "toolErrors_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"run_id": {
+					"name": "run_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"task_id": {
+					"name": "task_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"tool_name": {
+					"name": "tool_name",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"error": {
+					"name": "error",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {
+				"toolErrors_run_id_runs_id_fk": {
+					"name": "toolErrors_run_id_runs_id_fk",
+					"tableFrom": "toolErrors",
+					"tableTo": "runs",
+					"columnsFrom": ["run_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				},
+				"toolErrors_task_id_tasks_id_fk": {
+					"name": "toolErrors_task_id_tasks_id_fk",
+					"tableFrom": "toolErrors",
+					"tableTo": "tasks",
+					"columnsFrom": ["task_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		}
+	},
+	"enums": {},
+	"schemas": {},
+	"sequences": {},
+	"roles": {},
+	"policies": {},
+	"views": {},
+	"_meta": {
+		"columns": {},
+		"schemas": {},
+		"tables": {}
+	}
+}

+ 7 - 0
packages/evals/src/db/migrations/meta/_journal.json

@@ -29,6 +29,13 @@
 			"when": 1763797232454,
 			"tag": "0003_simple_retro_girl",
 			"breakpoints": true
+		},
+		{
+			"idx": 4,
+			"version": "7",
+			"when": 1764201678953,
+			"tag": "0004_sloppy_black_knight",
+			"breakpoints": true
 		}
 	]
 }

+ 9 - 1
packages/evals/src/db/schema.ts

@@ -55,12 +55,20 @@ export const tasks = pgTable(
 		taskMetricsId: integer("task_metrics_id").references(() => taskMetrics.id),
 		language: text().notNull().$type<ExerciseLanguage>(),
 		exercise: text().notNull(),
+		iteration: integer().default(1).notNull(),
 		passed: boolean(),
 		startedAt: timestamp("started_at"),
 		finishedAt: timestamp("finished_at"),
 		createdAt: timestamp("created_at").notNull(),
 	},
-	(table) => [uniqueIndex("tasks_language_exercise_idx").on(table.runId, table.language, table.exercise)],
+	(table) => [
+		uniqueIndex("tasks_language_exercise_iteration_idx").on(
+			table.runId,
+			table.language,
+			table.exercise,
+			table.iteration,
+		),
+	],
 )
 
 export const tasksRelations = relations(tasks, ({ one }) => ({

+ 125 - 1
pnpm-lock.yaml

@@ -176,6 +176,9 @@ importers:
       '@tanstack/react-query':
         specifier: ^5.69.0
         version: 5.76.1([email protected])
+      archiver:
+        specifier: ^7.0.1
+        version: 7.0.1
       class-variance-authority:
         specifier: ^0.7.1
         version: 0.7.1
@@ -240,6 +243,9 @@ importers:
       '@tailwindcss/postcss':
         specifier: ^4
         version: 4.1.8
+      '@types/archiver':
+        specifier: ^7.0.0
+        version: 7.0.0
       '@types/ps-tree':
         specifier: ^1.1.6
         version: 1.1.6
@@ -3904,6 +3910,9 @@ packages:
   '@tybys/[email protected]':
     resolution: {integrity: sha512-6+7nlbMVX/PVDCwaIQ8nTOPveOcFLSt8GcXdx8hD0bt39uWxYT88uXzqTd4fTvqta7oeUJqudepapKNt2DYJFw==}
 
+  '@types/archiver@7.0.0':
+    resolution: {integrity: sha512-/3vwGwx9n+mCQdYZ2IKGGHEFL30I96UgBlk8EtRDDFQ9uxM1l4O5Ci6r00EMAkiDaTqD9DQ6nVrWRICnBPtzzg==}
+
   '@types/[email protected]':
     resolution: {integrity: sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==}
 
@@ -4142,6 +4151,9 @@ packages:
   '@types/[email protected]':
     resolution: {integrity: sha512-/LDXMQh55EzZQ0uVAZmKKhfENivEvWz6E+EYzh+/MCjMhNsotd+ZHhBGIjFDTi6+fz0OhQQQLbTgdQIxxCsC0w==}
 
+  '@types/readdir-glob@1.1.5':
+    resolution: {integrity: sha512-raiuEPUYqXu+nvtY2Pe8s8FEmZ3x5yAH4VkLdihcPdalvsHltomrRC9BzuStrJ9yk06470hS0Crw0f1pXqD+Hg==}
+
   '@types/[email protected]':
     resolution: {integrity: sha512-3xSjTp3v03X/lSQLkczaN9UIEwJMoMCA1+Nb5HfbJEQWogdeQIyVtTvxPXDQjZ5zws8rFQfVfRdz03ARihPJgw==}
 
@@ -4471,10 +4483,18 @@ packages:
     resolution: {integrity: sha512-KVgf4XQVrTjhyWmx6cte4RxonPLR9onExufI1jhvw/MQ4BB6IsZD5gT8Lq+u/+pRkWna/6JoHpiQioaqFP5Rzw==}
     engines: {node: '>= 10'}
 
+  archiver-utils@5.0.2:
+    resolution: {integrity: sha512-wuLJMmIBQYCsGZgYLTy5FIB2pF6Lfb6cXMSF8Qywwk3t20zWnAi7zLcQFdKQmIB8wyZpY5ER38x08GbwtR2cLA==}
+    engines: {node: '>= 14'}
+
   [email protected]:
     resolution: {integrity: sha512-+25nxyyznAXF7Nef3y0EbBeqmGZgeN/BxHX29Rs39djAfaFalmQ89SE6CWyDCHzGL0yt/ycBtNOmGTW0FyGWNw==}
     engines: {node: '>= 10'}
 
+  archiver@7.0.1:
+    resolution: {integrity: sha512-ZcbTaIqJOfCc03QwD468Unz/5Ir8ATtvAHsK+FdXbDIbGfihqh9mrvdcYunQzqn4HrvWWaFyaxJhGZagaJJpPQ==}
+    engines: {node: '>= 14'}
+
   [email protected]:
     resolution: {integrity: sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==}
 
@@ -4676,6 +4696,10 @@ packages:
   [email protected]:
     resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
 
+  buffer-crc32@1.0.0:
+    resolution: {integrity: sha512-Db1SbgBS/fg/392AblrMJk97KggmvYhr4pB5ZIMTWtaivCPMWLkmb7m21cJvpvgK+J3nsU2CmmixNBZx4vFj/w==}
+    engines: {node: '>=8.0.0'}
+
   [email protected]:
     resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==}
 
@@ -4689,6 +4713,9 @@ packages:
   [email protected]:
     resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==}
 
+  buffer@6.0.3:
+    resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==}
+
   [email protected]:
     resolution: {integrity: sha512-9q/rDEGSb/Qsvv2qvzIzdluL5k7AaJOTrw23z9reQthrbF7is4CtlT0DXyO1oei2DCp4uojjzQ7igaSHp1kAEQ==}
     engines: {node: '>=0.2.0'}
@@ -4978,6 +5005,10 @@ packages:
     resolution: {integrity: sha512-D3uMHtGc/fcO1Gt1/L7i1e33VOvD4A9hfQLP+6ewd+BvG/gQ84Yh4oftEhAdjSMgBgwGL+jsppT7JYNpo6MHHg==}
     engines: {node: '>= 10'}
 
+  compress-commons@6.0.2:
+    resolution: {integrity: sha512-6FqVXeETqWPoGcfzrXb37E50NP0LXT8kAMu5ooZayhWWdgEY4lBEEcbQNXtkuKQsGduxiIcI4gOTsxTmuq/bSg==}
+    engines: {node: '>= 14'}
+
   [email protected]:
     resolution: {integrity: sha512-RMtmw0iFkeR4YV+fUOSucriAQNb9g8zFR52MWCtl+cCZOFRNL6zeB395vPzFhEjjn4fMxXudmELnl/KF/WrK6w==}
 
@@ -5042,6 +5073,10 @@ packages:
     resolution: {integrity: sha512-NT7w2JVU7DFroFdYkeq8cywxrgjPHWkdX1wjpRQXPX5Asews3tA+Ght6lddQO5Mkumffp3X7GEqku3epj2toIw==}
     engines: {node: '>= 10'}
 
+  crc32-stream@6.0.0:
+    resolution: {integrity: sha512-piICUB6ei4IlTv1+653yq5+KoqfBYmj9bw6LqXoOneTMDXk5nM1qt12mFW1caG3LlJXEKW1Bp0WggEmIfQB34g==}
+    engines: {node: '>= 14'}
+
   [email protected]:
     resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==}
 
@@ -5896,6 +5931,10 @@ packages:
   [email protected]:
     resolution: {integrity: sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==}
 
+  events@3.3.0:
+    resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==}
+    engines: {node: '>=0.8.x'}
+
   [email protected]:
     resolution: {integrity: sha512-6RxOBZ/cYgd8usLwsEl+EC09Au/9BcmCKYF2/xbml6DNczf7nv0MQb+7BA2F+li6//I+28VNlQR37XfQtcAJuA==}
     engines: {node: '>=18.0.0'}
@@ -8362,6 +8401,10 @@ packages:
   [email protected]:
     resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==}
 
+  process@0.11.10:
+    resolution: {integrity: sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==}
+    engines: {node: '>= 0.6.0'}
+
   [email protected]:
     resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==}
     engines: {node: '>=0.4.0'}
@@ -8614,6 +8657,10 @@ packages:
     resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
     engines: {node: '>= 6'}
 
+  readable-stream@4.7.0:
+    resolution: {integrity: sha512-oIGGmcpTLwPga8Bn6/Z75SVaH1z5dUut2ibSyAMVhmUggWpmDn2dapB0n7f8nwaSiRtepAsfJyfXIO5DCVAODg==}
+    engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
+
   [email protected]:
     resolution: {integrity: sha512-v05I2k7xN8zXvPD9N+z/uhXPaj0sUFCe2rcWZIpBsqxfP7xXFQ0tipAd/wjj1YxWyWtUS5IDJpOG82JKt2EAVA==}
 
@@ -9180,6 +9227,9 @@ packages:
   [email protected]:
     resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==}
 
+  string_decoder@1.3.0:
+    resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
+
   [email protected]:
     resolution: {integrity: sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==}
 
@@ -10271,6 +10321,10 @@ packages:
     resolution: {integrity: sha512-9qv4rlDiopXg4E69k+vMHjNN63YFMe9sZMrdlvKnCjlCRWeCBswPPMPUfx+ipsAWq1LXHe70RcbaHdJJpS6hyQ==}
     engines: {node: '>= 10'}
 
+  zip-stream@6.0.1:
+    resolution: {integrity: sha512-zK7YHHz4ZXpW89AHXUPbQVGKI7uvkd3hzusTdotCg1UxyaVtg0zFJSTfW/Dq5f7OBBVnq6cZIaC8Ti4hb6dtCA==}
+    engines: {node: '>= 14'}
+
   [email protected]:
     resolution: {integrity: sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==}
     peerDependencies:
@@ -13557,6 +13611,10 @@ snapshots:
       tslib: 2.8.1
     optional: true
 
+  '@types/archiver@7.0.0':
+    dependencies:
+      '@types/readdir-glob': 1.1.5
+
   '@types/[email protected]': {}
 
   '@types/[email protected]':
@@ -13831,6 +13889,10 @@ snapshots:
       '@types/prop-types': 15.7.14
       csstype: 3.1.3
 
+  '@types/readdir-glob@1.1.5':
+    dependencies:
+      '@types/node': 24.2.1
+
   '@types/[email protected]': {}
 
   '@types/[email protected]': {}
@@ -14052,7 +14114,7 @@ snapshots:
       sirv: 3.0.1
       tinyglobby: 0.2.14
       tinyrainbow: 2.0.0
-      vitest: 3.2.4(@types/[email protected])(@types/node@20.17.57)(@vitest/[email protected])([email protected])([email protected])([email protected])([email protected])([email protected])
+      vitest: 3.2.4(@types/[email protected])(@types/node@24.2.1)(@vitest/[email protected])([email protected])([email protected])([email protected])([email protected])([email protected])
 
   '@vitest/[email protected]':
     dependencies:
@@ -14262,6 +14324,16 @@ snapshots:
       normalize-path: 3.0.0
       readable-stream: 3.6.2
 
+  archiver-utils@5.0.2:
+    dependencies:
+      glob: 11.1.0
+      graceful-fs: 4.2.11
+      is-stream: 2.0.1
+      lazystream: 1.0.1
+      lodash: 4.17.21
+      normalize-path: 3.0.0
+      readable-stream: 4.7.0
+
   [email protected]:
     dependencies:
       archiver-utils: 2.1.0
@@ -14272,6 +14344,16 @@ snapshots:
       tar-stream: 2.2.0
       zip-stream: 4.1.1
 
+  archiver@7.0.1:
+    dependencies:
+      archiver-utils: 5.0.2
+      async: 3.2.6
+      buffer-crc32: 1.0.0
+      readable-stream: 4.7.0
+      readdir-glob: 1.1.3
+      tar-stream: 3.1.7
+      zip-stream: 6.0.1
+
   [email protected]: {}
 
   [email protected]:
@@ -14502,6 +14584,8 @@ snapshots:
 
   [email protected]: {}
 
+  buffer-crc32@1.0.0: {}
+
   [email protected]: {}
 
   [email protected]: {}
@@ -14513,6 +14597,11 @@ snapshots:
       base64-js: 1.5.1
       ieee754: 1.2.1
 
+  buffer@6.0.3:
+    dependencies:
+      base64-js: 1.5.1
+      ieee754: 1.2.1
+
   [email protected]: {}
 
   [email protected]:
@@ -14823,6 +14912,14 @@ snapshots:
       normalize-path: 3.0.0
       readable-stream: 3.6.2
 
+  compress-commons@6.0.2:
+    dependencies:
+      crc-32: 1.2.2
+      crc32-stream: 6.0.0
+      is-stream: 2.0.1
+      normalize-path: 3.0.0
+      readable-stream: 4.7.0
+
   [email protected]: {}
 
   [email protected]: {}
@@ -14881,6 +14978,11 @@ snapshots:
       crc-32: 1.2.2
       readable-stream: 3.6.2
 
+  crc32-stream@6.0.0:
+    dependencies:
+      crc-32: 1.2.2
+      readable-stream: 4.7.0
+
   [email protected]:
     dependencies:
       node-fetch: 2.7.0
@@ -15789,6 +15891,8 @@ snapshots:
 
   [email protected]: {}
 
+  events@3.3.0: {}
+
   [email protected]: {}
 
   [email protected]:
@@ -18692,6 +18796,8 @@ snapshots:
 
   [email protected]: {}
 
+  process@0.11.10: {}
+
   [email protected]: {}
 
   [email protected]:
@@ -19021,6 +19127,14 @@ snapshots:
       string_decoder: 1.1.1
       util-deprecate: 1.0.2
 
+  readable-stream@4.7.0:
+    dependencies:
+      abort-controller: 3.0.0
+      buffer: 6.0.3
+      events: 3.3.0
+      process: 0.11.10
+      string_decoder: 1.3.0
+
   [email protected]:
     dependencies:
       minimatch: 5.1.6
@@ -19766,6 +19880,10 @@ snapshots:
     dependencies:
       safe-buffer: 5.1.2
 
+  string_decoder@1.3.0:
+    dependencies:
+      safe-buffer: 5.2.1
+
   [email protected]:
     dependencies:
       character-entities-html4: 2.1.0
@@ -21059,6 +21177,12 @@ snapshots:
       compress-commons: 4.1.2
       readable-stream: 3.6.2
 
+  zip-stream@6.0.1:
+    dependencies:
+      archiver-utils: 5.0.2
+      compress-commons: 6.0.2
+      readable-stream: 4.7.0
+
   [email protected]([email protected]):
     dependencies:
       zod: 3.25.61