Просмотр исходного кода

feat(evals-ui): Add filtering, bulk delete, tool consolidation, and run notes (#9837)

Hannes Rudolph 4 недель назад
Родитель
Коммит
8433eafb05

+ 153 - 0
apps/web-evals/src/actions/runs.ts

@@ -13,6 +13,9 @@ import {
 	exerciseLanguages,
 	createRun as _createRun,
 	deleteRun as _deleteRun,
+	updateRun as _updateRun,
+	getIncompleteRuns as _getIncompleteRuns,
+	deleteRunsByIds as _deleteRunsByIds,
 	createTask,
 	getExercisesForLanguage,
 } from "@roo-code/evals"
@@ -20,6 +23,9 @@ import {
 import { CreateRun } from "@/lib/schemas"
 import { redisClient } from "@/lib/server/redis"
 
+// Storage base path for eval logs
+const EVALS_STORAGE_PATH = "/tmp/evals/runs"
+
 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
 export async function createRun({ suite, exercises = [], timeout, iterations = 1, ...values }: CreateRun) {
@@ -214,3 +220,150 @@ export async function killRun(runId: number): Promise<KillRunResult> {
 		errors,
 	}
 }
+
+/**
+ * Result of a bulk-delete server action (`deleteIncompleteRuns`, `deleteOldRuns`).
+ *
+ * Storage cleanup problems are reported in `storageErrors`; they do not fail the action.
+ */
+export type DeleteIncompleteRunsResult = {
+	/** Always `true` when the action returns; database failures surface as thrown errors. */
+	success: boolean
+	/** Number of run ids that were passed to the database delete. */
+	deletedCount: number
+	/** Ids of the runs that were deleted. */
+	deletedRunIds: number[]
+	/** One message per run whose storage folder could not be removed. */
+	storageErrors: string[]
+}
+
+/** Age, in milliseconds, beyond which `deleteOldRuns` treats a run as old (30 days). */
+const OLD_RUN_MAX_AGE_MS = 30 * 24 * 60 * 60 * 1000
+
+/**
+ * Best-effort cleanup of the storage folder and Redis keys of each run.
+ *
+ * The Redis client is resolved once for the whole batch instead of once per
+ * run. Redis failures are logged and ignored; storage failures are logged and
+ * reported back to the caller.
+ *
+ * @param runIds - Ids of the runs whose artifacts should be removed.
+ * @returns One message per run whose storage folder could not be deleted.
+ */
+async function purgeRunArtifacts(runIds: number[]): Promise<string[]> {
+	const storageErrors: string[] = []
+
+	let redis: Awaited<ReturnType<typeof redisClient>> | undefined
+	try {
+		redis = await redisClient()
+	} catch (error) {
+		// Non-critical error, just log it and skip the Redis part of the cleanup
+		console.error("Failed to get Redis client, skipping Redis cleanup:", error)
+	}
+
+	for (const runId of runIds) {
+		const storagePath = path.join(EVALS_STORAGE_PATH, String(runId))
+		try {
+			if (fs.existsSync(storagePath)) {
+				fs.rmSync(storagePath, { recursive: true, force: true })
+				console.log(`Deleted storage folder: ${storagePath}`)
+			}
+		} catch (error) {
+			console.error(`Failed to delete storage folder ${storagePath}:`, error)
+			storageErrors.push(`Failed to delete storage for run ${runId}`)
+		}
+
+		if (!redis) {
+			continue
+		}
+
+		// Clear the heartbeat/runners keys in case the run is still registered as running
+		try {
+			await redis.del(`heartbeat:${runId}`)
+			await redis.del(`runners:${runId}`)
+		} catch (error) {
+			// Non-critical error, just log it
+			console.error(`Failed to clear Redis state for run ${runId}:`, error)
+		}
+	}
+
+	return storageErrors
+}
+
+/**
+ * Shared implementation of the bulk-delete actions.
+ *
+ * Cleans up storage and Redis state, deletes the database records and
+ * revalidates `/runs`. Returns early, without touching anything, when
+ * `runIds` is empty.
+ *
+ * @param runIds - Ids of the runs to delete.
+ * @returns Summary of what was deleted plus any storage cleanup errors.
+ */
+async function deleteRunsWithArtifacts(runIds: number[]): Promise<DeleteIncompleteRunsResult> {
+	if (runIds.length === 0) {
+		return { success: true, deletedCount: 0, deletedRunIds: [], storageErrors: [] }
+	}
+
+	const storageErrors = await purgeRunArtifacts(runIds)
+
+	// Delete from database
+	await _deleteRunsByIds(runIds)
+
+	revalidatePath("/runs")
+
+	return { success: true, deletedCount: runIds.length, deletedRunIds: runIds, storageErrors }
+}
+
+/**
+ * Delete all incomplete runs, i.e. every run returned by `getIncompleteRuns`
+ * (runs without a taskMetricsId/final score).
+ * Removes both database records and storage folders.
+ *
+ * @returns Summary of the deleted runs and any storage cleanup errors.
+ */
+export async function deleteIncompleteRuns(): Promise<DeleteIncompleteRunsResult> {
+	const incompleteRuns = await _getIncompleteRuns()
+	return deleteRunsWithArtifacts(incompleteRuns.map((run) => run.id))
+}
+
+/**
+ * Get count of incomplete runs (for UI display)
+ */
+export async function getIncompleteRunsCount(): Promise<number> {
+	const incompleteRuns = await _getIncompleteRuns()
+	return incompleteRuns.length
+}
+
+/**
+ * Delete all runs older than 30 days.
+ * Removes both database records and storage folders.
+ *
+ * @returns Summary of the deleted runs and any storage cleanup errors.
+ */
+export async function deleteOldRuns(): Promise<DeleteIncompleteRunsResult> {
+	const cutoff = new Date(Date.now() - OLD_RUN_MAX_AGE_MS)
+
+	// `getRuns` is obtained via a dynamic import of "@roo-code/evals".
+	// NOTE(review): filtering happens in memory over every run — confirm the table stays small.
+	const { getRuns } = await import("@roo-code/evals")
+	const allRuns = await getRuns()
+	const oldRunIds = allRuns.filter((run) => run.createdAt < cutoff).map((run) => run.id)
+
+	return deleteRunsWithArtifacts(oldRunIds)
+}
+
+/**
+ * Set the description of run `runId`; failures are logged and reported as `{ success: false }`.
+ */
+export async function updateRunDescription(runId: number, description: string | null): Promise<{ success: boolean }> {
+	let success = false
+	try {
+		await _updateRun(runId, { description })
+		;["/runs", `/runs/${runId}`].forEach((route) => revalidatePath(route))
+		success = true
+	} catch (failure) {
+		console.error("Failed to update run description:", failure)
+	}
+	return { success }
+}

+ 225 - 73
apps/web-evals/src/components/home/run.tsx

@@ -2,12 +2,12 @@ import { useCallback, useState, useRef } from "react"
 import Link from "next/link"
 import { useRouter } from "next/navigation"
 import { toast } from "sonner"
-import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings, FileDown } from "lucide-react"
+import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings, FileDown, StickyNote } from "lucide-react"
 
 import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals"
 import type { ToolName } from "@roo-code/types"
 
-import { deleteRun } from "@/actions/runs"
+import { deleteRun, updateRunDescription } from "@/actions/runs"
 import {
 	formatCurrency,
 	formatDateTime,
@@ -20,6 +20,10 @@ import {
 	Button,
 	TableCell,
 	TableRow,
+	Textarea,
+	Tooltip,
+	TooltipContent,
+	TooltipTrigger,
 	DropdownMenu,
 	DropdownMenuContent,
 	DropdownMenuItem,
@@ -34,6 +38,7 @@ import {
 	AlertDialogTitle,
 	Dialog,
 	DialogContent,
+	DialogFooter,
 	DialogHeader,
 	DialogTitle,
 	ScrollArea,
@@ -43,16 +48,41 @@ type RunProps = {
 	run: EvalsRun
 	taskMetrics: EvalsTaskMetrics | null
 	toolColumns: ToolName[]
+	consolidatedToolColumns: string[]
 }
 
-export function Run({ run, taskMetrics, toolColumns }: RunProps) {
+export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: RunProps) {
 	const router = useRouter()
 	const [deleteRunId, setDeleteRunId] = useState<number>()
 	const [showSettings, setShowSettings] = useState(false)
 	const [isExportingLogs, setIsExportingLogs] = useState(false)
+	const [showNotesDialog, setShowNotesDialog] = useState(false)
+	const [editingDescription, setEditingDescription] = useState(run.description ?? "")
+	const [isSavingNotes, setIsSavingNotes] = useState(false)
 	const continueRef = useRef<HTMLButtonElement>(null)
 	const { isPending, copyRun, copied } = useCopyRun(run.id)
 
+	const hasDescription = Boolean(run.description && run.description.trim().length > 0)
+
+	const handleSaveDescription = useCallback(async () => {
+		setIsSavingNotes(true)
+		try {
+			const result = await updateRunDescription(run.id, editingDescription.trim() || null)
+			if (result.success) {
+				toast.success("Description saved")
+				setShowNotesDialog(false)
+				router.refresh()
+			} else {
+				toast.error("Failed to save description")
+			}
+		} catch (error) {
+			console.error("Error saving description:", error)
+			toast.error("Failed to save description")
+		} finally {
+			setIsSavingNotes(false)
+		}
+	}, [run.id, editingDescription, router])
+
 	const onExportFailedLogs = useCallback(async () => {
 		if (run.failed === 0) {
 			toast.error("No failed tasks to export")
@@ -140,6 +170,68 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 						</div>
 					)}
 				</TableCell>
+				{consolidatedToolColumns.length > 0 && (
+					<TableCell className="text-xs text-center">
+						{taskMetrics?.toolUsage ? (
+							(() => {
+								// Calculate aggregated stats for consolidated tools
+								let totalAttempts = 0
+								let totalFailures = 0
+								const breakdown: Array<{ tool: string; attempts: number; rate: string }> = []
+
+								for (const toolName of consolidatedToolColumns) {
+									const usage = taskMetrics.toolUsage[toolName as ToolName]
+									if (usage) {
+										totalAttempts += usage.attempts
+										totalFailures += usage.failures
+										const rate =
+											usage.attempts > 0
+												? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%`
+												: "0%"
+										breakdown.push({ tool: toolName, attempts: usage.attempts, rate })
+									}
+								}
+
+								const consolidatedRate =
+									totalAttempts > 0 ? ((totalAttempts - totalFailures) / totalAttempts) * 100 : 100
+								const rateColor =
+									consolidatedRate === 100
+										? "text-muted-foreground"
+										: consolidatedRate >= 80
+											? "text-yellow-500"
+											: "text-red-500"
+
+								return totalAttempts > 0 ? (
+									<Tooltip>
+										<TooltipTrigger>
+											<div className="flex flex-col items-center">
+												<span className="font-medium">{totalAttempts}</span>
+												<span className={rateColor}>{Math.round(consolidatedRate)}%</span>
+											</div>
+										</TooltipTrigger>
+										<TooltipContent>
+											<div className="text-xs">
+												<div className="font-semibold mb-1">Consolidated Tools:</div>
+												{breakdown.map(({ tool, attempts, rate }) => (
+													<div key={tool} className="flex justify-between gap-4">
+														<span>{tool}:</span>
+														<span>
+															{attempts} ({rate})
+														</span>
+													</div>
+												))}
+											</div>
+										</TooltipContent>
+									</Tooltip>
+								) : (
+									<span className="text-muted-foreground">-</span>
+								)
+							})()
+						) : (
+							<span className="text-muted-foreground">-</span>
+						)}
+					</TableCell>
+				)}
 				{toolColumns.map((toolName) => {
 					const usage = taskMetrics?.toolUsage?.[toolName]
 					const successRate =
@@ -166,80 +258,107 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 				<TableCell>{taskMetrics && formatCurrency(taskMetrics.cost)}</TableCell>
 				<TableCell>{taskMetrics && formatDuration(taskMetrics.duration)}</TableCell>
 				<TableCell onClick={(e) => e.stopPropagation()}>
-					<DropdownMenu>
-						<Button variant="ghost" size="icon" asChild>
-							<DropdownMenuTrigger data-dropdown-trigger>
-								<Ellipsis />
-							</DropdownMenuTrigger>
-						</Button>
-						<DropdownMenuContent align="end">
-							<DropdownMenuItem asChild>
-								<Link href={`/runs/${run.id}`}>
-									<div className="flex items-center gap-1">
-										<ClipboardList />
-										<div>View Tasks</div>
-									</div>
-								</Link>
-							</DropdownMenuItem>
-							{run.settings && (
-								<DropdownMenuItem onClick={() => setShowSettings(true)}>
-									<div className="flex items-center gap-1">
-										<Settings />
-										<div>View Settings</div>
-									</div>
-								</DropdownMenuItem>
-							)}
-							{run.taskMetricsId && (
-								<DropdownMenuItem onClick={() => copyRun()} disabled={isPending || copied}>
-									<div className="flex items-center gap-1">
-										{isPending ? (
-											<>
-												<LoaderCircle className="animate-spin" />
-												Copying...
-											</>
-										) : copied ? (
-											<>
-												<Check />
-												Copied!
-											</>
-										) : (
-											<>
-												<Copy />
-												Copy to Production
-											</>
-										)}
-									</div>
+					<div className="flex items-center gap-1">
+						{/* Note Icon */}
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<Button
+									variant="ghost"
+									size="icon"
+									className={hasDescription ? "" : "opacity-30 hover:opacity-60"}
+									onClick={(e) => {
+										e.stopPropagation()
+										setEditingDescription(run.description ?? "")
+										setShowNotesDialog(true)
+									}}>
+									<StickyNote className="h-4 w-4" />
+								</Button>
+							</TooltipTrigger>
+							<TooltipContent className="max-w-[300px]">
+								{hasDescription ? (
+									<div className="whitespace-pre-wrap">{run.description}</div>
+								) : (
+									<div className="text-muted-foreground">No description. Click to add one.</div>
+								)}
+							</TooltipContent>
+						</Tooltip>
+
+						{/* More Actions Menu */}
+						<DropdownMenu>
+							<Button variant="ghost" size="icon" asChild>
+								<DropdownMenuTrigger data-dropdown-trigger>
+									<Ellipsis />
+								</DropdownMenuTrigger>
+							</Button>
+							<DropdownMenuContent align="end">
+								<DropdownMenuItem asChild>
+									<Link href={`/runs/${run.id}`}>
+										<div className="flex items-center gap-1">
+											<ClipboardList />
+											<div>View Tasks</div>
+										</div>
+									</Link>
 								</DropdownMenuItem>
-							)}
-							{run.failed > 0 && (
-								<DropdownMenuItem onClick={onExportFailedLogs} disabled={isExportingLogs}>
+								{run.settings && (
+									<DropdownMenuItem onClick={() => setShowSettings(true)}>
+										<div className="flex items-center gap-1">
+											<Settings />
+											<div>View Settings</div>
+										</div>
+									</DropdownMenuItem>
+								)}
+								{run.taskMetricsId && (
+									<DropdownMenuItem onClick={() => copyRun()} disabled={isPending || copied}>
+										<div className="flex items-center gap-1">
+											{isPending ? (
+												<>
+													<LoaderCircle className="animate-spin" />
+													Copying...
+												</>
+											) : copied ? (
+												<>
+													<Check />
+													Copied!
+												</>
+											) : (
+												<>
+													<Copy />
+													Copy to Production
+												</>
+											)}
+										</div>
+									</DropdownMenuItem>
+								)}
+								{run.failed > 0 && (
+									<DropdownMenuItem onClick={onExportFailedLogs} disabled={isExportingLogs}>
+										<div className="flex items-center gap-1">
+											{isExportingLogs ? (
+												<>
+													<LoaderCircle className="animate-spin" />
+													Exporting...
+												</>
+											) : (
+												<>
+													<FileDown />
+													Export Failed Logs
+												</>
+											)}
+										</div>
+									</DropdownMenuItem>
+								)}
+								<DropdownMenuItem
+									onClick={() => {
+										setDeleteRunId(run.id)
+										setTimeout(() => continueRef.current?.focus(), 0)
+									}}>
 									<div className="flex items-center gap-1">
-										{isExportingLogs ? (
-											<>
-												<LoaderCircle className="animate-spin" />
-												Exporting...
-											</>
-										) : (
-											<>
-												<FileDown />
-												Export Failed Logs
-											</>
-										)}
+										<Trash />
+										<div>Delete</div>
 									</div>
 								</DropdownMenuItem>
-							)}
-							<DropdownMenuItem
-								onClick={() => {
-									setDeleteRunId(run.id)
-									setTimeout(() => continueRef.current?.focus(), 0)
-								}}>
-								<div className="flex items-center gap-1">
-									<Trash />
-									<div>Delete</div>
-								</div>
-							</DropdownMenuItem>
-						</DropdownMenuContent>
-					</DropdownMenu>
+							</DropdownMenuContent>
+						</DropdownMenu>
+					</div>
 				</TableCell>
 			</TableRow>
 			<AlertDialog open={!!deleteRunId} onOpenChange={() => setDeleteRunId(undefined)}>
@@ -268,6 +387,39 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
 					</ScrollArea>
 				</DialogContent>
 			</Dialog>
+
+			{/* Notes/Description Dialog */}
+			<Dialog open={showNotesDialog} onOpenChange={setShowNotesDialog}>
+				<DialogContent className="max-w-lg">
+					<DialogHeader>
+						<DialogTitle>Run Description</DialogTitle>
+					</DialogHeader>
+					<div className="space-y-4">
+						<Textarea
+							placeholder="Add a description or notes for this run..."
+							value={editingDescription}
+							onChange={(e) => setEditingDescription(e.target.value)}
+							rows={4}
+							className="resize-none"
+						/>
+					</div>
+					<DialogFooter>
+						<Button variant="outline" onClick={() => setShowNotesDialog(false)}>
+							Cancel
+						</Button>
+						<Button onClick={handleSaveDescription} disabled={isSavingNotes}>
+							{isSavingNotes ? (
+								<>
+									<LoaderCircle className="h-4 w-4 mr-2 animate-spin" />
+									Saving...
+								</>
+							) : (
+								"Save"
+							)}
+						</Button>
+					</DialogFooter>
+				</DialogContent>
+			</Dialog>
 		</>
 	)
 }

+ 511 - 20
apps/web-evals/src/components/home/runs.tsx

@@ -1,14 +1,45 @@
 "use client"
 
-import { useMemo, useState } from "react"
+import { useCallback, useEffect, useMemo, useState } from "react"
 import { useRouter } from "next/navigation"
-import { ArrowDown, ArrowUp, ArrowUpDown, Rocket } from "lucide-react"
+import {
+	ArrowDown,
+	ArrowUp,
+	ArrowUpDown,
+	Combine,
+	Ellipsis,
+	LoaderCircle,
+	Rocket,
+	RotateCcw,
+	Trash2,
+	X,
+} from "lucide-react"
+import { toast } from "sonner"
 
 import type { Run, TaskMetrics } from "@roo-code/evals"
 import type { ToolName } from "@roo-code/types"
 
+import { deleteIncompleteRuns, deleteOldRuns } from "@/actions/runs"
 import {
+	AlertDialog,
+	AlertDialogAction,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
 	Button,
+	DropdownMenu,
+	DropdownMenuContent,
+	DropdownMenuItem,
+	DropdownMenuTrigger,
+	MultiSelect,
+	Select,
+	SelectContent,
+	SelectItem,
+	SelectTrigger,
+	SelectValue,
 	Table,
 	TableBody,
 	TableCell,
@@ -26,6 +57,41 @@ type RunWithTaskMetrics = Run & { taskMetrics: TaskMetrics | null }
 type SortColumn = "model" | "provider" | "passed" | "failed" | "percent" | "cost" | "duration" | "createdAt"
 type SortDirection = "asc" | "desc"
 
+/** Identifiers of the timeframe filter choices. */
+type TimeframeOption = "all" | "24h" | "7d" | "30d" | "90d"
+
+/** Choices rendered in the timeframe dropdown, in display order. */
+const TIMEFRAME_OPTIONS: { value: TimeframeOption; label: string }[] = [
+	{ value: "all", label: "All time" },
+	{ value: "24h", label: "Last 24 hours" },
+	{ value: "7d", label: "Last 7 days" },
+	{ value: "30d", label: "Last 30 days" },
+	{ value: "90d", label: "Last 90 days" },
+]
+
+/** localStorage keys under which the filter and consolidation state is persisted. */
+const STORAGE_KEYS = {
+	TIMEFRAME: "evals-runs-timeframe",
+	MODEL_FILTER: "evals-runs-model-filter",
+	PROVIDER_FILTER: "evals-runs-provider-filter",
+	CONSOLIDATED_TOOLS: "evals-runs-consolidated-tools",
+}
+
+/** Length, in milliseconds, of every bounded timeframe window ("all" has none). */
+const TIMEFRAME_WINDOW_MS = new Map<string, number>([
+	["24h", 24 * 60 * 60 * 1000],
+	["7d", 7 * 24 * 60 * 60 * 1000],
+	["30d", 30 * 24 * 60 * 60 * 1000],
+	["90d", 90 * 24 * 60 * 60 * 1000],
+])
+
+/** Returns the earliest creation date a run may have for `timeframe`, or `null` for no lower bound. */
+function getTimeframeStartDate(timeframe: TimeframeOption): Date | null {
+	const windowMs = TIMEFRAME_WINDOW_MS.get(timeframe)
+	// "all" and any unrecognized key have no window, which means no lower bound
+	return windowMs === undefined ? null : new Date(Date.now() - windowMs)
+}
+
 // Generate abbreviation from tool name (e.g., "read_file" -> "RF", "list_code_definition_names" -> "LCDN")
 function getToolAbbreviation(toolName: string): string {
 	return toolName
@@ -54,6 +120,109 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 	const [sortColumn, setSortColumn] = useState<SortColumn | null>("createdAt")
 	const [sortDirection, setSortDirection] = useState<SortDirection>("desc")
 
+	// Filter state - initialize from localStorage
+	const [timeframeFilter, setTimeframeFilter] = useState<TimeframeOption>(() => {
+		if (typeof window === "undefined") return "all"
+		const stored = localStorage.getItem(STORAGE_KEYS.TIMEFRAME)
+		return (stored as TimeframeOption) || "all"
+	})
+	const [modelFilter, setModelFilter] = useState<string[]>(() => {
+		if (typeof window === "undefined") return []
+		const stored = localStorage.getItem(STORAGE_KEYS.MODEL_FILTER)
+		return stored ? JSON.parse(stored) : []
+	})
+	const [providerFilter, setProviderFilter] = useState<string[]>(() => {
+		if (typeof window === "undefined") return []
+		const stored = localStorage.getItem(STORAGE_KEYS.PROVIDER_FILTER)
+		return stored ? JSON.parse(stored) : []
+	})
+
+	// Tool column consolidation state - initialize from localStorage
+	const [consolidatedToolColumns, setConsolidatedToolColumns] = useState<string[]>(() => {
+		if (typeof window === "undefined") return []
+		const stored = localStorage.getItem(STORAGE_KEYS.CONSOLIDATED_TOOLS)
+		return stored ? JSON.parse(stored) : []
+	})
+
+	// Delete runs state
+	const [showDeleteConfirm, setShowDeleteConfirm] = useState(false)
+	const [showDeleteOldConfirm, setShowDeleteOldConfirm] = useState(false)
+	const [isDeleting, setIsDeleting] = useState(false)
+
+	// Persist filters to localStorage
+	useEffect(() => {
+		localStorage.setItem(STORAGE_KEYS.TIMEFRAME, timeframeFilter)
+	}, [timeframeFilter])
+
+	useEffect(() => {
+		localStorage.setItem(STORAGE_KEYS.MODEL_FILTER, JSON.stringify(modelFilter))
+	}, [modelFilter])
+
+	useEffect(() => {
+		localStorage.setItem(STORAGE_KEYS.PROVIDER_FILTER, JSON.stringify(providerFilter))
+	}, [providerFilter])
+
+	useEffect(() => {
+		localStorage.setItem(STORAGE_KEYS.CONSOLIDATED_TOOLS, JSON.stringify(consolidatedToolColumns))
+	}, [consolidatedToolColumns])
+
+	// Count incomplete runs (runs without taskMetricsId)
+	const incompleteRunsCount = useMemo(() => {
+		return runs.filter((run) => run.taskMetrics === null).length
+	}, [runs])
+
+	// Count runs older than 30 days
+	const oldRunsCount = useMemo(() => {
+		const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000)
+		return runs.filter((run) => run.createdAt < thirtyDaysAgo).length
+	}, [runs])
+
+	const handleDeleteIncompleteRuns = useCallback(async () => {
+		setIsDeleting(true)
+		try {
+			const result = await deleteIncompleteRuns()
+			if (result.success) {
+				toast.success(`Deleted ${result.deletedCount} incomplete run${result.deletedCount !== 1 ? "s" : ""}`)
+				if (result.storageErrors.length > 0) {
+					toast.warning(`Some storage folders could not be deleted: ${result.storageErrors.length} errors`)
+				}
+				router.refresh()
+			} else {
+				toast.error("Failed to delete incomplete runs")
+			}
+		} catch (error) {
+			console.error("Error deleting incomplete runs:", error)
+			toast.error("Failed to delete incomplete runs")
+		} finally {
+			setIsDeleting(false)
+			setShowDeleteConfirm(false)
+		}
+	}, [router])
+
+	const handleDeleteOldRuns = useCallback(async () => {
+		setIsDeleting(true)
+		try {
+			const result = await deleteOldRuns()
+			if (result.success) {
+				toast.success(
+					`Deleted ${result.deletedCount} run${result.deletedCount !== 1 ? "s" : ""} older than 30 days`,
+				)
+				if (result.storageErrors.length > 0) {
+					toast.warning(`Some storage folders could not be deleted: ${result.storageErrors.length} errors`)
+				}
+				router.refresh()
+			} else {
+				toast.error("Failed to delete old runs")
+			}
+		} catch (error) {
+			console.error("Error deleting old runs:", error)
+			toast.error("Failed to delete old runs")
+		} finally {
+			setIsDeleting(false)
+			setShowDeleteOldConfirm(false)
+		}
+	}, [router])
+
 	const handleSort = (column: SortColumn) => {
 		if (sortColumn === column) {
 			setSortDirection(sortDirection === "asc" ? "desc" : "asc")
@@ -63,11 +232,59 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 		}
 	}
 
-	// Collect all unique tool names from all runs and sort by total attempts
-	const toolColumns = useMemo<ToolName[]>(() => {
-		const toolTotals = new Map<ToolName, number>()
+	// Derive unique models and providers from runs
+	const modelOptions = useMemo(() => {
+		const models = new Set<string>()
+		for (const run of runs) {
+			if (run.model) models.add(run.model)
+		}
+		return Array.from(models)
+			.sort()
+			.map((model) => ({ label: model, value: model }))
+	}, [runs])
 
+	const providerOptions = useMemo(() => {
+		const providers = new Set<string>()
 		for (const run of runs) {
+			const provider = run.settings?.apiProvider
+			if (provider) providers.add(provider)
+		}
+		return Array.from(providers)
+			.sort()
+			.map((provider) => ({ label: provider, value: provider }))
+	}, [runs])
+
+	// Filter runs based on filter state
+	const filteredRuns = useMemo(() => {
+		return runs.filter((run) => {
+			// Timeframe filter
+			const timeframeStart = getTimeframeStartDate(timeframeFilter)
+			if (timeframeStart && run.createdAt < timeframeStart) {
+				return false
+			}
+
+			// Model filter
+			if (modelFilter.length > 0 && !modelFilter.includes(run.model)) {
+				return false
+			}
+
+			// Provider filter
+			if (providerFilter.length > 0) {
+				const provider = run.settings?.apiProvider
+				if (!provider || !providerFilter.includes(provider)) {
+					return false
+				}
+			}
+
+			return true
+		})
+	}, [runs, timeframeFilter, modelFilter, providerFilter])
+
+	// Collect all unique tool names from filtered runs and sort by total attempts
+	const allToolColumns = useMemo<ToolName[]>(() => {
+		const toolTotals = new Map<ToolName, number>()
+
+		for (const run of filteredRuns) {
 			if (run.taskMetrics?.toolUsage) {
 				for (const [toolName, usage] of Object.entries(run.taskMetrics.toolUsage)) {
 					const tool = toolName as ToolName
@@ -81,13 +298,32 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 		return Array.from(toolTotals.entries())
 			.sort((a, b) => b[1] - a[1])
 			.map(([name]): ToolName => name)
-	}, [runs])
+	}, [filteredRuns])
+
+	// Tool column options for the consolidation dropdown
+	const toolColumnOptions = useMemo(() => {
+		return allToolColumns.map((tool) => ({
+			label: tool,
+			value: tool,
+		}))
+	}, [allToolColumns])
+
+	// Separate consolidated and individual tool columns
+	const individualToolColumns = useMemo(() => {
+		return allToolColumns.filter((tool) => !consolidatedToolColumns.includes(tool))
+	}, [allToolColumns, consolidatedToolColumns])
 
-	// Sort runs based on current sort column and direction
+	// Create a "consolidated" column if any tools are selected for consolidation
+	const hasConsolidatedColumn = consolidatedToolColumns.length > 0
+
+	// Use individualToolColumns for rendering
+	const toolColumns = individualToolColumns
+
+	// Sort filtered runs based on current sort column and direction
 	const sortedRuns = useMemo(() => {
-		if (!sortColumn) return runs
+		if (!sortColumn) return filteredRuns
 
-		return [...runs].sort((a, b) => {
+		return [...filteredRuns].sort((a, b) => {
 			let aVal: string | number | Date | null = null
 			let bVal: string | number | Date | null = null
 
@@ -139,14 +375,170 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 
 			return sortDirection === "asc" ? comparison : -comparison
 		})
-	}, [runs, sortColumn, sortDirection])
+	}, [filteredRuns, sortColumn, sortDirection])
 
-	// Calculate colSpan for empty state (7 base columns + dynamic tools + 3 end columns)
-	const totalColumns = 7 + toolColumns.length + 3
+	// Calculate colSpan for empty state (7 base columns + dynamic tools + consolidated column + 3 end columns)
+	const totalColumns = 7 + toolColumns.length + (hasConsolidatedColumn ? 1 : 0) + 3
+
+	// Check if any filters or settings are active
+	const hasActiveFilters = timeframeFilter !== "all" || modelFilter.length > 0 || providerFilter.length > 0
+	const hasConsolidatedTools = consolidatedToolColumns.length > 0
+	const hasAnyCustomization = hasActiveFilters || hasConsolidatedTools
+
+	const clearAllFilters = () => {
+		setTimeframeFilter("all")
+		setModelFilter([])
+		setProviderFilter([])
+	}
+
+	const resetAll = () => {
+		setTimeframeFilter("all")
+		setModelFilter([])
+		setProviderFilter([])
+		setConsolidatedToolColumns([])
+		localStorage.removeItem(STORAGE_KEYS.TIMEFRAME)
+		localStorage.removeItem(STORAGE_KEYS.MODEL_FILTER)
+		localStorage.removeItem(STORAGE_KEYS.PROVIDER_FILTER)
+		localStorage.removeItem(STORAGE_KEYS.CONSOLIDATED_TOOLS)
+	}
 
 	return (
 		<>
-			<Table className="border border-t-0">
+			{/* Filter Controls */}
+			<div className="flex items-center gap-4 p-4 border border-b-0 rounded-t-md bg-muted/30">
+				<div className="flex items-center gap-2">
+					<span className="text-sm font-medium text-muted-foreground">Timeframe:</span>
+					<Select
+						value={timeframeFilter}
+						onValueChange={(value) => setTimeframeFilter(value as TimeframeOption)}>
+						<SelectTrigger className="w-[140px]">
+							<SelectValue />
+						</SelectTrigger>
+						<SelectContent>
+							{TIMEFRAME_OPTIONS.map((option) => (
+								<SelectItem key={option.value} value={option.value}>
+									{option.label}
+								</SelectItem>
+							))}
+						</SelectContent>
+					</Select>
+				</div>
+
+				<div className="flex items-center gap-2">
+					<span className="text-sm font-medium text-muted-foreground">Model:</span>
+					<MultiSelect
+						options={modelOptions}
+						value={modelFilter}
+						onValueChange={setModelFilter}
+						placeholder="All models"
+						className="w-[200px]"
+						maxCount={1}
+					/>
+				</div>
+
+				<div className="flex items-center gap-2">
+					<span className="text-sm font-medium text-muted-foreground">Provider:</span>
+					<MultiSelect
+						options={providerOptions}
+						value={providerFilter}
+						onValueChange={setProviderFilter}
+						placeholder="All providers"
+						className="w-[180px]"
+						maxCount={1}
+					/>
+				</div>
+
+				<div className="flex items-center gap-2">
+					<Tooltip>
+						<TooltipTrigger asChild>
+							<div className="flex items-center gap-2">
+								<Combine className="h-4 w-4 text-muted-foreground" />
+								<span className="text-sm font-medium text-muted-foreground">Consolidate:</span>
+							</div>
+						</TooltipTrigger>
+						<TooltipContent>Select tool columns to consolidate into a combined column</TooltipContent>
+					</Tooltip>
+					<div className="relative min-w-[100px] w-fit max-w-[140px]">
+						<div className={consolidatedToolColumns.length > 0 ? "[&>div>div]:invisible" : ""}>
+							<MultiSelect
+								options={toolColumnOptions}
+								value={consolidatedToolColumns}
+								onValueChange={setConsolidatedToolColumns}
+								placeholder="None"
+								className="w-full min-w-[100px]"
+								maxCount={0}
+								popoverAutoWidth
+								footer={
+									hasAnyCustomization && (
+										<Button
+											variant="ghost"
+											size="sm"
+											className="w-full justify-start text-muted-foreground hover:text-foreground"
+											onClick={resetAll}>
+											<RotateCcw className="h-4 w-4 mr-2" />
+											Reset all filters & consolidation
+										</Button>
+									)
+								}
+							/>
+						</div>
+						{consolidatedToolColumns.length > 0 && (
+							<div className="absolute inset-0 flex items-center px-3 pointer-events-none">
+								<span className="text-sm font-medium whitespace-nowrap">
+									{consolidatedToolColumns.length} tool
+									{consolidatedToolColumns.length !== 1 ? "s" : ""}
+								</span>
+							</div>
+						)}
+					</div>
+				</div>
+
+				{hasActiveFilters && (
+					<Button variant="ghost" size="sm" onClick={clearAllFilters}>
+						<X className="h-4 w-4 mr-1" />
+						Clear filters
+					</Button>
+				)}
+
+				<div className="flex items-center gap-2 ml-auto">
+					{/* Bulk Actions Menu */}
+					{(incompleteRunsCount > 0 || oldRunsCount > 0) && (
+						<DropdownMenu>
+							<Button variant="ghost" size="sm" asChild>
+								<DropdownMenuTrigger disabled={isDeleting}>
+									<Ellipsis className="h-4 w-4" />
+								</DropdownMenuTrigger>
+							</Button>
+							<DropdownMenuContent align="end">
+								{incompleteRunsCount > 0 && (
+									<DropdownMenuItem
+										onClick={() => setShowDeleteConfirm(true)}
+										disabled={isDeleting}
+										className="text-destructive focus:text-destructive">
+										<Trash2 className="h-4 w-4 mr-2" />
+										Delete {incompleteRunsCount} incomplete run
+										{incompleteRunsCount !== 1 ? "s" : ""}
+									</DropdownMenuItem>
+								)}
+								{oldRunsCount > 0 && (
+									<DropdownMenuItem
+										onClick={() => setShowDeleteOldConfirm(true)}
+										disabled={isDeleting}
+										className="text-destructive focus:text-destructive">
+										<Trash2 className="h-4 w-4 mr-2" />
+										Delete {oldRunsCount} run{oldRunsCount !== 1 ? "s" : ""} over 30d
+									</DropdownMenuItem>
+								)}
+							</DropdownMenuContent>
+						</DropdownMenu>
+					)}
+					<div className="text-sm text-muted-foreground">
+						{filteredRuns.length} of {runs.length} runs
+					</div>
+				</div>
+			</div>
+
+			<Table className="border border-t-0 rounded-t-none">
 				<TableHeader>
 					<TableRow>
 						<TableHead
@@ -188,6 +580,23 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 							</div>
 						</TableHead>
 						<TableHead>Tokens</TableHead>
+						{hasConsolidatedColumn && (
+							<TableHead className="text-xs text-center">
+								<Tooltip>
+									<TooltipTrigger>
+										<Combine className="h-3 w-3 inline" />
+									</TooltipTrigger>
+									<TooltipContent>
+										<div className="text-xs">
+											<div className="font-semibold mb-1">Consolidated Tools:</div>
+											{consolidatedToolColumns.map((tool) => (
+												<div key={tool}>{tool}</div>
+											))}
+										</div>
+									</TooltipContent>
+								</Tooltip>
+							</TableHead>
+						)}
 						{toolColumns.map((toolName) => (
 							<TableHead key={toolName} className="text-xs text-center">
 								<Tooltip>
@@ -214,16 +623,34 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 				<TableBody>
 					{sortedRuns.length ? (
 						sortedRuns.map(({ taskMetrics, ...run }) => (
-							<Row key={run.id} run={run} taskMetrics={taskMetrics} toolColumns={toolColumns} />
+							<Row
+								key={run.id}
+								run={run}
+								taskMetrics={taskMetrics}
+								toolColumns={toolColumns}
+								consolidatedToolColumns={consolidatedToolColumns}
+							/>
 						))
 					) : (
 						<TableRow>
-							<TableCell colSpan={totalColumns} className="text-center">
-								No eval runs yet.
-								<Button variant="link" onClick={() => router.push("/runs/new")}>
-									Launch
-								</Button>
-								one now.
+							<TableCell colSpan={totalColumns} className="text-center py-8">
+								{runs.length === 0 ? (
+									<>
+										No eval runs yet.
+										<Button variant="link" onClick={() => router.push("/runs/new")}>
+											Launch
+										</Button>
+										one now.
+									</>
+								) : (
+									<>
+										No runs match the current filters.
+										<Button variant="link" onClick={clearAllFilters}>
+											Clear filters
+										</Button>
+										to see all runs.
+									</>
+								)}
 							</TableCell>
 						</TableRow>
 					)}
@@ -235,6 +662,70 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
 				onClick={() => router.push("/runs/new")}>
 				<Rocket className="size-6" />
 			</Button>
+
+			{/* Delete Incomplete Runs Confirmation Dialog */}
+			<AlertDialog open={showDeleteConfirm} onOpenChange={setShowDeleteConfirm}>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Delete Incomplete Runs</AlertDialogTitle>
+						<AlertDialogDescription>
+							Are you sure you want to delete {incompleteRunsCount} incomplete run
+							{incompleteRunsCount !== 1 ? "s" : ""}? This will permanently remove all database records
+							and storage folders for these runs. This action cannot be undone.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel disabled={isDeleting}>Cancel</AlertDialogCancel>
+						<AlertDialogAction
+							onClick={handleDeleteIncompleteRuns}
+							disabled={isDeleting}
+							className="bg-destructive text-destructive-foreground hover:bg-destructive/90">
+							{isDeleting ? (
+								<>
+									<LoaderCircle className="h-4 w-4 mr-2 animate-spin" />
+									Deleting...
+								</>
+							) : (
+								<>
+									Delete {incompleteRunsCount} run{incompleteRunsCount !== 1 ? "s" : ""}
+								</>
+							)}
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
+
+			{/* Delete Old Runs Confirmation Dialog */}
+			<AlertDialog open={showDeleteOldConfirm} onOpenChange={setShowDeleteOldConfirm}>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Delete Old Runs</AlertDialogTitle>
+						<AlertDialogDescription>
+							Are you sure you want to delete {oldRunsCount} run{oldRunsCount !== 1 ? "s" : ""} older than
+							30 days? This will permanently remove all database records and storage folders for these
+							runs. This action cannot be undone.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel disabled={isDeleting}>Cancel</AlertDialogCancel>
+						<AlertDialogAction
+							onClick={handleDeleteOldRuns}
+							disabled={isDeleting}
+							className="bg-destructive text-destructive-foreground hover:bg-destructive/90">
+							{isDeleting ? (
+								<>
+									<LoaderCircle className="h-4 w-4 mr-2 animate-spin" />
+									Deleting...
+								</>
+							) : (
+								<>
+									Delete {oldRunsCount} run{oldRunsCount !== 1 ? "s" : ""}
+								</>
+							)}
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
 		</>
 	)
 }

+ 16 - 1
apps/web-evals/src/components/ui/multi-select.tsx

@@ -84,6 +84,18 @@ interface MultiSelectProps extends React.HTMLAttributes<HTMLDivElement>, Variant
 	 * Optional, can be used to add custom styles.
 	 */
 	className?: string
+
+	/**
+	 * If true, popover width will auto-size to content instead of matching trigger width.
+	 * Optional, defaults to false.
+	 */
+	popoverAutoWidth?: boolean
+
+	/**
+	 * Optional footer content to render at the bottom of the popover.
+	 * Useful for adding reset buttons or other actions.
+	 */
+	footer?: React.ReactNode
 }
 
 export const MultiSelect = React.forwardRef<HTMLDivElement, MultiSelectProps>(
@@ -97,6 +109,8 @@ export const MultiSelect = React.forwardRef<HTMLDivElement, MultiSelectProps>(
 			placeholder = "Select options",
 			maxCount = 3,
 			modalPopover = false,
+			popoverAutoWidth = false,
+			footer,
 			className,
 			...props
 		},
@@ -243,7 +257,7 @@ export const MultiSelect = React.forwardRef<HTMLDivElement, MultiSelectProps>(
 					</div>
 				</PopoverTrigger>
 				<PopoverContent
-					className="p-0 w-[var(--radix-popover-trigger-width)]"
+					className={cn("p-0", popoverAutoWidth ? "w-auto" : "w-[var(--radix-popover-trigger-width)]")}
 					align="start"
 					onEscapeKeyDown={() => setIsPopoverOpen(false)}>
 					<Command filter={onFilter}>
@@ -276,6 +290,7 @@ export const MultiSelect = React.forwardRef<HTMLDivElement, MultiSelectProps>(
 							</CommandGroup>
 						</CommandList>
 					</Command>
+					{footer && <div className="border-t p-2">{footer}</div>}
 				</PopoverContent>
 			</Popover>
 		)

+ 1 - 1
apps/web-evals/src/lib/formatters.ts

@@ -45,7 +45,7 @@ export const formatTokens = (tokens: number) => {
 }
 
 export const formatToolUsageSuccessRate = (usage: { attempts: number; failures: number }) =>
-	usage.attempts === 0 ? "0%" : `${(((usage.attempts - usage.failures) / usage.attempts) * 100).toFixed(1)}%`
+	usage.attempts === 0 ? "0%" : `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%`
 
 export const formatDateTime = (date: Date) => {
 	return new Intl.DateTimeFormat("en-US", {

+ 64 - 0
packages/evals/src/db/queries/runs.ts

@@ -134,3 +134,67 @@ export const deleteRun = async (runId: number) => {
 
 	await db.delete(schema.taskMetrics).where(inArray(schema.taskMetrics.id, taskMetricsIds))
 }
+
+/**
+ * Fetch every run whose `taskMetricsId` column is NULL; such runs are the ones
+ * treated as incomplete.
+ *
+ * Only the `id` column is selected, so each returned row carries just the run id.
+ *
+ * @returns A promise resolving to the matching rows.
+ */
+export const getIncompleteRuns = async () => {
+	return db.query.runs.findMany({
+		where: sql`${schema.runs.taskMetricsId} IS NULL`,
+		columns: { id: true },
+	})
+}
+
+/**
+ * Delete multiple runs, and the rows that reference them, by run id.
+ *
+ * Statements execute in this order: tool errors of the runs' tasks, the tasks,
+ * tool errors keyed by run, `cpuMetrics` and `notes` rows (raw SQL, one run at
+ * a time), the runs themselves, and finally the task metrics that the deleted
+ * tasks and runs pointed at. Referencing rows are removed before the rows they
+ * reference, presumably to satisfy foreign-key constraints, so the order
+ * should not be rearranged casually.
+ *
+ * NOTE(review): the statements are issued one by one with no visible
+ * transaction, so a failure midway presumably leaves a partial delete;
+ * confirm whether callers need atomicity.
+ *
+ * @param runIds - Ids of the runs to delete; an empty array returns immediately.
+ */
+export const deleteRunsByIds = async (runIds: number[]) => {
+	if (runIds.length === 0) return
+
+	// Load the tasks of these runs up front: their ids and taskMetricsIds are still needed after the rows are gone
+	const tasks = await db.query.tasks.findMany({
+		where: inArray(schema.tasks.runId, runIds),
+		columns: { id: true, taskMetricsId: true },
+	})
+
+	const taskIds = tasks.map(({ id }) => id)
+
+	// Capture each run's own taskMetricsId before the run rows are deleted
+	const runs = await db.query.runs.findMany({
+		where: inArray(schema.runs.id, runIds),
+		columns: { taskMetricsId: true },
+	})
+
+	// Delete tool errors attached to the tasks (skipped when the runs have no tasks)
+	if (taskIds.length > 0) {
+		await db.delete(schema.toolErrors).where(inArray(schema.toolErrors.taskId, taskIds))
+	}
+
+	// Delete the tasks belonging to these runs
+	await db.delete(schema.tasks).where(inArray(schema.tasks.runId, runIds))
+
+	// Delete tool errors attached directly to the runs
+	await db.delete(schema.toolErrors).where(inArray(schema.toolErrors.runId, runIds))
+
+	// Delete from tables that exist in DB but not in drizzle schema
+	// Using individual deletes since drizzle's sql template doesn't support custom table schemas
+	for (const runId of runIds) {
+		await db.execute(sql`DELETE FROM "cpuMetrics" WHERE run_id = ${runId}`)
+		await db.execute(sql`DELETE FROM "notes" WHERE run_id = ${runId}`)
+	}
+
+	// Delete the run rows themselves
+	await db.delete(schema.runs).where(inArray(schema.runs.id, runIds))
+
+	// Delete the task metrics referenced by the removed tasks and runs, dropping null/undefined ids first
+	const taskMetricsIds = [
+		...tasks
+			.map(({ taskMetricsId }) => taskMetricsId)
+			.filter((id): id is number => id !== null && id !== undefined),
+		...runs.map(({ taskMetricsId }) => taskMetricsId).filter((id): id is number => id !== null && id !== undefined),
+	]
+
+	if (taskMetricsIds.length > 0) {
+		await db.delete(schema.taskMetrics).where(inArray(schema.taskMetrics.id, taskMetricsIds))
+	}
+}