Просмотр исходного кода

Evals fixes (#2505)

* Allow Turso URLs, add support for API providers beyond OpenRouter

* Make the git branch name unique
Chris Estreich 8 месяцев назад
Родитель
Commit
2eba534dd6

+ 4 - 2
evals/apps/cli/src/index.ts

@@ -70,7 +70,7 @@ const run = async (toolbox: GluegunToolbox) => {
 		run = await createRun({
 			model: rooCodeDefaults.openRouterModelId!,
 			pid: process.pid,
-			socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID()}.sock`),
+			socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID().slice(0, 8)}.sock`),
 		})
 
 		if (language === "all") {
@@ -101,7 +101,9 @@ const run = async (toolbox: GluegunToolbox) => {
 	console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
 	console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
 	console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
-	console.log(await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id} main`)
+	console.log(
+		await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
+	)
 
 	fs.writeFileSync(
 		path.resolve(exercisesPath, "settings.json"),

+ 107 - 149
evals/apps/web/src/app/runs/new/new-run.tsx

@@ -22,7 +22,6 @@ import {
 	FormField,
 	FormItem,
 	FormLabel,
-	FormDescription,
 	FormMessage,
 	Textarea,
 	Tabs,
@@ -43,15 +42,11 @@ import {
 
 import { SettingsDiff } from "./settings-diff"
 
-const recommendedModels = [
-	"anthropic/claude-3.7-sonnet",
-	"anthropic/claude-3.7-sonnet:thinking",
-	"google/gemini-2.0-flash-001",
-]
-
 export function NewRun() {
 	const router = useRouter()
 
+	const [mode, setMode] = useState<"openrouter" | "settings">("openrouter")
+
 	const [modelSearchValue, setModelSearchValue] = useState("")
 	const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
 	const modelSearchResultsRef = useRef<Map<string, number>>(new Map())
@@ -81,29 +76,15 @@ export function NewRun() {
 	const [model, suite, settings] = watch(["model", "suite", "settings"])
 
 	const onSubmit = useCallback(
-		async ({ settings, ...data }: FormValues) => {
+		async (values: FormValues) => {
 			try {
-				const openRouterModel = models.data?.find(({ id }) => id === data.model)
-
-				if (!openRouterModel) {
-					throw new Error(`Model not found: ${data.model}`)
-				}
-
-				const { id } = await createRun({
-					...data,
-					settings: {
-						...settings,
-						openRouterModelId: openRouterModel.id,
-						openRouterModelInfo: openRouterModel.modelInfo,
-					},
-				})
-
+				const { id } = await createRun(values)
 				router.push(`/runs/${id}`)
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[router, models.data],
+		[router],
 	)
 
 	const onFilterModels = useCallback(
@@ -157,36 +138,25 @@ export function NewRun() {
 					.parse(JSON.parse(await file.text()))
 
 				const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {}
-
-				if (providerSettings.apiProvider === "openrouter" && providerSettings.openRouterModelId) {
-					const {
-						openRouterModelId,
-						modelMaxTokens,
-						modelMaxThinkingTokens,
-						modelTemperature,
-						includeMaxTokens,
-					} = providerSettings
-
-					const model = openRouterModelId
-
-					const settings = {
-						...rooCodeDefaults,
-						openRouterModelId,
-						modelMaxTokens,
-						modelMaxThinkingTokens,
-						modelTemperature,
-						includeMaxTokens,
-						...globalSettings,
-					}
-
-					setValue("model", model)
-					setValue("settings", settings)
-				} else {
-					setValue("settings", globalSettings)
+				const { apiProvider, openRouterModelId, openAiModelId } = providerSettings
+
+				switch (apiProvider) {
+					case "openrouter":
+						setValue("model", openRouterModelId ?? "")
+						break
+					case "openai":
+						setValue("model", openAiModelId ?? "")
+						break
+					default:
+						throw new Error(`Unsupported API provider: ${apiProvider}`)
 				}
 
+				setValue("settings", { ...rooCodeDefaults, ...providerSettings, ...globalSettings })
+				setMode("settings")
+
 				event.target.value = ""
 			} catch (e) {
+				console.error(e)
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
@@ -199,108 +169,96 @@ export function NewRun() {
 				<form
 					onSubmit={form.handleSubmit(onSubmit)}
 					className="flex flex-col justify-center divide-y divide-primary *:py-5">
-					<FormField
-						control={form.control}
-						name="model"
-						render={() => (
-							<FormItem>
-								<FormLabel>OpenRouter Model</FormLabel>
-								<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
-									<PopoverTrigger asChild>
-										<Button
-											variant="input"
-											role="combobox"
-											aria-expanded={modelPopoverOpen}
-											className="flex items-center justify-between">
-											<div>
-												{models.data?.find(({ id }) => id === model)?.name || model || "Select"}
-											</div>
-											<ChevronsUpDown className="opacity-50" />
-										</Button>
-									</PopoverTrigger>
-									<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
-										<Command filter={onFilterModels}>
-											<CommandInput
-												placeholder="Search"
-												value={modelSearchValue}
-												onValueChange={setModelSearchValue}
-												className="h-9"
-											/>
-											<CommandList>
-												<CommandEmpty>No model found.</CommandEmpty>
-												<CommandGroup>
-													{models.data?.map(({ id, name }) => (
-														<CommandItem key={id} value={id} onSelect={onSelectModel}>
-															{name}
-															<Check
-																className={cn(
-																	"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
-																	id === model ? "opacity-100" : "opacity-0",
-																)}
-															/>
-														</CommandItem>
-													))}
-												</CommandGroup>
-											</CommandList>
-										</Command>
-									</PopoverContent>
-								</Popover>
-								<FormMessage />
-								<FormDescription className="flex flex-wrap items-center gap-2">
-									<span>Recommended:</span>
-									{recommendedModels.map((modelId) => (
-										<Button
-											key={modelId}
-											variant="link"
-											className="break-all px-0!"
-											onClick={(e) => {
-												e.preventDefault()
-												setValue("model", modelId)
-											}}>
-											{modelId}
-										</Button>
-									))}
-								</FormDescription>
-							</FormItem>
+					<div className="flex flex-row justify-between gap-4">
+						{mode === "openrouter" && (
+							<FormField
+								control={form.control}
+								name="model"
+								render={() => (
+									<FormItem className="flex-1">
+										<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
+											<PopoverTrigger asChild>
+												<Button
+													variant="input"
+													role="combobox"
+													aria-expanded={modelPopoverOpen}
+													className="flex items-center justify-between">
+													<div>
+														{models.data?.find(({ id }) => id === model)?.name ||
+															model ||
+															"Select OpenRouter Model"}
+													</div>
+													<ChevronsUpDown className="opacity-50" />
+												</Button>
+											</PopoverTrigger>
+											<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
+												<Command filter={onFilterModels}>
+													<CommandInput
+														placeholder="Search"
+														value={modelSearchValue}
+														onValueChange={setModelSearchValue}
+														className="h-9"
+													/>
+													<CommandList>
+														<CommandEmpty>No model found.</CommandEmpty>
+														<CommandGroup>
+															{models.data?.map(({ id, name }) => (
+																<CommandItem
+																	key={id}
+																	value={id}
+																	onSelect={onSelectModel}>
+																	{name}
+																	<Check
+																		className={cn(
+																			"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
+																			id === model ? "opacity-100" : "opacity-0",
+																		)}
+																	/>
+																</CommandItem>
+															))}
+														</CommandGroup>
+													</CommandList>
+												</Command>
+											</PopoverContent>
+										</Popover>
+										<FormMessage />
+									</FormItem>
+								)}
+							/>
 						)}
-					/>
 
-					<FormItem>
-						<FormLabel>Import Settings</FormLabel>
-						<Button
-							type="button"
-							variant="secondary"
-							size="icon"
-							onClick={() => document.getElementById("json-upload")?.click()}>
-							<HardDriveUpload />
-						</Button>
-						<input
-							id="json-upload"
-							type="file"
-							accept="application/json"
-							className="hidden"
-							onChange={onImportSettings}
-						/>
-						{settings ? (
-							<ScrollArea className="max-h-64 border rounded-sm">
-								<>
-									<div className="flex items-center gap-1 p-2 border-b">
-										<CircleCheck className="size-4 text-ring" />
-										<div className="text-sm">
-											Imported valid Roo Code settings. Showing differences from default settings.
+						<FormItem className="flex-1">
+							<Button
+								type="button"
+								variant="secondary"
+								onClick={() => document.getElementById("json-upload")?.click()}>
+								<HardDriveUpload />
+								Import Settings
+							</Button>
+							<input
+								id="json-upload"
+								type="file"
+								accept="application/json"
+								className="hidden"
+								onChange={onImportSettings}
+							/>
+							{settings && (
+								<ScrollArea className="max-h-64 border rounded-sm">
+									<>
+										<div className="flex items-center gap-1 p-2 border-b">
+											<CircleCheck className="size-4 text-ring" />
+											<div className="text-sm">
+												Imported valid Roo Code settings. Showing differences from default
+												settings.
+											</div>
 										</div>
-									</div>
-									<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
-								</>
-							</ScrollArea>
-						) : (
-							<FormDescription>
-								Fully configure how Roo Code for this run using a settings file that was exported by Roo
-								Code.
-							</FormDescription>
-						)}
-						<FormMessage />
-					</FormItem>
+										<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
+									</>
+								</ScrollArea>
+							)}
+							<FormMessage />
+						</FormItem>
+					</div>
 
 					<FormField
 						control={form.control}

+ 12 - 4
evals/packages/db/drizzle.config.ts

@@ -1,10 +1,18 @@
 import { defineConfig } from "drizzle-kit"
 
+if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
+	throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
+}
+
+const dialect = process.env.BENCHMARKS_DB_PATH ? "sqlite" : "turso"
+
+const dbCredentials = process.env.BENCHMARKS_DB_PATH
+	? { url: process.env.BENCHMARKS_DB_PATH }
+	: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }
+
 export default defineConfig({
 	out: "./drizzle",
 	schema: "./src/schema.ts",
-	dialect: "sqlite",
-	dbCredentials: {
-		url: process.env.BENCHMARKS_DB_PATH!,
-	},
+	dialect,
+	dbCredentials,
 })

+ 6 - 3
evals/packages/db/src/db.ts

@@ -2,9 +2,12 @@ import { drizzle } from "drizzle-orm/libsql"
 
 import { schema } from "./schema.js"
 
-const connection = {
-	url: process.env.BENCHMARKS_DB_PATH!,
-	concurrency: 50,
+if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
+	throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
 }
 
+const connection = process.env.BENCHMARKS_DB_PATH
+	? { url: process.env.BENCHMARKS_DB_PATH, concurrency: 50 }
+	: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }
+
 export const db = drizzle({ schema, connection })

+ 17 - 2
evals/packages/db/src/queries/taskMetrics.ts

@@ -1,8 +1,8 @@
-import { eq } from "drizzle-orm"
+import { eq, avg, min, max, and, isNotNull } from "drizzle-orm"
 
 import { RecordNotFoundError, RecordNotCreatedError } from "./errors.js"
 import type { InsertTaskMetrics, UpdateTaskMetrics } from "../schema.js"
-import { insertTaskMetricsSchema, taskMetrics } from "../schema.js"
+import { insertTaskMetricsSchema, taskMetrics, tasks, runs } from "../schema.js"
 import { db } from "../db.js"
 
 const table = taskMetrics
@@ -45,3 +45,18 @@ export const updateTaskMetrics = async (id: number, values: UpdateTaskMetrics) =
 
 	return record
 }
+
+export const successfulTaskDurations = async () => {
+	return db
+		.select({
+			runId: tasks.runId,
+			avgDuration: avg(taskMetrics.duration).mapWith(Number),
+			minDuration: min(taskMetrics.duration).mapWith(Number),
+			maxDuration: max(taskMetrics.duration).mapWith(Number),
+		})
+		.from(tasks)
+		.innerJoin(taskMetrics, eq(tasks.taskMetricsId, taskMetrics.id))
+		.innerJoin(runs, eq(tasks.runId, runs.id))
+		.where(and(eq(tasks.passed, true), isNotNull(runs.taskMetricsId)))
+		.groupBy(tasks.runId)
+}