Просмотр исходного кода

Enable the Roo Code Cloud provider in evals (#9492)

Chris Estreich 1 месяц назад
Родитель
Commit
2ca9eac9e0

+ 1 - 0
apps/web-evals/package.json

@@ -14,6 +14,7 @@
 	"dependencies": {
 		"@hookform/resolvers": "^5.1.1",
 		"@radix-ui/react-alert-dialog": "^1.1.7",
+		"@radix-ui/react-checkbox": "^1.1.5",
 		"@radix-ui/react-dialog": "^1.1.6",
 		"@radix-ui/react-dropdown-menu": "^2.1.7",
 		"@radix-ui/react-label": "^2.1.2",

+ 1 - 2
apps/web-evals/src/actions/runs.ts

@@ -21,8 +21,7 @@ import { CreateRun } from "@/lib/schemas"
 
 const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
-// eslint-disable-next-line @typescript-eslint/no-unused-vars
-export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) {
+export async function createRun({ suite, exercises = [], timeout, ...values }: CreateRun) {
 	const run = await _createRun({
 		...values,
 		timeout,

+ 0 - 24
apps/web-evals/src/app/api/health/route.ts

@@ -1,24 +0,0 @@
-import { NextResponse } from "next/server"
-
-export async function GET() {
-	try {
-		return NextResponse.json(
-			{
-				status: "healthy",
-				timestamp: new Date().toISOString(),
-				uptime: process.uptime(),
-				environment: process.env.NODE_ENV || "production",
-			},
-			{ status: 200 },
-		)
-	} catch (error) {
-		return NextResponse.json(
-			{
-				status: "unhealthy",
-				timestamp: new Date().toISOString(),
-				error: error instanceof Error ? error.message : "Unknown error",
-			},
-			{ status: 503 },
-		)
-	}
-}

+ 119 - 88
apps/web-evals/src/app/runs/new/new-run.tsx

@@ -1,23 +1,22 @@
 "use client"
 
-import { useCallback, useRef, useState } from "react"
+import { useCallback, useState } from "react"
 import { useRouter } from "next/navigation"
 import { z } from "zod"
 import { useQuery } from "@tanstack/react-query"
 import { useForm, FormProvider } from "react-hook-form"
 import { zodResolver } from "@hookform/resolvers/zod"
-import fuzzysort from "fuzzysort"
 import { toast } from "sonner"
-import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, CircleCheck } from "lucide-react"
+import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal } from "lucide-react"
 
 import { globalSettingsSchema, providerSettingsSchema, EVALS_SETTINGS, getModelId } from "@roo-code/types"
 
 import { createRun } from "@/actions/runs"
 import { getExercises } from "@/actions/exercises"
+
 import {
-	createRunSchema,
 	type CreateRun,
-	MODEL_DEFAULT,
+	createRunSchema,
 	CONCURRENCY_MIN,
 	CONCURRENCY_MAX,
 	CONCURRENCY_DEFAULT,
@@ -26,14 +25,19 @@ import {
 	TIMEOUT_DEFAULT,
 } from "@/lib/schemas"
 import { cn } from "@/lib/utils"
+
 import { useOpenRouterModels } from "@/hooks/use-open-router-models"
+import { useRooCodeCloudModels } from "@/hooks/use-roo-code-cloud-models"
+
 import {
 	Button,
+	Checkbox,
 	FormControl,
 	FormField,
 	FormItem,
 	FormLabel,
 	FormMessage,
+	Input,
 	Textarea,
 	Tabs,
 	TabsList,
@@ -48,9 +52,9 @@ import {
 	Popover,
 	PopoverContent,
 	PopoverTrigger,
-	ScrollArea,
-	ScrollBar,
 	Slider,
+	Label,
+	FormDescription,
 } from "@/components/ui"
 
 import { SettingsDiff } from "./settings-diff"
@@ -58,26 +62,30 @@ import { SettingsDiff } from "./settings-diff"
 export function NewRun() {
 	const router = useRouter()
 
-	const [mode, setMode] = useState<"openrouter" | "settings">("openrouter")
-	const [modelSearchValue, setModelSearchValue] = useState("")
+	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo")
 	const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
+	const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true)
 
-	const modelSearchResultsRef = useRef<Map<string, number>>(new Map())
-	const modelSearchValueRef = useRef("")
+	const openRouter = useOpenRouterModels()
+	const rooCodeCloud = useRooCodeCloudModels()
+	const models = provider === "openrouter" ? openRouter.data : rooCodeCloud.data
+	const searchValue = provider === "openrouter" ? openRouter.searchValue : rooCodeCloud.searchValue
+	const setSearchValue = provider === "openrouter" ? openRouter.setSearchValue : rooCodeCloud.setSearchValue
+	const onFilter = provider === "openrouter" ? openRouter.onFilter : rooCodeCloud.onFilter
 
-	const models = useOpenRouterModels()
 	const exercises = useQuery({ queryKey: ["getExercises"], queryFn: () => getExercises() })
 
 	const form = useForm<CreateRun>({
 		resolver: zodResolver(createRunSchema),
 		defaultValues: {
-			model: MODEL_DEFAULT,
+			model: "",
 			description: "",
 			suite: "full",
 			exercises: [],
 			settings: undefined,
 			concurrency: CONCURRENCY_DEFAULT,
 			timeout: TIMEOUT_DEFAULT,
+			jobToken: "",
 		},
 	})
 
@@ -93,8 +101,20 @@ export function NewRun() {
 	const onSubmit = useCallback(
 		async (values: CreateRun) => {
 			try {
-				if (mode === "openrouter") {
-					values.settings = { ...(values.settings || {}), openRouterModelId: model }
+				if (provider === "openrouter") {
+					values.settings = {
+						...(values.settings || {}),
+						apiProvider: "openrouter",
+						openRouterModelId: model,
+						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+					}
+				} else if (provider === "roo") {
+					values.settings = {
+						...(values.settings || {}),
+						apiProvider: "roo",
+						apiModelId: model,
+						toolProtocol: useNativeToolProtocol ? "native" : "xml",
+					}
 				}
 
 				const { id } = await createRun(values)
@@ -103,28 +123,7 @@ export function NewRun() {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[mode, model, router],
-	)
-
-	const onFilterModels = useCallback(
-		(value: string, search: string) => {
-			if (modelSearchValueRef.current !== search) {
-				modelSearchValueRef.current = search
-				modelSearchResultsRef.current.clear()
-
-				for (const {
-					obj: { id },
-					score,
-				} of fuzzysort.go(search, models.data || [], {
-					key: "name",
-				})) {
-					modelSearchResultsRef.current.set(id, score)
-				}
-			}
-
-			return modelSearchResultsRef.current.get(value) ?? 0
-		},
-		[models.data],
+		[provider, model, router, useNativeToolProtocol],
 	)
 
 	const onSelectModel = useCallback(
@@ -132,7 +131,7 @@ export function NewRun() {
 			setValue("model", model)
 			setModelPopoverOpen(false)
 		},
-		[setValue],
+		[setValue, setModelPopoverOpen],
 	)
 
 	const onImportSettings = useCallback(
@@ -160,7 +159,6 @@ export function NewRun() {
 
 				setValue("model", getModelId(providerSettings) ?? "")
 				setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings })
-				setMode("settings")
 
 				event.target.value = ""
 			} catch (e) {
@@ -177,13 +175,44 @@ export function NewRun() {
 				<form
 					onSubmit={form.handleSubmit(onSubmit)}
 					className="flex flex-col justify-center divide-y divide-primary *:py-5">
-					<div className="flex flex-row justify-between gap-4">
-						{mode === "openrouter" && (
-							<FormField
-								control={form.control}
-								name="model"
-								render={() => (
-									<FormItem className="flex-1">
+					<FormField
+						control={form.control}
+						name="model"
+						render={() => (
+							<FormItem>
+								<Tabs
+									value={provider}
+									onValueChange={(value) => setModelSource(value as "roo" | "openrouter" | "other")}>
+									<TabsList className="mb-2">
+										<TabsTrigger value="roo">Roo Code Cloud</TabsTrigger>
+										<TabsTrigger value="openrouter">OpenRouter</TabsTrigger>
+										<TabsTrigger value="other">Other</TabsTrigger>
+									</TabsList>
+								</Tabs>
+
+								{provider === "other" ? (
+									<div className="space-y-2 overflow-auto">
+										<Button
+											type="button"
+											variant="secondary"
+											onClick={() => document.getElementById("json-upload")?.click()}
+											className="w-full">
+											<SlidersHorizontal />
+											Import Settings
+										</Button>
+										<input
+											id="json-upload"
+											type="file"
+											accept="application/json"
+											className="hidden"
+											onChange={onImportSettings}
+										/>
+										{settings && (
+											<SettingsDiff defaultSettings={EVALS_SETTINGS} customSettings={settings} />
+										)}
+									</div>
+								) : (
+									<>
 										<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
 											<PopoverTrigger asChild>
 												<Button
@@ -192,25 +221,23 @@ export function NewRun() {
 													aria-expanded={modelPopoverOpen}
 													className="flex items-center justify-between">
 													<div>
-														{models.data?.find(({ id }) => id === model)?.name ||
-															model ||
-															"Select OpenRouter Model"}
+														{models?.find(({ id }) => id === model)?.name || `Select`}
 													</div>
 													<ChevronsUpDown className="opacity-50" />
 												</Button>
 											</PopoverTrigger>
 											<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
-												<Command filter={onFilterModels}>
+												<Command filter={onFilter}>
 													<CommandInput
 														placeholder="Search"
-														value={modelSearchValue}
-														onValueChange={setModelSearchValue}
+														value={searchValue}
+														onValueChange={setSearchValue}
 														className="h-9"
 													/>
 													<CommandList>
 														<CommandEmpty>No model found.</CommandEmpty>
 														<CommandGroup>
-															{models.data?.map(({ id, name }) => (
+															{models?.map(({ id, name }) => (
 																<CommandItem
 																	key={id}
 																	value={id}
@@ -229,45 +256,49 @@ export function NewRun() {
 												</Command>
 											</PopoverContent>
 										</Popover>
-										<FormMessage />
-									</FormItem>
-								)}
-							/>
-						)}
 
-						<FormItem className="flex-1">
-							<Button
-								type="button"
-								variant="secondary"
-								onClick={() => document.getElementById("json-upload")?.click()}>
-								<SlidersHorizontal />
-								Import Settings
-							</Button>
-							<input
-								id="json-upload"
-								type="file"
-								accept="application/json"
-								className="hidden"
-								onChange={onImportSettings}
-							/>
-							{settings && (
-								<ScrollArea className="max-h-64 border rounded-sm">
-									<>
-										<div className="flex items-center gap-1 p-2 border-b">
-											<CircleCheck className="size-4 text-ring" />
-											<div className="text-sm">
-												Imported valid Roo Code settings. Showing differences from default
-												settings.
-											</div>
+										<div className="flex items-center gap-1.5">
+											<Checkbox
+												id="native"
+												checked={useNativeToolProtocol}
+												onCheckedChange={(checked) =>
+													setUseNativeToolProtocol(checked === true)
+												}
+											/>
+											<Label htmlFor="native">Use Native Tool Calls</Label>
 										</div>
-										<SettingsDiff defaultSettings={EVALS_SETTINGS} customSettings={settings} />
 									</>
-									<ScrollBar orientation="horizontal" />
-								</ScrollArea>
+								)}
+
+								<FormMessage />
+							</FormItem>
+						)}
+					/>
+
+					{provider === "roo" && (
+						<FormField
+							control={form.control}
+							name="jobToken"
+							render={({ field }) => (
+								<FormItem>
+									<FormLabel>Roo Code Cloud Token</FormLabel>
+									<FormControl>
+										<Input type="password" {...field} />
+									</FormControl>
+									<FormMessage />
+									<FormDescription>
+										If you have access to the Roo Code Cloud repository then you can generate a
+										token with:
+										<br />
+										<code className="text-xs">
+											pnpm --filter @roo-code-cloud/auth production:create-job-token [org]
+											[timeout]
+										</code>
+									</FormDescription>
+								</FormItem>
 							)}
-							<FormMessage />
-						</FormItem>
-					</div>
+						/>
+					)}
 
 					<FormField
 						control={form.control}

+ 35 - 45
apps/web-evals/src/app/runs/new/settings-diff.tsx

@@ -1,12 +1,10 @@
-import { Fragment, HTMLAttributes } from "react"
-
 import { type Keys, type RooCodeSettings, GLOBAL_SETTINGS_KEYS, PROVIDER_SETTINGS_KEYS } from "@roo-code/types"
 
-import { cn } from "@/lib/utils"
+import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
 
 export const ROO_CODE_SETTINGS_KEYS = [...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS] as Keys<RooCodeSettings>[]
 
-type SettingsDiffProps = HTMLAttributes<HTMLDivElement> & {
+type SettingsDiffProps = {
 	defaultSettings: RooCodeSettings
 	customSettings: RooCodeSettings
 }
@@ -14,53 +12,45 @@ type SettingsDiffProps = HTMLAttributes<HTMLDivElement> & {
 export function SettingsDiff({
 	customSettings: { experiments: customExperiments, ...customSettings },
 	defaultSettings: { experiments: defaultExperiments, ...defaultSettings },
-	className,
-	...props
 }: SettingsDiffProps) {
 	const defaults = { ...defaultSettings, ...defaultExperiments }
 	const custom = { ...customSettings, ...customExperiments }
 
 	return (
-		<div className={cn("grid grid-cols-3 gap-2 text-sm p-2", className)} {...props}>
-			<div className="font-medium text-muted-foreground">Setting</div>
-			<div className="font-medium text-muted-foreground">Default</div>
-			<div className="font-medium text-muted-foreground">Custom</div>
-			{ROO_CODE_SETTINGS_KEYS.map((key) => {
-				const defaultValue = defaults[key as keyof typeof defaults]
-				const customValue = custom[key as keyof typeof custom]
-				const isDefault = JSON.stringify(defaultValue) === JSON.stringify(customValue)
-
-				return isDefault ? null : (
-					<SettingDiff
-						key={key}
-						name={key}
-						defaultValue={JSON.stringify(defaultValue, null, 2)}
-						customValue={JSON.stringify(customValue, null, 2)}
-					/>
-				)
-			})}
+		<div className="border rounded-sm">
+			<Table>
+				<TableHeader>
+					<TableRow className="font-medium text-muted-foreground">
+						<TableHead>Setting</TableHead>
+						<TableHead>Default</TableHead>
+						<TableHead>Custom</TableHead>
+					</TableRow>
+				</TableHeader>
+				<TableBody>
+					{ROO_CODE_SETTINGS_KEYS.map((key) => {
+						const defaultValue = JSON.stringify(defaults[key as keyof typeof defaults], null, 2)
+						const customValue = JSON.stringify(custom[key as keyof typeof custom], null, 2)
+
+						return defaultValue === customValue ||
+							(isEmpty(defaultValue) && isEmpty(customValue)) ? null : (
+							<TableRow key={key}>
+								<TableCell className="font-mono" title={key}>
+									{key}
+								</TableCell>
+								<TableCell className="font-mono text-rose-500 line-through" title={defaultValue}>
+									{defaultValue}
+								</TableCell>
+								<TableCell className="font-mono text-teal-500" title={customValue}>
+									{customValue}
+								</TableCell>
+							</TableRow>
+						)
+					})}
+				</TableBody>
+			</Table>
 		</div>
 	)
 }
 
-type SettingDiffProps = HTMLAttributes<HTMLDivElement> & {
-	name: string
-	defaultValue?: string
-	customValue?: string
-}
-
-export function SettingDiff({ name, defaultValue, customValue, ...props }: SettingDiffProps) {
-	return (
-		<Fragment {...props}>
-			<div className="font-mono" title={name}>
-				{name}
-			</div>
-			<pre className="inline text-rose-500 line-through" title={defaultValue}>
-				{defaultValue}
-			</pre>
-			<pre className="inline text-teal-500" title={customValue}>
-				{customValue}
-			</pre>
-		</Fragment>
-	)
-}
+const isEmpty = (value: string | undefined) =>
+	value === undefined || value === "" || value === "null" || value === '""' || value === "[]" || value === "{}"

+ 27 - 0
apps/web-evals/src/components/ui/checkbox.tsx

@@ -0,0 +1,27 @@
+"use client"
+
+import * as React from "react"
+import * as CheckboxPrimitive from "@radix-ui/react-checkbox"
+import { CheckIcon } from "lucide-react"
+
+import { cn } from "@/lib/utils"
+
+function Checkbox({ className, ...props }: React.ComponentProps<typeof CheckboxPrimitive.Root>) {
+	return (
+		<CheckboxPrimitive.Root
+			data-slot="checkbox"
+			className={cn(
+				"peer border-input dark:bg-input/30 data-[state=checked]:bg-primary data-[state=checked]:text-primary-foreground dark:data-[state=checked]:bg-primary data-[state=checked]:border-primary focus-visible:border-ring focus-visible:ring-ring/50 aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive size-4 shrink-0 rounded-[4px] border shadow-xs transition-shadow outline-none focus-visible:ring-[3px] disabled:cursor-not-allowed disabled:opacity-50",
+				className,
+			)}
+			{...props}>
+			<CheckboxPrimitive.Indicator
+				data-slot="checkbox-indicator"
+				className="grid place-content-center text-current transition-none">
+				<CheckIcon className="size-3.5" />
+			</CheckboxPrimitive.Indicator>
+		</CheckboxPrimitive.Root>
+	)
+}
+
+export { Checkbox }

+ 1 - 0
apps/web-evals/src/components/ui/index.ts

@@ -1,6 +1,7 @@
 export * from "./alert-dialog"
 export * from "./badge"
 export * from "./button"
+export * from "./checkbox"
 export * from "./command"
 export * from "./dialog"
 export * from "./drawer"

+ 37 - 0
apps/web-evals/src/hooks/use-fuzzy-model-search.ts

@@ -0,0 +1,37 @@
+import { useCallback, useRef, useState } from "react"
+import fuzzysort from "fuzzysort"
+
+interface ModelWithId {
+	id: string
+	name: string
+}
+
+export const useFuzzyModelSearch = <T extends ModelWithId>(data: T[] | undefined) => {
+	const [searchValue, setSearchValue] = useState("")
+
+	const searchResultsRef = useRef<Map<string, number>>(new Map())
+	const searchValueRef = useRef("")
+
+	const onFilter = useCallback(
+		(value: string, search: string) => {
+			if (searchValueRef.current !== search) {
+				searchValueRef.current = search
+				searchResultsRef.current.clear()
+
+				for (const {
+					obj: { id },
+					score,
+				} of fuzzysort.go(search, data || [], {
+					key: "name",
+				})) {
+					searchResultsRef.current.set(id, score)
+				}
+			}
+
+			return searchResultsRef.current.get(value) ?? 0
+		},
+		[data],
+	)
+
+	return { searchValue, setSearchValue, onFilter }
+}

+ 8 - 2
apps/web-evals/src/hooks/use-open-router-models.ts

@@ -1,5 +1,6 @@
 import { z } from "zod"
 import { useQuery } from "@tanstack/react-query"
+import { useFuzzyModelSearch } from "./use-fuzzy-model-search"
 
 export const openRouterModelSchema = z.object({
 	id: z.string(),
@@ -25,8 +26,13 @@ export const getOpenRouterModels = async (): Promise<OpenRouterModel[]> => {
 	return result.data.data.sort((a, b) => a.name.localeCompare(b.name))
 }
 
-export const useOpenRouterModels = () =>
-	useQuery({
+export const useOpenRouterModels = () => {
+	const query = useQuery({
 		queryKey: ["getOpenRouterModels"],
 		queryFn: getOpenRouterModels,
 	})
+
+	const { searchValue, setSearchValue, onFilter } = useFuzzyModelSearch(query.data)
+
+	return { ...query, searchValue, setSearchValue, onFilter }
+}

+ 66 - 0
apps/web-evals/src/hooks/use-roo-code-cloud-models.ts

@@ -0,0 +1,66 @@
+import { z } from "zod"
+import { useQuery } from "@tanstack/react-query"
+import { useFuzzyModelSearch } from "./use-fuzzy-model-search"
+
+export const rooCodeCloudModelSchema = z.object({
+	object: z.literal("model"),
+	id: z.string(),
+	name: z.string(),
+	description: z.string().optional(),
+	context_window: z.number(),
+	max_tokens: z.number(),
+	supports_images: z.boolean().optional(),
+	supports_prompt_cache: z.boolean().optional(),
+	type: z.literal("language"),
+	tags: z.array(z.string()).optional(),
+	deprecationMessage: z.string().optional(),
+	owned_by: z.string(),
+	pricing: z.object({
+		input: z.string(),
+		output: z.string(),
+		input_cache_read: z.string().optional(),
+		input_cache_write: z.string().optional(),
+	}),
+	evals: z
+		.object({
+			score: z.number().min(0).max(100),
+		})
+		.optional(),
+	created: z.number(),
+	deprecated: z.boolean().optional(),
+})
+
+export type RooCodeCloudModel = z.infer<typeof rooCodeCloudModelSchema>
+
+export const getRooCodeCloudModels = async (): Promise<RooCodeCloudModel[]> => {
+	const response = await fetch("https://api.roocode.com/proxy/v1/models")
+
+	if (!response.ok) {
+		return []
+	}
+
+	const result = z
+		.object({
+			object: z.literal("list"),
+			data: z.array(rooCodeCloudModelSchema),
+		})
+		.safeParse(await response.json())
+
+	if (!result.success) {
+		console.error(result.error)
+		return []
+	}
+
+	return result.data.data.sort((a, b) => a.name.localeCompare(b.name))
+}
+
+export const useRooCodeCloudModels = () => {
+	const query = useQuery({
+		queryKey: ["getRooCodeCloudModels"],
+		queryFn: getRooCodeCloudModels,
+	})
+
+	const { searchValue, setSearchValue, onFilter } = useFuzzyModelSearch(query.data)
+
+	return { ...query, searchValue, setSearchValue, onFilter }
+}

+ 1 - 3
apps/web-evals/src/lib/schemas.ts

@@ -6,8 +6,6 @@ import { rooCodeSettingsSchema } from "@roo-code/types"
  * CreateRun
  */
 
-export const MODEL_DEFAULT = "anthropic/claude-sonnet-4"
-
 export const CONCURRENCY_MIN = 1
 export const CONCURRENCY_MAX = 25
 export const CONCURRENCY_DEFAULT = 1
@@ -25,7 +23,7 @@ export const createRunSchema = z
 		settings: rooCodeSettingsSchema.optional(),
 		concurrency: z.number().int().min(CONCURRENCY_MIN).max(CONCURRENCY_MAX),
 		timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX),
-		systemPrompt: z.string().optional(),
+		jobToken: z.string().optional(),
 	})
 	.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
 		message: "Exercises are required when running a partial suite.",

+ 1 - 1
packages/evals/src/cli/index.ts

@@ -28,7 +28,7 @@ const main = async () => {
 					} else if (runId !== -1) {
 						await runEvals(runId)
 					} else if (taskId !== -1) {
-						await processTask({ taskId })
+						await processTask({ taskId, jobToken: process.env.ROO_CODE_CLOUD_TOKEN || null })
 					} else {
 						throw new Error("Either runId or taskId must be provided.")
 					}

+ 2 - 2
packages/evals/src/cli/runEvals.ts

@@ -44,9 +44,9 @@ export const runEvals = async (runId: number) => {
 				.map((task) => async () => {
 					try {
 						if (containerized) {
-							await processTaskInContainer({ taskId: task.id, logger })
+							await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger })
 						} else {
-							await processTask({ taskId: task.id, logger })
+							await processTask({ taskId: task.id, jobToken: run.jobToken, logger })
 						}
 					} catch (error) {
 						logger.error("error processing task", error)

+ 23 - 4
packages/evals/src/cli/runTask.ts

@@ -38,7 +38,15 @@ class SubprocessTimeoutError extends Error {
 	}
 }
 
-export const processTask = async ({ taskId, logger }: { taskId: number; logger?: Logger }) => {
+export const processTask = async ({
+	taskId,
+	jobToken,
+	logger,
+}: {
+	taskId: number
+	jobToken: string | null
+	logger?: Logger
+}) => {
 	const task = await findTask(taskId)
 	const { language, exercise } = task
 	const run = await findRun(task.runId)
@@ -61,7 +69,7 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?:
 		}
 
 		logger.info(`running task ${task.id} (${language}/${exercise})...`)
-		await runTask({ run, task, publish, logger })
+		await runTask({ run, task, jobToken, publish, logger })
 
 		logger.info(`testing task ${task.id} (${language}/${exercise})...`)
 		const passed = await runUnitTest({ task, logger })
@@ -80,10 +88,12 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?:
 
 export const processTaskInContainer = async ({
 	taskId,
+	jobToken,
 	logger,
 	maxRetries = 10,
 }: {
 	taskId: number
+	jobToken: string | null
 	logger: Logger
 	maxRetries?: number
 }) => {
@@ -95,6 +105,10 @@ export const processTaskInContainer = async ({
 		"-e HOST_EXECUTION_METHOD=docker",
 	]
 
+	if (jobToken) {
+		baseArgs.push(`-e ROO_CODE_CLOUD_TOKEN=${jobToken}`)
+	}
+
 	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
 	logger.info(command)
 
@@ -144,11 +158,12 @@ export const processTaskInContainer = async ({
 type RunTaskOptions = {
 	run: Run
 	task: Task
+	jobToken: string | null
 	publish: (taskEvent: TaskEvent) => Promise<void>
 	logger: Logger
 }
 
-export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => {
+export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskOptions) => {
 	const { language, exercise } = task
 	const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8")
 	const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise)
@@ -158,10 +173,14 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 	const cancelSignal = controller.signal
 	const containerized = isDockerContainer()
 
-	const codeCommand = containerized
+	let codeCommand = containerized
 		? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}`
 		: `code --disable-workspace-trust -n ${workspacePath}`
 
+	if (jobToken) {
+		codeCommand = `ROO_CODE_CLOUD_TOKEN=${jobToken} ${codeCommand}`
+	}
+
 	logger.info(codeCommand)
 
 	// Sleep for a random amount of time between 5 and 10 seconds, unless we're

+ 1 - 0
packages/evals/src/db/migrations/0003_simple_retro_girl.sql

@@ -0,0 +1 @@
+ALTER TABLE "runs" ADD COLUMN "jobToken" text;

+ 459 - 0
packages/evals/src/db/migrations/meta/0003_snapshot.json

@@ -0,0 +1,459 @@
+{
+	"id": "853d308a-3946-4ea8-9039-236bfce3c6c0",
+	"prevId": "3d2b8423-6170-4cb2-9f62-1c86756da97a",
+	"version": "7",
+	"dialect": "postgresql",
+	"tables": {
+		"public.runs": {
+			"name": "runs",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "runs_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"task_metrics_id": {
+					"name": "task_metrics_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"model": {
+					"name": "model",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"name": {
+					"name": "name",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"description": {
+					"name": "description",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"contextWindow": {
+					"name": "contextWindow",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"inputPrice": {
+					"name": "inputPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"outputPrice": {
+					"name": "outputPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"cacheWritesPrice": {
+					"name": "cacheWritesPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"cacheReadsPrice": {
+					"name": "cacheReadsPrice",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"settings": {
+					"name": "settings",
+					"type": "jsonb",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"jobToken": {
+					"name": "jobToken",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"pid": {
+					"name": "pid",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"socket_path": {
+					"name": "socket_path",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"concurrency": {
+					"name": "concurrency",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 2
+				},
+				"timeout": {
+					"name": "timeout",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 5
+				},
+				"passed": {
+					"name": "passed",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 0
+				},
+				"failed": {
+					"name": "failed",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true,
+					"default": 0
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {
+				"runs_task_metrics_id_taskMetrics_id_fk": {
+					"name": "runs_task_metrics_id_taskMetrics_id_fk",
+					"tableFrom": "runs",
+					"tableTo": "taskMetrics",
+					"columnsFrom": ["task_metrics_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.taskMetrics": {
+			"name": "taskMetrics",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "taskMetrics_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"tokens_in": {
+					"name": "tokens_in",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tokens_out": {
+					"name": "tokens_out",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tokens_context": {
+					"name": "tokens_context",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cache_writes": {
+					"name": "cache_writes",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cache_reads": {
+					"name": "cache_reads",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"cost": {
+					"name": "cost",
+					"type": "real",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"duration": {
+					"name": "duration",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"tool_usage": {
+					"name": "tool_usage",
+					"type": "jsonb",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.tasks": {
+			"name": "tasks",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "tasks_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"run_id": {
+					"name": "run_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"task_metrics_id": {
+					"name": "task_metrics_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"language": {
+					"name": "language",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"exercise": {
+					"name": "exercise",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"passed": {
+					"name": "passed",
+					"type": "boolean",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"started_at": {
+					"name": "started_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"finished_at": {
+					"name": "finished_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {
+				"tasks_language_exercise_idx": {
+					"name": "tasks_language_exercise_idx",
+					"columns": [
+						{
+							"expression": "run_id",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "language",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						},
+						{
+							"expression": "exercise",
+							"isExpression": false,
+							"asc": true,
+							"nulls": "last"
+						}
+					],
+					"isUnique": true,
+					"concurrently": false,
+					"method": "btree",
+					"with": {}
+				}
+			},
+			"foreignKeys": {
+				"tasks_run_id_runs_id_fk": {
+					"name": "tasks_run_id_runs_id_fk",
+					"tableFrom": "tasks",
+					"tableTo": "runs",
+					"columnsFrom": ["run_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				},
+				"tasks_task_metrics_id_taskMetrics_id_fk": {
+					"name": "tasks_task_metrics_id_taskMetrics_id_fk",
+					"tableFrom": "tasks",
+					"tableTo": "taskMetrics",
+					"columnsFrom": ["task_metrics_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		},
+		"public.toolErrors": {
+			"name": "toolErrors",
+			"schema": "",
+			"columns": {
+				"id": {
+					"name": "id",
+					"type": "integer",
+					"primaryKey": true,
+					"notNull": true,
+					"identity": {
+						"type": "always",
+						"name": "toolErrors_id_seq",
+						"schema": "public",
+						"increment": "1",
+						"startWith": "1",
+						"minValue": "1",
+						"maxValue": "2147483647",
+						"cache": "1",
+						"cycle": false
+					}
+				},
+				"run_id": {
+					"name": "run_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"task_id": {
+					"name": "task_id",
+					"type": "integer",
+					"primaryKey": false,
+					"notNull": false
+				},
+				"tool_name": {
+					"name": "tool_name",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"error": {
+					"name": "error",
+					"type": "text",
+					"primaryKey": false,
+					"notNull": true
+				},
+				"created_at": {
+					"name": "created_at",
+					"type": "timestamp",
+					"primaryKey": false,
+					"notNull": true
+				}
+			},
+			"indexes": {},
+			"foreignKeys": {
+				"toolErrors_run_id_runs_id_fk": {
+					"name": "toolErrors_run_id_runs_id_fk",
+					"tableFrom": "toolErrors",
+					"tableTo": "runs",
+					"columnsFrom": ["run_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				},
+				"toolErrors_task_id_tasks_id_fk": {
+					"name": "toolErrors_task_id_tasks_id_fk",
+					"tableFrom": "toolErrors",
+					"tableTo": "tasks",
+					"columnsFrom": ["task_id"],
+					"columnsTo": ["id"],
+					"onDelete": "no action",
+					"onUpdate": "no action"
+				}
+			},
+			"compositePrimaryKeys": {},
+			"uniqueConstraints": {},
+			"policies": {},
+			"checkConstraints": {},
+			"isRLSEnabled": false
+		}
+	},
+	"enums": {},
+	"schemas": {},
+	"sequences": {},
+	"roles": {},
+	"policies": {},
+	"views": {},
+	"_meta": {
+		"columns": {},
+		"schemas": {},
+		"tables": {}
+	}
+}

+ 7 - 0
packages/evals/src/db/migrations/meta/_journal.json

@@ -22,6 +22,13 @@
 			"when": 1757191027855,
 			"tag": "0002_bouncy_blazing_skull",
 			"breakpoints": true
+		},
+		{
+			"idx": 3,
+			"version": "7",
+			"when": 1763797232454,
+			"tag": "0003_simple_retro_girl",
+			"breakpoints": true
 		}
 	]
 }

+ 1 - 0
packages/evals/src/db/schema.ts

@@ -21,6 +21,7 @@ export const runs = pgTable("runs", {
 	cacheWritesPrice: real(),
 	cacheReadsPrice: real(),
 	settings: jsonb().$type<RooCodeSettings>(),
+	jobToken: text(),
 	pid: integer(),
 	socketPath: text("socket_path").notNull(),
 	concurrency: integer().default(2).notNull(),

+ 3 - 0
pnpm-lock.yaml

@@ -131,6 +131,9 @@ importers:
       '@radix-ui/react-alert-dialog':
         specifier: ^1.1.7
         version: 1.1.13(@types/[email protected](@types/[email protected]))(@types/[email protected])([email protected]([email protected]))([email protected])
+      '@radix-ui/react-checkbox':
+        specifier: ^1.1.5
+        version: 1.3.1(@types/[email protected](@types/[email protected]))(@types/[email protected])([email protected]([email protected]))([email protected])
       '@radix-ui/react-dialog':
         specifier: ^1.1.6
         version: 1.1.13(@types/[email protected](@types/[email protected]))(@types/[email protected])([email protected]([email protected]))([email protected])