瀏覽代碼

Fix rust evals (run cargo with bash) + logging tweaks (#4392)

Chris Estreich 6 月之前
父節點
當前提交
39d6535f7c

+ 1 - 2
packages/evals/src/cli/index.ts

@@ -8,7 +8,7 @@ import { runEvals } from "./runEvals.js"
 import { processTask } from "./processTask.js"
 
 const main = async () => {
-	const result = await run(
+	await run(
 		command({
 			name: "cli",
 			description: "Execute an eval run.",
@@ -43,7 +43,6 @@ const main = async () => {
 		process.argv.slice(2),
 	)
 
-	console.log(result)
 	process.exit(0)
 }
 

+ 1 - 1
packages/evals/src/cli/processTask.ts

@@ -25,7 +25,7 @@ export const processTask = async (taskId: number) => {
 		await runTask({ run, task, publish })
 
 		console.log(`[${Date.now()} | ${tag}] testing task ${task.id} (${task.language}/${task.exercise})...`)
-		const passed = await runUnitTest({ task })
+		const passed = await runUnitTest({ run, task })
 
 		console.log(`[${Date.now()} | ${tag}] task ${task.id} (${task.language}/${task.exercise}) -> ${passed}`)
 		await updateTask(task.id, { passed })

+ 13 - 13
packages/evals/src/cli/runEvals.ts

@@ -1,10 +1,9 @@
-import { execa } from "execa"
 import PQueue from "p-queue"
 
 import { findRun, finishRun, getTasks } from "../db/index.js"
 import { exercisesPath } from "../exercises/index.js"
 
-import { getTag, isDockerContainer } from "./utils.js"
+import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
 import { processTask, processTaskInContainer } from "./processTask.js"
 import { startHeartbeat, stopHeartbeat } from "./redis.js"
 
@@ -24,31 +23,32 @@ export const runEvals = async (runId: number) => {
 	const tag = getTag("runEvals", { run })
 	console.log(`[${Date.now()} | ${tag}] running ${tasks.length} task(s)`)
 
-	const cwd = exercisesPath
-	await execa({ cwd })`git config user.name "Roo Code"`
-	await execa({ cwd })`git config user.email "[email protected]"`
-	await execa({ cwd })`git checkout -f`
-	await execa({ cwd })`git clean -fd`
-	await execa({ cwd })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
+	const containerized = isDockerContainer()
+
+	if (!containerized) {
+		await resetEvalsRepo({ run, cwd: exercisesPath })
+	}
 
 	const heartbeat = await startHeartbeat(run.id)
 	const queue = new PQueue({ concurrency: run.concurrency })
 
 	try {
-		const containerize = isDockerContainer()
-
 		await queue.addAll(
 			tasks
 				.filter((task) => task.finishedAt === null)
-				.map((task) => () => (containerize ? processTaskInContainer(task.id) : processTask(task.id))),
+				.map((task) => () => (containerized ? processTaskInContainer(task.id) : processTask(task.id))),
 		)
 
 		console.log(`[${Date.now()} | ${tag}] finishRun`)
 		const result = await finishRun(run.id)
 		console.log(`[${Date.now()} | ${tag}] result ->`, result)
 
-		await execa({ cwd: exercisesPath })`git add .`
-		await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
+		// There's no need to commit the changes in the container since they
+		// will lost when the container is destroyed. I think we should
+		// store the diffs in the database instead.
+		if (!containerized) {
+			await commitEvalsRepoChanges({ run, cwd: exercisesPath })
+		}
 	} finally {
 		console.log(`[${Date.now()} | ${tag}] cleaning up`)
 		stopHeartbeat(run.id, heartbeat)

+ 56 - 98
packages/evals/src/cli/runTask.ts

@@ -6,11 +6,10 @@ import pWaitFor from "p-wait-for"
 import { execa } from "execa"
 
 import {
+	type TaskEvent,
+	TaskCommandName,
 	RooCodeEventName,
-	IpcOrigin,
 	IpcMessageType,
-	TaskCommandName,
-	type TaskEvent,
 	EVALS_SETTINGS,
 	EVALS_TIMEOUT,
 } from "@roo-code/types"
@@ -27,49 +26,38 @@ type RunTaskOptions = {
 	publish: (taskEvent: TaskEvent) => Promise<void>
 }
 
-export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{ success: boolean }> => {
-	const { language, exercise } = task
+export const runTask = async ({ run, task, publish }: RunTaskOptions) => {
 	const tag = getTag("runTask", { run, task })
+	const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args)
+	const logError = (message: string, ...args: unknown[]) =>
+		console.error(`[${Date.now()} | ${tag}] ${message}`, ...args)
 
+	const { language, exercise } = task
 	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
 	const workspacePath = path.resolve(exercisesPath, language, exercise)
-	const taskSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)
-
-	// Inject foot gun system prompt if present.
-	if (process.env.FOOTGUN_SYSTEM_PROMPT) {
-		const rooDir = path.join(workspacePath, ".roo")
-
-		if (!fs.existsSync(rooDir)) {
-			fs.mkdirSync(rooDir, { recursive: true })
-		}
-
-		fs.writeFileSync(path.join(rooDir, "system-prompt-code"), process.env.FOOTGUN_SYSTEM_PROMPT)
-	}
-
-	console.log(`[${Date.now()} | ${tag}] Opening new VS Code window at ${workspacePath}`)
-
+	const ipcSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)
+	const env = { ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath }
 	const controller = new AbortController()
 	const cancelSignal = controller.signal
+	const containerized = isDockerContainer()
 
-	const codeCommand = isDockerContainer()
-		? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic"`
-		: `code --disable-workspace-trust`
+	const codeCommand = containerized
+		? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}`
+		: `code --disable-workspace-trust -n ${workspacePath}`
 
-	console.log(`[${Date.now()} | ${tag}] ${codeCommand}`)
+	log(codeCommand)
 
-	// Sleep for a random amount of time between 5 and 10 seconds.
-	await new Promise((resolve) => setTimeout(resolve, Math.random() * 5_000 + 5_000))
+	// Sleep for a random amount of time between 5 and 10 seconds, unless we're
+	// running in a container, in which case there are no issues with flooding
+	// VSCode with new windows.
+	if (!containerized) {
+		await new Promise((resolve) => setTimeout(resolve, Math.random() * 5_000 + 5_000))
+	}
 
-	const subprocess = execa({
-		env: {
-			ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
-		},
-		shell: "/bin/bash",
-		cancelSignal,
-	})`${codeCommand} -n ${workspacePath}`
+	const subprocess = execa({ env, shell: "/bin/bash", cancelSignal })`${codeCommand}`
 
-	// If debugging:
-	subprocess.stdout.pipe(process.stdout)
+	// If debugging, add `--verbose` to `command` and uncomment the following line.
+	// subprocess.stdout.pipe(process.stdout)
 
 	// Give VSCode some time to spawn before connecting to its unix socket.
 	await new Promise((resolve) => setTimeout(resolve, 3_000))
@@ -78,26 +66,20 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{
 
 	while (true) {
 		try {
-			console.log(`[${Date.now()} | ${tag}] connecting to ${taskSocketPath}`)
-			client = new IpcClient(taskSocketPath)
+			client = new IpcClient(ipcSocketPath)
 			await pWaitFor(() => client!.isReady, { interval: 250, timeout: 1_000 })
 			break
 		} catch (_error) {
-			if (client) {
-				client.disconnect()
-			}
-
+			client?.disconnect()
 			attempts--
 
 			if (attempts <= 0) {
-				console.error(`[${Date.now()} | ${tag}] unable to connect`)
-				return { success: false }
+				logError(`unable to connect to IPC socket -> ${ipcSocketPath}`)
+				return
 			}
 		}
 	}
 
-	console.log(`[${Date.now()} | ${tag}] connected to ${taskSocketPath}`)
-
 	let taskStartedAt = Date.now()
 	let taskFinishedAt: number | undefined
 	let taskMetricsId: number | undefined
@@ -106,18 +88,24 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{
 
 	const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = {
 		broadcast: [RooCodeEventName.Message],
-		log: [RooCodeEventName.TaskTokenUsageUpdated], // [RooCodeEventName.Message, RooCodeEventName.TaskAskResponded],
+		log: [RooCodeEventName.TaskTokenUsageUpdated, RooCodeEventName.TaskAskResponded],
 	}
 
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 		const { eventName, payload } = taskEvent
 
+		// Publish all events except for these to Redis.
 		if (!ignoreEvents.broadcast.includes(eventName)) {
 			await publish({ ...taskEvent, taskId: task.id })
 		}
 
-		if (!ignoreEvents.log.includes(eventName)) {
-			console.log(`[${Date.now()} | ${tag}] ${eventName} ->`, payload)
+		// Log all events except for these.
+		// For message events we only log non-partial messages.
+		if (
+			!ignoreEvents.log.includes(eventName) &&
+			(eventName !== RooCodeEventName.Message || payload[0].message.partial !== true)
+		) {
+			log(`${eventName} ->`, payload)
 		}
 
 		if (eventName === RooCodeEventName.TaskStarted) {
@@ -177,54 +165,32 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{
 	})
 
 	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`[${Date.now()} | ${tag}] disconnect`)
+		log(`disconnected from IPC socket -> ${ipcSocketPath}`)
 		isClientDisconnected = true
 	})
 
-	if (client.isReady) {
-		const configuration = {
-			...EVALS_SETTINGS,
-			...run.settings,
-			openRouterApiKey: process.env.OPENROUTER_API_KEY,
-		}
-
-		client.sendMessage({
-			type: IpcMessageType.TaskCommand,
-			origin: IpcOrigin.Client,
-			clientId: client.clientId!,
-			data: {
-				commandName: TaskCommandName.StartNewTask,
-				data: {
-					configuration,
-					text: prompt,
-					newTab: true,
-				},
+	client.sendCommand({
+		commandName: TaskCommandName.StartNewTask,
+		data: {
+			configuration: {
+				...EVALS_SETTINGS,
+				...run.settings,
+				openRouterApiKey: process.env.OPENROUTER_API_KEY,
 			},
-		})
-	} else {
-		console.error(`[${Date.now()} | ${tag}] unable to connect`)
-		client.disconnect()
-		taskFinishedAt = Date.now()
-		isClientDisconnected = true
-	}
+			text: prompt,
+			newTab: true,
+		},
+	})
 
 	try {
 		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: EVALS_TIMEOUT })
-		// eslint-disable-next-line @typescript-eslint/no-unused-vars
-	} catch (error) {
-		console.log(`[${Date.now()} | ${tag}] time limit reached`)
+	} catch (_error) {
+		logError("time limit reached")
 
-		// Cancel the task.
 		if (rooTaskId && !isClientDisconnected) {
-			client.sendMessage({
-				type: IpcMessageType.TaskCommand,
-				origin: IpcOrigin.Client,
-				clientId: client.clientId!,
-				data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
-			})
-
-			// Allow some time for the task to cancel.
-			await new Promise((resolve) => setTimeout(resolve, 5_000))
+			log("cancelling task")
+			client.sendCommand({ commandName: TaskCommandName.CancelTask, data: rooTaskId })
+			await new Promise((resolve) => setTimeout(resolve, 5_000)) // Allow some time for the task to cancel.
 		}
 
 		await updateTask(task.id, { finishedAt: new Date() })
@@ -232,15 +198,9 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{
 
 	if (!isClientDisconnected) {
 		if (rooTaskId) {
-			client.sendMessage({
-				type: IpcMessageType.TaskCommand,
-				origin: IpcOrigin.Client,
-				clientId: client.clientId!,
-				data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
-			})
-
-			// Allow some time for the window to close.
-			await new Promise((resolve) => setTimeout(resolve, 2_000))
+			log("closing task")
+			client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId })
+			await new Promise((resolve) => setTimeout(resolve, 2_000)) // Allow some time for the window to close.
 		}
 
 		client.disconnect()
@@ -248,6 +208,4 @@ export const runTask = async ({ run, task, publish }: RunTaskOptions): Promise<{
 
 	controller.abort()
 	await subprocess
-
-	return { success: !!taskFinishedAt }
 }

+ 19 - 13
packages/evals/src/cli/runUnitTest.ts

@@ -3,9 +3,11 @@ import * as path from "path"
 import { execa, parseCommandString } from "execa"
 import psTree from "ps-tree"
 
-import { type Task } from "../db/index.js"
+import type { Run, Task } from "../db/index.js"
 import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js"
 
+import { getTag } from "./utils.js"
+
 const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
 
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
@@ -16,18 +18,25 @@ const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: num
 	rust: { commands: ["cargo test"] },
 }
 
-export const runUnitTest = async ({ task }: { task: Task }) => {
+export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
+	const tag = getTag("runUnitTest", { run, task })
+	const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args)
+	const logError = (message: string, ...args: unknown[]) =>
+		console.error(`[${Date.now()} | ${tag}] ${message}`, ...args)
+
 	const cmd = testCommands[task.language]
 	const exercisePath = path.resolve(exercisesPath, task.language, task.exercise)
 	const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath
 	const commands = cmd.commands.map((cs) => parseCommandString(cs))
-	const tag = `cli#runUnitTest | ${task.language} / ${task.exercise}`
 
 	let passed = true
 
 	for (const command of commands) {
 		try {
-			const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
+			log(`running "${command.join(" ")}"`)
+			const subprocess = execa({ cwd, shell: "/bin/bash", reject: false })`${command}`
+			subprocess.stdout.pipe(process.stdout)
+			subprocess.stderr.pipe(process.stderr)
 
 			const timeout = setTimeout(async () => {
 				const descendants = await new Promise<number[]>((resolve, reject) => {
@@ -40,28 +49,25 @@ export const runUnitTest = async ({ task }: { task: Task }) => {
 					})
 				})
 
-				console.log(
-					`${Date.now()} [${tag}] "${command.join(" ")}": unit tests timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`,
-				)
+				log(`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`)
 
 				if (descendants.length > 0) {
 					for (const descendant of descendants) {
 						try {
-							console.log(`${Date.now()} [${tag}] killing ${descendant}`)
-
+							log(`killing descendant process ${descendant}`)
 							await execa`kill -9 ${descendant}`
 						} catch (error) {
-							console.error(`${Date.now()} [${tag}] Error killing descendant processes:`, error)
+							logError(`failed to kill descendant process ${descendant}:`, error)
 						}
 					}
 				}
 
-				console.log(`${Date.now()} [${tag}] killing ${subprocess.pid}`)
+				log(`killing main process ${subprocess.pid}`)
 
 				try {
 					await execa`kill -9 ${subprocess.pid!}`
 				} catch (error) {
-					console.error(`${Date.now()} [${tag}] Error killing process:`, error)
+					logError(`failed to kill main process ${subprocess.pid}:`, error)
 				}
 			}, UNIT_TEST_TIMEOUT)
 
@@ -74,7 +80,7 @@ export const runUnitTest = async ({ task }: { task: Task }) => {
 				break
 			}
 		} catch (error) {
-			console.error(`${Date.now()} [${tag}]`, error)
+			logError(`unexpected error:`, error)
 			passed = false
 			break
 		}

+ 15 - 0
packages/evals/src/cli/utils.ts

@@ -1,5 +1,7 @@
 import * as fs from "fs"
 
+import { execa } from "execa"
+
 import type { Run, Task } from "../db/index.js"
 
 export const getTag = (caller: string, { run, task }: { run: Run; task?: Task }) =>
@@ -14,3 +16,16 @@ export const isDockerContainer = () => {
 		return false
 	}
 }
+
+export const resetEvalsRepo = async ({ run, cwd }: { run: Run; cwd: string }) => {
+	await execa({ cwd })`git config user.name "Roo Code"`
+	await execa({ cwd })`git config user.email "[email protected]"`
+	await execa({ cwd })`git checkout -f`
+	await execa({ cwd })`git clean -fd`
+	await execa({ cwd })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
+}
+
+export const commitEvalsRepoChanges = async ({ run, cwd }: { run: Run; cwd: string }) => {
+	await execa({ cwd })`git add .`
+	await execa({ cwd })`git commit -m ${`Run #${run.id}`} --no-verify`
+}

+ 19 - 1
packages/ipc/src/ipc-client.ts

@@ -3,7 +3,14 @@ import * as crypto from "node:crypto"
 
 import ipc from "node-ipc"
 
-import { type IpcClientEvents, type IpcMessage, IpcOrigin, IpcMessageType, ipcMessageSchema } from "@roo-code/types"
+import {
+	type TaskCommand,
+	type IpcClientEvents,
+	type IpcMessage,
+	IpcOrigin,
+	IpcMessageType,
+	ipcMessageSchema,
+} from "@roo-code/types"
 
 export class IpcClient extends EventEmitter<IpcClientEvents> {
 	private readonly _socketPath: string
@@ -80,6 +87,17 @@ export class IpcClient extends EventEmitter<IpcClientEvents> {
 		this._log(...args)
 	}
 
+	public sendCommand(command: TaskCommand) {
+		const message: IpcMessage = {
+			type: IpcMessageType.TaskCommand,
+			origin: IpcOrigin.Client,
+			clientId: this._clientId!,
+			data: command,
+		}
+
+		this.sendMessage(message)
+	}
+
 	public sendMessage(message: IpcMessage) {
 		ipc.of[this._id]?.emit("message", message)
 	}