Chris Estreich 8 luni în urmă
părinte
comite
8d5dab3518

+ 74 - 0
.github/workflows/evals.yml

@@ -0,0 +1,74 @@
+name: Evals
+
+on:
+    pull_request:
+        types: [labeled]
+    workflow_dispatch:
+
+env:
+    DOCKER_BUILDKIT: 1
+    COMPOSE_DOCKER_CLI_BUILD: 1
+
+jobs:
+    evals:
+        # Run if triggered manually, or when a label whose name contains 'evals' is added to the PR.
+        if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals')
+        runs-on: blacksmith-16vcpu-ubuntu-2404
+        timeout-minutes: 45
+
+        defaults:
+            run:
+                working-directory: packages/evals
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+
+            - name: Set up Docker Buildx
+              uses: docker/setup-buildx-action@v3
+
+            - name: Create environment
+              run: |
+                  cat > .env.local << EOF
+                  OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }}
+                  EOF
+
+                  cat > .env.development << EOF
+                  NODE_ENV=development
+                  DATABASE_URL=postgresql://postgres:password@db:5432/evals_development
+                  REDIS_URL=redis://redis:6379
+                  HOST_EXECUTION_METHOD=docker
+                  EOF
+
+            - name: Build image
+              uses: docker/build-push-action@v5
+              with:
+                  context: .
+                  file: packages/evals/Dockerfile.runner
+                  tags: evals-runner:latest
+                  cache-from: type=gha
+                  cache-to: type=gha,mode=max
+                  push: false
+                  load: true
+
+            - name: Tag image
+              run: docker tag evals-runner:latest evals-runner
+
+            - name: Start containers
+              run: |
+                  docker compose up -d db redis
+                  timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done'
+                  timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done'
+                  docker compose run --rm runner sh -c 'nc -z db 5432 && echo "✓ Runner -> Database connection successful"'
+                  docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "✓ Runner -> Redis connection successful"'
+                  docker compose run --rm runner docker ps
+
+            - name: Run database migrations
+              run: docker compose run --rm runner pnpm --filter @roo-code/evals db:migrate
+
+            - name: Run evals
+              run: docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci
+
+            - name: Cleanup
+              if: always()
+              run: docker compose down -v --remove-orphans

+ 4 - 19
apps/web-evals/src/actions/exercises.ts

@@ -1,37 +1,22 @@
 "use server"
 
-import * as fs from "fs/promises"
 import * as path from "path"
 import { fileURLToPath } from "url"
 
-import { type ExerciseLanguage, exerciseLanguages } from "@roo-code/evals"
+import { exerciseLanguages, listDirectories } from "@roo-code/evals"
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url)) // <repo>/apps/web-evals/src/actions
 
-const EXERCISES_BASE_PATH = path.resolve(__dirname, "../../../../../evals")
-
-export const listDirectories = async (relativePath: string) => {
-	try {
-		const targetPath = path.resolve(__dirname, relativePath)
-		const entries = await fs.readdir(targetPath, { withFileTypes: true })
-		return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name)
-	} catch (error) {
-		console.error(`Error listing directories at ${relativePath}:`, error)
-		return []
-	}
-}
+const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals")
 
 export const getExercises = async () => {
 	const result = await Promise.all(
 		exerciseLanguages.map(async (language) => {
-			const languagePath = path.join(EXERCISES_BASE_PATH, language)
-			const exercises = await listDirectories(languagePath)
+			const languagePath = path.join(EVALS_REPO_PATH, language)
+			const exercises = await listDirectories(__dirname, languagePath)
 			return exercises.map((exercise) => `${language}/${exercise}`)
 		}),
 	)
 
 	return result.flat()
 }
-
-export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
-	listDirectories(path.join(EXERCISES_BASE_PATH, language))

+ 7 - 4
apps/web-evals/src/actions/runs.ts

@@ -1,7 +1,9 @@
 "use server"
 
-import { spawn } from "child_process"
+import * as path from "path"
 import fs from "fs"
+import { fileURLToPath } from "url"
+import { spawn } from "child_process"
 
 import { revalidatePath } from "next/cache"
 import pMap from "p-map"
@@ -12,11 +14,12 @@ import {
 	createRun as _createRun,
 	deleteRun as _deleteRun,
 	createTask,
+	getExercisesForLanguage,
 } from "@roo-code/evals"
 
 import { CreateRun } from "@/lib/schemas"
 
-import { getExercisesForLanguage } from "./exercises"
+const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
 
 // eslint-disable-next-line @typescript-eslint/no-unused-vars
 export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
@@ -37,9 +40,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, ...values
 		}
 	} else {
 		for (const language of exerciseLanguages) {
-			const exercises = await getExercisesForLanguage(language)
+			const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
 
-			await pMap(exercises, (exercise) => createTask({ ...values, runId: run.id, language, exercise }), {
+			await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
 				concurrency: 10,
 			})
 		}

+ 24 - 0
apps/web-evals/src/app/api/health/route.ts

@@ -0,0 +1,24 @@
+import { NextResponse } from "next/server"
+
+/**
+ * Health-check endpoint.
+ *
+ * Responds with HTTP 200 and a JSON body containing the status, the current
+ * ISO timestamp, the process uptime and the NODE_ENV value (falling back to
+ * "production"). If building that response throws, responds with HTTP 503 and
+ * the error message instead.
+ */
+export async function GET() {
+	try {
+		const healthy = {
+			status: "healthy",
+			timestamp: new Date().toISOString(),
+			uptime: process.uptime(),
+			environment: process.env.NODE_ENV || "production",
+		}
+
+		return NextResponse.json(healthy, { status: 200 })
+	} catch (error) {
+		const message = error instanceof Error ? error.message : "Unknown error"
+		const unhealthy = { status: "unhealthy", timestamp: new Date().toISOString(), error: message }
+
+		return NextResponse.json(unhealthy, { status: 503 })
+	}
+}

+ 1 - 0
packages/evals/Dockerfile.runner

@@ -13,6 +13,7 @@ RUN apt update && \
   git \
   vim \
   jq \
+  netcat-openbsd \
   apt-transport-https \
   ca-certificates \
   gnupg \

+ 1 - 1
packages/evals/Dockerfile.web

@@ -8,7 +8,7 @@ RUN npm install -g npm@latest
 RUN npm install -g npm-run-all
 
 # Install system packages
-RUN apt update && apt install -y curl git vim jq postgresql-client
+RUN apt update && apt install -y curl git vim jq netcat-openbsd postgresql-client
 
 # Install Docker cli
 RUN apt install -y apt-transport-https ca-certificates gnupg lsb-release

+ 8 - 4
packages/evals/docker-compose.yml

@@ -17,8 +17,10 @@ services:
     db:
         container_name: evals-db
         image: postgres:15.4
-        expose:
-            - 5432
+        # expose:
+        #     - 5432
+        ports:
+            - "${EVALS_DB_PORT:-5432}:5432"
         volumes:
             - ./.docker/postgres:/var/lib/postgresql/data
             - ./.docker/scripts/postgres:/docker-entrypoint-initdb.d
@@ -38,8 +40,10 @@ services:
     redis:
         container_name: evals-redis
         image: redis:7-alpine
-        expose:
-            - 6379
+        # expose:
+        #     - 6379
+        ports:
+            - "${EVALS_REDIS_PORT:-6379}:6379"
         volumes:
             - ./.docker/redis:/data
         command: redis-server --appendonly yes

+ 2 - 1
packages/evals/package.json

@@ -21,7 +21,8 @@
 		"db:start": "docker compose up -d db",
 		"db:stop": "docker compose down db",
 		"redis:start": "docker compose up -d redis",
-		"redis:stop": "docker compose down redis"
+		"redis:stop": "docker compose down redis",
+		"services:start": "docker compose up -d db redis"
 	},
 	"dependencies": {
 		"@roo-code/ipc": "workspace:^",

+ 0 - 86
packages/evals/src/cli/FileLogger.ts

@@ -1,86 +0,0 @@
-import * as fs from "fs"
-import * as path from "path"
-
-export enum LogLevel {
-	INFO = "INFO",
-	ERROR = "ERROR",
-	WARN = "WARN",
-	DEBUG = "DEBUG",
-}
-
-export interface LoggerOptions {
-	logDir: string
-	filename: string
-	tag: string
-}
-
-export class FileLogger {
-	private logStream: fs.WriteStream | undefined
-	private logFilePath: string
-	private tag: string
-
-	constructor({ logDir, filename, tag }: LoggerOptions) {
-		this.tag = tag
-		this.logFilePath = path.join(logDir, filename)
-		this.initializeLogger(logDir)
-	}
-
-	private initializeLogger(logDir: string): void {
-		try {
-			fs.mkdirSync(logDir, { recursive: true })
-		} catch (error) {
-			console.error(`Failed to create log directory ${logDir}:`, error)
-		}
-
-		try {
-			this.logStream = fs.createWriteStream(this.logFilePath, { flags: "a" })
-		} catch (error) {
-			console.error(`Failed to create log file ${this.logFilePath}:`, error)
-		}
-	}
-
-	private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
-		try {
-			const timestamp = new Date().toISOString()
-
-			const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${
-				args.length > 0 ? JSON.stringify(args) : ""
-			}\n`
-
-			console.log(logLine.trim())
-
-			if (this.logStream) {
-				this.logStream.write(logLine)
-			}
-		} catch (error) {
-			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
-		}
-	}
-
-	public info(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.INFO, message, ...args)
-	}
-
-	public error(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.ERROR, message, ...args)
-	}
-
-	public warn(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.WARN, message, ...args)
-	}
-
-	public debug(message: string, ...args: unknown[]): void {
-		this.writeToLog(LogLevel.DEBUG, message, ...args)
-	}
-
-	public log(message: string, ...args: unknown[]): void {
-		this.info(message, ...args)
-	}
-
-	public close(): void {
-		if (this.logStream) {
-			this.logStream.end()
-			this.logStream = undefined
-		}
-	}
-}

+ 14 - 16
packages/evals/src/cli/index.ts

@@ -1,11 +1,12 @@
 import * as fs from "fs"
 
-import { command, run, number, option } from "cmd-ts"
+import { run, command, option, flag, number, boolean } from "cmd-ts"
 
-import { exercisesPath } from "../exercises/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
 
+import { runCi } from "./runCi.js"
 import { runEvals } from "./runEvals.js"
-import { processTask } from "./processTask.js"
+import { processTask } from "./runTask.js"
 
 const main = async () => {
 	await run(
@@ -14,25 +15,22 @@ const main = async () => {
 			description: "Execute an eval run.",
 			version: "0.0.0",
 			args: {
+				ci: flag({ type: boolean, long: "ci", defaultValue: () => false }),
 				runId: option({ type: number, long: "runId", short: "r", defaultValue: () => -1 }),
 				taskId: option({ type: number, long: "taskId", short: "t", defaultValue: () => -1 }),
 			},
 			handler: async (args) => {
-				const { runId, taskId } = args
-
-				if (runId === -1 && taskId === -1) {
-					throw new Error("Either runId or taskId must be provided.")
-				}
-
-				if (runId !== -1 && taskId !== -1) {
-					throw new Error("Only one of runId or taskId must be provided.")
-				}
+				const { runId, taskId, ci } = args
 
 				try {
-					if (runId !== -1) {
+					if (ci) {
+						await runCi({ concurrency: 3, exercisesPerLanguage: 5 })
+					} else if (runId !== -1) {
 						await runEvals(runId)
-					} else {
+					} else if (taskId !== -1) {
 						await processTask({ taskId })
+					} else {
+						throw new Error("Either runId or taskId must be provided.")
 					}
 				} catch (error) {
 					console.error(error)
@@ -46,9 +44,9 @@ const main = async () => {
 	process.exit(0)
 }
 
-if (!fs.existsSync(exercisesPath)) {
+if (!fs.existsSync(EVALS_REPO_PATH)) {
 	console.error(
-		`Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
+		`Exercises do not exist at ${EVALS_REPO_PATH}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
 	)
 
 	process.exit(1)

+ 0 - 112
packages/evals/src/cli/processTask.ts

@@ -1,112 +0,0 @@
-import { execa } from "execa"
-
-import { RooCodeEventName, type TaskEvent } from "@roo-code/types"
-
-import { findTask, updateTask, findRun } from "../db/index.js"
-
-import { getTag } from "./utils.js"
-import { FileLogger } from "./FileLogger.js"
-import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
-import { runTask } from "./runTask.js"
-import { runUnitTest } from "./runUnitTest.js"
-
-export const processTask = async ({ taskId, logger }: { taskId: number; logger?: FileLogger }) => {
-	const task = await findTask(taskId)
-	const { language, exercise } = task
-	const run = await findRun(task.runId)
-	await registerRunner({ runId: run.id, taskId })
-
-	logger =
-		logger ||
-		new FileLogger({
-			logDir: `/var/log/evals/runs/${run.id}`,
-			filename: `${language}-${exercise}.log`,
-			tag: getTag("runTask", { run, task }),
-		})
-
-	try {
-		const publish = async (e: TaskEvent) => {
-			const redis = await redisClient()
-			await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
-		}
-
-		logger.info(`running task ${task.id} (${language}/${exercise})...`)
-		await runTask({ run, task, publish, logger })
-
-		logger.info(`testing task ${task.id} (${language}/${exercise})...`)
-		const passed = await runUnitTest({ run, task })
-
-		logger.info(`task ${task.id} (${language}/${exercise}) -> ${passed}`)
-		await updateTask(task.id, { passed })
-
-		await publish({
-			eventName: passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail,
-			taskId: task.id,
-		})
-	} finally {
-		await deregisterRunner({ runId: run.id, taskId })
-	}
-}
-
-export const processTaskInContainer = async ({
-	taskId,
-	logger,
-	maxRetries = 10,
-}: {
-	taskId: number
-	logger: FileLogger
-	maxRetries?: number
-}) => {
-	const baseArgs = [
-		"--rm",
-		"--network evals_default",
-		"-v /var/run/docker.sock:/var/run/docker.sock",
-		"-v /tmp/evals:/var/log/evals",
-		"-e HOST_EXECUTION_METHOD=docker",
-	]
-
-	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
-	logger.info(command)
-
-	for (let attempt = 0; attempt <= maxRetries; attempt++) {
-		const containerName = `evals-task-${taskId}.${attempt}`
-		const args = [`--name ${containerName}`, ...baseArgs]
-		const isRetry = attempt > 0
-
-		if (isRetry) {
-			const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
-			logger.info(`retrying in ${delayMs}ms (attempt ${attempt + 1}/${maxRetries + 1})`)
-			await new Promise((resolve) => setTimeout(resolve, delayMs))
-		}
-
-		logger.info(
-			`${isRetry ? "retrying" : "executing"} container command (attempt ${attempt + 1}/${maxRetries + 1})`,
-		)
-
-		const subprocess = execa(`docker run ${args.join(" ")} evals-runner sh -c "${command}"`, { shell: true })
-		// subprocess.stdout?.on("data", (data) => console.log(data.toString()))
-		// subprocess.stderr?.on("data", (data) => console.error(data.toString()))
-
-		try {
-			const result = await subprocess
-			logger.info(`container process completed with exit code: ${result.exitCode}`)
-			return
-		} catch (error) {
-			if (error && typeof error === "object" && "exitCode" in error) {
-				logger.error(
-					`container process failed with exit code: ${error.exitCode} (attempt ${attempt + 1}/${maxRetries + 1})`,
-				)
-			} else {
-				logger.error(`container process failed with error: ${error} (attempt ${attempt + 1}/${maxRetries + 1})`)
-			}
-
-			if (attempt === maxRetries) {
-				break
-			}
-		}
-	}
-
-	logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
-
-	// TODO: Mark task as failed.
-}

+ 30 - 0
packages/evals/src/cli/runCi.ts

@@ -0,0 +1,30 @@
+import pMap from "p-map"
+
+import { EVALS_REPO_PATH, exerciseLanguages, getExercisesForLanguage } from "../exercises/index.js"
+import { createRun, createTask } from "../db/index.js"
+
+import { runEvals } from "./runEvals.js"
+
+/**
+ * Creates a new run, adds tasks for every exercise language and then
+ * executes the run via `runEvals`.
+ *
+ * @param concurrency Stored on the created run and also used as the
+ *   concurrency for task creation. Defaults to 1.
+ * @param exercisesPerLanguage When set to a non-zero number, only the first
+ *   N exercises listed for each language are included; otherwise all of them.
+ */
+export const runCi = async ({
+	concurrency = 1,
+	exercisesPerLanguage,
+}: {
+	concurrency?: number
+	exercisesPerLanguage?: number
+} = {}) => {
+	console.log("Running evals in CI mode.")
+
+	const run = await createRun({ model: "anthropic/claude-sonnet-4", socketPath: "", concurrency })
+
+	for (const language of exerciseLanguages) {
+		const available = await getExercisesForLanguage(EVALS_REPO_PATH, language)
+
+		// A falsy limit (undefined or 0) means "use every exercise".
+		const selected = exercisesPerLanguage ? available.slice(0, exercisesPerLanguage) : available
+
+		await pMap(selected, (exercise) => createTask({ runId: run.id, language, exercise }), { concurrency })
+	}
+
+	await runEvals(run.id)
+}

+ 6 - 7
packages/evals/src/cli/runEvals.ts

@@ -1,12 +1,11 @@
 import PQueue from "p-queue"
 
 import { findRun, finishRun, getTasks } from "../db/index.js"
-import { exercisesPath } from "../exercises/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
 
-import { getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
-import { processTask, processTaskInContainer } from "./processTask.js"
+import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
 import { startHeartbeat, stopHeartbeat } from "./redis.js"
-import { FileLogger } from "./FileLogger.js"
+import { processTask, processTaskInContainer } from "./runTask.js"
 
 export const runEvals = async (runId: number) => {
 	const run = await findRun(runId)
@@ -21,7 +20,7 @@ export const runEvals = async (runId: number) => {
 		throw new Error(`Run ${run.id} has no tasks.`)
 	}
 
-	const logger = new FileLogger({
+	const logger = new Logger({
 		logDir: `/var/log/evals/runs/${run.id}`,
 		filename: `controller.log`,
 		tag: getTag("runEvals", { run }),
@@ -32,7 +31,7 @@ export const runEvals = async (runId: number) => {
 	const containerized = isDockerContainer()
 
 	if (!containerized) {
-		await resetEvalsRepo({ run, cwd: exercisesPath })
+		await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH })
 	}
 
 	const heartbeat = await startHeartbeat(run.id)
@@ -63,7 +62,7 @@ export const runEvals = async (runId: number) => {
 		// will be lost when the container is destroyed. I think we should
 		// store the diffs in the database instead.
 		if (!containerized) {
-			await commitEvalsRepoChanges({ run, cwd: exercisesPath })
+			await commitEvalsRepoChanges({ run, cwd: EVALS_REPO_PATH })
 		}
 	} finally {
 		logger.info("cleaning up")

+ 127 - 14
packages/evals/src/cli/runTask.ts

@@ -15,11 +15,21 @@ import {
 } from "@roo-code/types"
 import { IpcClient } from "@roo-code/ipc"
 
-import { type Run, type Task, updateTask, createTaskMetrics, updateTaskMetrics, createToolError } from "../db/index.js"
-import { exercisesPath } from "../exercises/index.js"
-
-import { isDockerContainer } from "./utils.js"
-import { FileLogger } from "./FileLogger.js"
+import {
+	type Run,
+	type Task,
+	findRun,
+	findTask,
+	updateTask,
+	createTaskMetrics,
+	updateTaskMetrics,
+	createToolError,
+} from "../db/index.js"
+import { EVALS_REPO_PATH } from "../exercises/index.js"
+
+import { Logger, getTag, isDockerContainer } from "./utils.js"
+import { redisClient, getPubSubKey, registerRunner, deregisterRunner } from "./redis.js"
+import { runUnitTest } from "./runUnitTest.js"
 
 class SubprocessTimeoutError extends Error {
 	constructor(timeout: number) {
@@ -28,17 +38,118 @@ class SubprocessTimeoutError extends Error {
 	}
 }
 
+/**
+ * Processes one task: looks up the task and its run, registers this runner,
+ * runs the task, runs the unit tests, stores the pass/fail result and
+ * publishes an EvalPass / EvalFail event on the run's pub/sub key.
+ *
+ * The runner is always deregistered afterwards, whether or not a step threw.
+ *
+ * @param taskId Id of the task to process.
+ * @param logger Optional logger; when omitted a per-task Logger writing to
+ *   `/var/log/evals/runs/<runId>/<language>-<exercise>.log` is created.
+ */
+export const processTask = async ({ taskId, logger }: { taskId: number; logger?: Logger }) => {
+	const task = await findTask(taskId)
+	const run = await findRun(task.runId)
+	const { language, exercise } = task
+	const label = `${task.id} (${language}/${exercise})`
+
+	await registerRunner({ runId: run.id, taskId })
+
+	const log =
+		logger ||
+		new Logger({
+			logDir: `/var/log/evals/runs/${run.id}`,
+			filename: `${language}-${exercise}.log`,
+			tag: getTag("runTask", { run, task }),
+		})
+
+	// Forwards a task event to subscribers of this run.
+	const publish = async (e: TaskEvent) => {
+		const redis = await redisClient()
+		await redis.publish(getPubSubKey(run.id), JSON.stringify(e))
+	}
+
+	try {
+		log.info(`running task ${label}...`)
+		await runTask({ run, task, publish, logger: log })
+
+		log.info(`testing task ${label}...`)
+		const passed = await runUnitTest({ task, logger: log })
+
+		log.info(`task ${label} -> ${passed}`)
+		await updateTask(task.id, { passed })
+
+		const eventName = passed ? RooCodeEventName.EvalPass : RooCodeEventName.EvalFail
+		await publish({ eventName, taskId: task.id })
+	} finally {
+		await deregisterRunner({ runId: run.id, taskId })
+	}
+}
+
+/**
+ * Runs the CLI for a single task inside a fresh `evals-runner` container,
+ * retrying with jittered exponential backoff when the container command fails.
+ *
+ * Returns as soon as one attempt succeeds. If every attempt fails, the failure
+ * is logged and the function returns normally (it does not throw).
+ *
+ * @param taskId Id of the task passed to the CLI via `--taskId`.
+ * @param logger Logger that receives progress and failure messages.
+ * @param maxRetries Number of retries after the first attempt. Defaults to 10.
+ */
+export const processTaskInContainer = async ({
+	taskId,
+	logger,
+	maxRetries = 10,
+}: {
+	taskId: number
+	logger: Logger
+	maxRetries?: number
+}) => {
+	const sharedArgs = [
+		"--rm",
+		"--network evals_default",
+		"-v /var/run/docker.sock:/var/run/docker.sock",
+		"-v /tmp/evals:/var/log/evals",
+		"-e HOST_EXECUTION_METHOD=docker",
+	]
+
+	const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}`
+	logger.info(command)
+
+	for (let attempt = 0; attempt <= maxRetries; attempt++) {
+		const progress = `(attempt ${attempt + 1}/${maxRetries + 1})`
+		// Each attempt gets its own container name.
+		const dockerArgs = [`--name evals-task-${taskId}.${attempt}`, ...sharedArgs].join(" ")
+		const retrying = attempt > 0
+
+		if (retrying) {
+			// Exponential backoff (1s, 2s, 4s, ...) scaled by a random factor in [0.5, 1.5).
+			const delayMs = Math.pow(2, attempt - 1) * 1000 * (0.5 + Math.random())
+			logger.info(`retrying in ${delayMs}ms ${progress}`)
+			await new Promise((resolve) => setTimeout(resolve, delayMs))
+		}
+
+		logger.info(`${retrying ? "retrying" : "executing"} container command ${progress}`)
+
+		const subprocess = execa(`docker run ${dockerArgs} evals-runner sh -c "${command}"`, { shell: true })
+
+		try {
+			const { exitCode } = await subprocess
+			logger.info(`container process completed with exit code: ${exitCode}`)
+			return
+		} catch (error) {
+			const reason =
+				error && typeof error === "object" && "exitCode" in error
+					? `exit code: ${error.exitCode}`
+					: `error: ${error}`
+
+			logger.error(`container process failed with ${reason} ${progress}`)
+		}
+	}
+
+	logger.error(`all ${maxRetries + 1} attempts failed, giving up`)
+
+	// TODO: Mark task as failed.
+}
+
 type RunTaskOptions = {
 	run: Run
 	task: Task
 	publish: (taskEvent: TaskEvent) => Promise<void>
-	logger: FileLogger
+	logger: Logger
 }
 
 export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => {
 	const { language, exercise } = task
-	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
-	const workspacePath = path.resolve(exercisesPath, language, exercise)
+	const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8")
+	const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise)
 	const ipcSocketPath = path.resolve(os.tmpdir(), `evals-${run.id}-${task.id}.sock`)
 	const env = { ROO_CODE_IPC_SOCKET_PATH: ipcSocketPath }
 	const controller = new AbortController()
@@ -87,6 +198,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 	let taskStartedAt = Date.now()
 	let taskFinishedAt: number | undefined
 	let taskAbortedAt: number | undefined
+	let taskTimedOut: boolean = false
 	let taskMetricsId: number | undefined
 	let rooTaskId: string | undefined
 	let isClientDisconnected = false
@@ -196,6 +308,7 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 			timeout: EVALS_TIMEOUT,
 		})
 	} catch (_error) {
+		taskTimedOut = true
 		logger.error("time limit reached")
 
 		if (rooTaskId && !isClientDisconnected) {
@@ -207,16 +320,16 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 		taskFinishedAt = Date.now()
 	}
 
-	if (taskFinishedAt) {
-		logger.info("setting task finished at")
-		await updateTask(task.id, { finishedAt: new Date() })
-	}
-
-	if (!taskFinishedAt && isClientDisconnected) {
+	if (!taskFinishedAt && !taskTimedOut) {
 		logger.error("client disconnected before task finished")
 		throw new Error("Client disconnected before task completion.")
 	}
 
+	// An unexpected abort or client disconnect has already thrown above (to
+	// trigger a retry), so here the task either finished or timed out.
+	logger.info("setting task finished at")
+	await updateTask(task.id, { finishedAt: new Date() })
+
 	if (rooTaskId && !isClientDisconnected) {
 		logger.info("closing task")
 		client.sendCommand({ commandName: TaskCommandName.CloseTask, data: rooTaskId })

+ 19 - 18
packages/evals/src/cli/runUnitTest.ts

@@ -3,14 +3,14 @@ import * as path from "path"
 import { execa, parseCommandString } from "execa"
 import psTree from "ps-tree"
 
-import type { Run, Task } from "../db/index.js"
-import { type ExerciseLanguage, exercisesPath } from "../exercises/index.js"
+import type { Task } from "../db/index.js"
+import { type ExerciseLanguage, EVALS_REPO_PATH } from "../exercises/index.js"
 
-import { getTag } from "./utils.js"
+import { Logger } from "./utils.js"
 
 const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000
 
-const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
+const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number }> = {
 	go: { commands: ["go test"] },
 	java: { commands: ["./gradlew test"] },
 	javascript: { commands: ["pnpm install", "pnpm test"] },
@@ -18,22 +18,21 @@ const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: num
 	rust: { commands: ["cargo test"] },
 }
 
-export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
-	const tag = getTag("runUnitTest", { run, task })
-	const log = (message: string, ...args: unknown[]) => console.log(`[${Date.now()} | ${tag}] ${message}`, ...args)
-	const logError = (message: string, ...args: unknown[]) =>
-		console.error(`[${Date.now()} | ${tag}] ${message}`, ...args)
+type RunUnitTestOptions = {
+	task: Task
+	logger: Logger
+}
 
+export const runUnitTest = async ({ task, logger }: RunUnitTestOptions) => {
 	const cmd = testCommands[task.language]
-	const exercisePath = path.resolve(exercisesPath, task.language, task.exercise)
-	const cwd = cmd.cwd ? path.resolve(exercisePath, cmd.cwd) : exercisePath
+	const cwd = path.resolve(EVALS_REPO_PATH, task.language, task.exercise)
 	const commands = cmd.commands.map((cs) => parseCommandString(cs))
 
 	let passed = true
 
 	for (const command of commands) {
 		try {
-			log(`running "${command.join(" ")}"`)
+			logger.info(`running "${command.join(" ")}"`)
 			const subprocess = execa({ cwd, shell: "/bin/bash", reject: false })`${command}`
 			subprocess.stdout.pipe(process.stdout)
 			subprocess.stderr.pipe(process.stderr)
@@ -49,25 +48,27 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
 					})
 				})
 
-				log(`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`)
+				logger.info(
+					`"${command.join(" ")}" timed out, killing ${subprocess.pid} + ${JSON.stringify(descendants)}`,
+				)
 
 				if (descendants.length > 0) {
 					for (const descendant of descendants) {
 						try {
-							log(`killing descendant process ${descendant}`)
+							logger.info(`killing descendant process ${descendant}`)
 							await execa`kill -9 ${descendant}`
 						} catch (error) {
-							logError(`failed to kill descendant process ${descendant}:`, error)
+							logger.error(`failed to kill descendant process ${descendant}:`, error)
 						}
 					}
 				}
 
-				log(`killing main process ${subprocess.pid}`)
+				logger.info(`killing main process ${subprocess.pid}`)
 
 				try {
 					await execa`kill -9 ${subprocess.pid!}`
 				} catch (error) {
-					logError(`failed to kill main process ${subprocess.pid}:`, error)
+					logger.error(`failed to kill main process ${subprocess.pid}:`, error)
 				}
 			}, UNIT_TEST_TIMEOUT)
 
@@ -80,7 +81,7 @@ export const runUnitTest = async ({ run, task }: { run: Run; task: Task }) => {
 				break
 			}
 		} catch (error) {
-			logError(`unexpected error:`, error)
+			logger.error(`unexpected error:`, error)
 			passed = false
 			break
 		}

+ 85 - 0
packages/evals/src/cli/utils.ts

@@ -1,4 +1,5 @@
 import * as fs from "fs"
+import * as path from "path"
 
 import { execa } from "execa"
 
@@ -29,3 +30,87 @@ export const commitEvalsRepoChanges = async ({ run, cwd }: { run: Run; cwd: stri
 	await execa({ cwd })`git add .`
 	await execa({ cwd })`git commit -m ${`Run #${run.id}`} --no-verify`
 }
+
+/** Severity labels embedded in each log line. */
+enum LogLevel {
+	INFO = "INFO",
+	ERROR = "ERROR",
+	WARN = "WARN",
+	DEBUG = "DEBUG",
+}
+
+/** Construction options for `Logger`. */
+interface LoggerOptions {
+	/** Directory that holds the log file; created recursively if missing. */
+	logDir: string
+	/** Name of the log file, joined onto `logDir`. */
+	filename: string
+	/** Label included in the prefix of every log line. */
+	tag: string
+}
+
+/**
+ * Logger that echoes every line to the console and appends it to a file.
+ *
+ * Logging is best-effort: failures to create the directory, open the file or
+ * write a line are reported on `console.error` and never thrown to the caller.
+ */
+export class Logger {
+	private logStream: fs.WriteStream | undefined
+	private logFilePath: string
+	private tag: string
+
+	constructor({ logDir, filename, tag }: LoggerOptions) {
+		this.tag = tag
+		this.logFilePath = path.join(logDir, filename)
+		this.initializeLogger(logDir)
+	}
+
+	/** JSON.stringify replacer: Error instances would otherwise serialize as `{}`. */
+	private static serializeValue(_key: string, value: unknown): unknown {
+		if (value instanceof Error) {
+			return { name: value.name, message: value.message, stack: value.stack }
+		}
+
+		return value
+	}
+
+	/** Creates the log directory and opens the log file in append mode. */
+	private initializeLogger(logDir: string): void {
+		try {
+			fs.mkdirSync(logDir, { recursive: true })
+		} catch (error) {
+			console.error(`Failed to create log directory ${logDir}:`, error)
+		}
+
+		try {
+			const stream = fs.createWriteStream(this.logFilePath, { flags: "a" })
+
+			// Open/write failures arrive asynchronously as an 'error' event rather
+			// than as an exception; without a listener they would crash the process.
+			stream.on("error", (error) => {
+				console.error(`Failed to write to log file ${this.logFilePath}:`, error)
+
+				if (this.logStream === stream) {
+					// Fall back to console-only logging.
+					this.logStream = undefined
+				}
+			})
+
+			this.logStream = stream
+		} catch (error) {
+			console.error(`Failed to create log file ${this.logFilePath}:`, error)
+		}
+	}
+
+	/** Formats one line, prints it to the console and appends it to the file. */
+	private writeToLog(level: LogLevel, message: string, ...args: unknown[]) {
+		try {
+			const timestamp = new Date().toISOString()
+			const details = args.length > 0 ? JSON.stringify(args, Logger.serializeValue) : ""
+			const logLine = `[${timestamp} | ${level} | ${this.tag}] ${message} ${details}\n`
+
+			console.log(logLine.trim())
+
+			if (this.logStream) {
+				this.logStream.write(logLine)
+			}
+		} catch (error) {
+			console.error(`Failed to write to log file ${this.logFilePath}:`, error)
+		}
+	}
+
+	public info(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.INFO, message, ...args)
+	}
+
+	public error(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.ERROR, message, ...args)
+	}
+
+	public warn(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.WARN, message, ...args)
+	}
+
+	public debug(message: string, ...args: unknown[]): void {
+		this.writeToLog(LogLevel.DEBUG, message, ...args)
+	}
+
+	/** Alias for `info`. */
+	public log(message: string, ...args: unknown[]): void {
+		this.info(message, ...args)
+	}
+
+	/** Ends the file stream; later log calls only reach the console. */
+	public close(): void {
+		if (this.logStream) {
+			this.logStream.end()
+			this.logStream = undefined
+		}
+	}
+}

+ 5 - 5
packages/evals/src/exercises/index.ts

@@ -4,15 +4,15 @@ import { fileURLToPath } from "url"
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url))
 
-export const exercisesPath = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals")
+export const EVALS_REPO_PATH = path.resolve(__dirname, "..", "..", "..", "..", "..", "evals")
 
 export const exerciseLanguages = ["go", "java", "javascript", "python", "rust"] as const
 
 export type ExerciseLanguage = (typeof exerciseLanguages)[number]
 
-const listDirectories = async (relativePath: string) => {
+export const listDirectories = async (basePath: string, relativePath: string) => {
 	try {
-		const targetPath = path.resolve(__dirname, relativePath)
+		const targetPath = path.resolve(basePath, relativePath)
 		const entries = await fs.readdir(targetPath, { withFileTypes: true })
 		return entries.filter((entry) => entry.isDirectory() && !entry.name.startsWith(".")).map((entry) => entry.name)
 	} catch (error) {
@@ -21,5 +21,5 @@ const listDirectories = async (relativePath: string) => {
 	}
 }
 
-export const getExercisesForLanguage = async (language: ExerciseLanguage) =>
-	listDirectories(path.join(exercisesPath, language))
+export const getExercisesForLanguage = async (basePath: string, language: ExerciseLanguage) =>
+	listDirectories(__dirname, path.join(basePath, language))