Browse Source

[Evals] Headless evals with Docker (#4121)

* [Evals] Try to resurrect headless evals with Docker

* fixes; now it runs

* rm

* remove commented out files

* remove trailing space

* nit

* throw error on concurrency attempt
Canyon Robins 7 months ago
parent
commit
6762b579b2
4 changed files with 150 additions and 18 deletions
  1. 18 0
      .dockerignore
  2. 78 0
      evals/Dockerfile
  3. 46 17
      evals/apps/cli/src/index.ts
  4. 8 1
      evals/package.json

+ 18 - 0
.dockerignore

@@ -0,0 +1,18 @@
+# Build artifacts
+bin/
+!bin/roo-code-latest.vsix
+dist/
+**/dist/
+out/
+**/out/
+
+# Dependencies
+node_modules/
+**/node_modules/
+
+# Test and development files
+coverage/
+**/.vscode-test/
+
+knip.json
+.husky/

+ 78 - 0
evals/Dockerfile

@@ -0,0 +1,78 @@
+FROM node:20-slim AS base
+ ENV PNPM_HOME="/pnpm"
+ ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack enable
+RUN npm install -g npm@latest
+RUN npm install -g npm-run-all
+# Install dependencies
+RUN apt update && apt install -y sudo curl git vim jq
+
+
+# Create a `vscode` user
+RUN useradd -m vscode -s /bin/bash && \
+  echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \
+  chmod 0440 /etc/sudoers.d/vscode
+# Install VS Code
+# https://code.visualstudio.com/docs/setup/linux
+RUN apt install -y wget gpg apt-transport-https
+RUN wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > packages.microsoft.gpg
+RUN install -D -o root -g root -m 644 packages.microsoft.gpg /etc/apt/keyrings/packages.microsoft.gpg
+RUN echo "deb [arch=amd64,arm64,armhf signed-by=/etc/apt/keyrings/packages.microsoft.gpg] https://packages.microsoft.com/repos/code stable main" | tee /etc/apt/sources.list.d/vscode.list > /dev/null
+RUN rm -f packages.microsoft.gpg
+RUN apt update && apt install -y code
+# Install Xvfb
+RUN apt install -y xvfb
+# [cpp] Install cmake (distro package; the version is whatever apt provides, not pinned)
+RUN apt install -y cmake
+# [go] Install Go (distro package golang-go; the version is whatever apt provides, not pinned)
+RUN apt install -y golang-go
+# [java] Install the distro's default Java runtime (default-jre; version not pinned)
+RUN apt install -y default-jre
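+# [python] Install Python 3 with venv, dev headers and pip (distro packages, version not pinned); uv is installed separately below via its install script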
+RUN apt install -y python3 python3-venv python3-dev python3-pip
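+# [rust] Install Rust via rustup (default toolchain; version not pinned). NOTE(review): this runs before `USER vscode`, so it presumably installs into the build user's $HOME rather than /home/vscode — confirm the vscode user can reach cargo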
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+RUN echo 'source $HOME/.cargo/env' >> $HOME/.bashrc
+ WORKDIR /home/vscode
+ USER vscode
+
+ # Clone the evals repository
+ RUN git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals
+
+ # Prepare evals
+ WORKDIR /home/vscode/evals/python
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ RUN /home/vscode/.local/bin/uv sync
+
+ WORKDIR /home/vscode/repo/benchmark
+
+ # Install dependencies
+ COPY --chown=vscode:vscode ./evals/package.json ./evals/pnpm-lock.yaml ./evals/pnpm-workspace.yaml ./evals/.npmrc ./
+ RUN mkdir -p apps/cli apps/web \
+   config/eslint config/typescript \
+   packages/db packages/ipc packages/lib packages/types
+ COPY --chown=vscode:vscode ./evals/apps/cli/package.json          ./apps/cli/
+ COPY --chown=vscode:vscode ./evals/apps/web/package.json          ./apps/web/
+ COPY --chown=vscode:vscode ./evals/config/eslint/package.json     ./config/eslint/
+ COPY --chown=vscode:vscode ./evals/config/typescript/package.json ./config/typescript/
+ COPY --chown=vscode:vscode ./evals/packages/db/package.json       ./packages/db/
+ COPY --chown=vscode:vscode ./evals/packages/ipc/package.json      ./packages/ipc/
+ COPY --chown=vscode:vscode ./evals/packages/lib/package.json      ./packages/lib/
+ COPY --chown=vscode:vscode ./evals/packages/types/package.json    ./packages/types/
+ RUN pnpm install
+
+ # Copy & install extension
+ COPY --chown=vscode:vscode ./bin/roo-code-latest.vsix ./
+ RUN code --debug --install-extension ./roo-code-latest.vsix
+
+ # Copy application code
+ COPY --chown=vscode:vscode ./evals ./
+
+ # Copy environment variables
+ COPY --chown=vscode:vscode ./evals/.env ./
+
+ # Push database schema
+ RUN pnpm --filter @evals/db db:push
+
+ EXPOSE 3000
+ CMD ["pnpm", "web"]

+ 46 - 17
evals/apps/cli/src/index.ts

@@ -194,12 +194,31 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 
 	console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)
 
-	await execa({
+	const controller = new AbortController()
+	const cancelSignal = controller.signal
+
+	// If debugging:
+	// Use --wait --log trace or --verbose.
+	let codeCommand = `code --disable-workspace-trust`
+	const isDocker = fs.existsSync("/.dockerenv")
+
+	if (isDocker) {
+		if (run.concurrency > 1) {
+			throw new Error("Cannot run multiple tasks in parallel in Docker. Please set concurrency to 1.")
+		}
+		codeCommand = `xvfb-run --auto-servernum --server-num=1 ${codeCommand} --wait --log trace --disable-gpu --password-store="basic"`
+	}
+
+	const subprocess = execa({
 		env: {
 			ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
 		},
 		shell: "/bin/bash",
-	})`code --disable-workspace-trust -n ${workspacePath}`
+		cancelSignal,
+	})`${codeCommand} -n ${workspacePath}`
+
+	// If debugging:
+	// subprocess.stdout.pipe(process.stdout)
 
 	// Give VSCode some time to spawn before connecting to its unix socket.
 	await new Promise((resolve) => setTimeout(resolve, 3_000))
@@ -309,23 +328,30 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 
 	console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)
 
-	client.sendMessage({
-		type: IpcMessageType.TaskCommand,
-		origin: IpcOrigin.Client,
-		clientId: client.clientId!,
-		data: {
-			commandName: TaskCommandName.StartNewTask,
+	if (client.isReady) {
+		client.sendMessage({
+			type: IpcMessageType.TaskCommand,
+			origin: IpcOrigin.Client,
+			clientId: client.clientId!,
 			data: {
-				configuration: {
-					...rooCodeDefaults,
-					openRouterApiKey: process.env.OPENROUTER_API_KEY!,
-					...run.settings,
+				commandName: TaskCommandName.StartNewTask,
+				data: {
+					configuration: {
+						...rooCodeDefaults,
+						openRouterApiKey: process.env.OPENROUTER_API_KEY!,
+						...run.settings,
+					},
+					text: prompt,
+					newTab: true,
 				},
-				text: prompt,
-				newTab: true,
 			},
-		},
-	})
+		})
+	} else {
+		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
+		client.disconnect()
+		taskFinishedAt = Date.now()
+		isClientDisconnected = true
+	}
 
 	try {
 		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
@@ -365,6 +391,9 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 		client.disconnect()
 	}
 
+	controller.abort()
+	await subprocess
+
 	return { success: !!taskFinishedAt }
 }
 
@@ -520,7 +549,7 @@ if (!fs.existsSync(extensionDevelopmentPath)) {
 
 if (!fs.existsSync(exercisesPath)) {
 	console.error(
-		`Exercises path does not exist. Please run "git clone https://github.com/cte/Roo-Code-Benchmark.git exercises".`,
+		`Exercises do not exist at ${exercisesPath}. Please run "git clone https://github.com/RooCodeInc/Roo-Code-Evals.git evals".`,
 	)
 	process.exit(1)
 }

+ 8 - 1
evals/package.json

@@ -10,7 +10,14 @@
 		"build": "turbo build --log-order grouped --output-logs new-only",
 		"web": "turbo dev --filter @evals/web",
 		"cli": "turbo dev --filter @evals/cli -- run",
-		"drizzle:studio": "pnpm --filter @evals/db db:studio"
+		"drizzle:studio": "pnpm --filter @evals/db db:studio",
+		"docker:build": "docker build -f Dockerfile -t roo-code-eval --progress=plain ..",
+		"docker:run": "touch /tmp/evals.db && docker run -d -it -p 3000:3000 -v /tmp/evals.db:/tmp/evals.db roo-code-eval",
+		"docker:start": "pnpm docker:build && pnpm docker:run",
+		"docker:shell": "docker exec -it $(docker ps --filter \"ancestor=roo-code-eval\" -q) /bin/bash",
+		"docker:stop": "docker stop $(docker ps --filter \"ancestor=roo-code-eval\" -q)",
+		"docker:rm": "docker rm $(docker ps -a --filter \"ancestor=roo-code-eval\" -q)",
+		"docker:clean": "pnpm docker:stop && pnpm docker:rm"
 	},
 	"devDependencies": {
 		"@dotenvx/dotenvx": "^1.41.0",