Przeglądaj źródła

Minor fixes for local (non-Docker) evals (#5604)

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
Chris Estreich 8 miesięcy temu
rodzic
commit
99448fc913

+ 8 - 8
packages/evals/README.md

@@ -26,26 +26,26 @@ echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local
 Start the evals service:
 
 ```sh
-docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0
+pnpm evals
 ```
 
-The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on [localhost:3000](http://localhost:3000/):
-<img width="1182" alt="Screenshot 2025-06-05 at 12 05 38 PM" src="https://github.com/user-attachments/assets/34f25a59-1362-458c-aafa-25e13cdb2a7a" />
+The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on localhost:3000:
+<img width="1182" src="https://github.com/user-attachments/assets/34f25a59-1362-458c-aafa-25e13cdb2a7a" />
 
 Additionally, you'll find in Docker Desktop that database and redis services are running:
-<img width="1283" alt="Screenshot 2025-06-05 at 12 07 09 PM" src="https://github.com/user-attachments/assets/ad75d791-9cc7-41e3-8168-df7b21b49da2" />
+<img width="1283" src="https://github.com/user-attachments/assets/ad75d791-9cc7-41e3-8168-df7b21b49da2" />
 
 Navigate to [localhost:3446](http://localhost:3446/) in your browser and click the 🚀 button.
 
 By default an evals run will run all programming exercises in the [Roo Code Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repository with the Claude Sonnet 4 model and default settings. For basic configuration you can specify the LLM to use and any subset of the exercises you'd like. For advanced configuration you can import a Roo Code settings file which will allow you to run the evals with Roo Code configured any way you'd like (this includes custom modes, a footgun prompt, etc.).
 
-<img width="1053" alt="Screenshot 2025-06-05 at 12 08 06 PM" src="https://github.com/user-attachments/assets/2367eef4-6ae9-4ac2-8ee4-80f981046486" />
+<img width="1053" src="https://github.com/user-attachments/assets/2367eef4-6ae9-4ac2-8ee4-80f981046486" />
 
 After clicking "Launch" you should find that a "controller" container has spawned as well as `N` "task" containers where `N` is the value you chose for concurrency:
-<img width="1283" alt="Screenshot 2025-06-05 at 12 13 29 PM" src="https://github.com/user-attachments/assets/024413e2-c886-4272-ab59-909b4b114e7c" />
+<img width="1283" src="https://github.com/user-attachments/assets/024413e2-c886-4272-ab59-909b4b114e7c" />
 
 The web app's UI should update in real time with the results of the eval run:
-<img width="1053" alt="Screenshot 2025-06-05 at 12 14 52 PM" src="https://github.com/user-attachments/assets/6fe3b651-0898-4f14-a231-3cc8d66f0e1f" />
+<img width="1053" src="https://github.com/user-attachments/assets/6fe3b651-0898-4f14-a231-3cc8d66f0e1f" />
 
 ## Resource Usage
 
@@ -60,7 +60,7 @@ CPU Limit = 2 * concurrency
 
 The memory and CPU limits can be set from the "Resources" section of the Docker Desktop settings:
 
-<img width="996" alt="Screenshot 2025-06-06 at 8 54 24 AM" src="https://github.com/user-attachments/assets/a1cbb27d-b09c-450c-9fa8-b662c0537d48" />
+<img width="996" src="https://github.com/user-attachments/assets/a1cbb27d-b09c-450c-9fa8-b662c0537d48" />
 
 ## Stopping
 

+ 0 - 4
packages/evals/docker-compose.yml

@@ -17,8 +17,6 @@ services:
     db:
         container_name: evals-db
         image: postgres:15.4
-        # expose:
-        #     - 5432
         ports:
             - "${EVALS_DB_PORT:-5432}:5432"
         volumes:
@@ -40,8 +38,6 @@ services:
     redis:
         container_name: evals-redis
         image: redis:7-alpine
-        # expose:
-        #     - 6379
         ports:
             - "${EVALS_REDIS_PORT:-6379}:6379"
         volumes:

+ 3 - 3
packages/evals/src/cli/runEvals.ts

@@ -20,16 +20,16 @@ export const runEvals = async (runId: number) => {
 		throw new Error(`Run ${run.id} has no tasks.`)
 	}
 
+	const containerized = isDockerContainer()
+
 	const logger = new Logger({
-		logDir: `/var/log/evals/runs/${run.id}`,
+		logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
 		filename: `controller.log`,
 		tag: getTag("runEvals", { run }),
 	})
 
 	logger.info(`running ${tasks.length} task(s)`)
 
-	const containerized = isDockerContainer()
-
 	if (!containerized) {
 		await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH })
 	}

+ 3 - 2
packages/evals/src/cli/runTask.ts

@@ -44,10 +44,12 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?:
 	const run = await findRun(task.runId)
 	await registerRunner({ runId: run.id, taskId })
 
+	const containerized = isDockerContainer()
+
 	logger =
 		logger ||
 		new Logger({
-			logDir: `/var/log/evals/runs/${run.id}`,
+			logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
 			filename: `${language}-${exercise}.log`,
 			tag: getTag("runTask", { run, task }),
 		})
@@ -298,7 +300,6 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 				...run.settings, // Allow the provided settings to override `openRouterApiKey`.
 			},
 			text: prompt,
-			newTab: true,
 		},
 	})
 

+ 1 - 1
packages/types/src/global-settings.ts

@@ -177,7 +177,7 @@ export const EVALS_SETTINGS: RooCodeSettings = {
 	apiProvider: "openrouter",
 	openRouterUseMiddleOutTransform: false,
 
-	lastShownAnnouncementId: "may-29-2025-3-19",
+	lastShownAnnouncementId: "jul-09-2025-3-23-0",
 
 	pinnedApiConfigs: {},