8 miesięcy temu · 99448fc913
--- a/packages/evals/README.md
+++ b/packages/evals/README.md
@@ -26,26 +26,26 @@ echo "OPENROUTER_API_KEY=sk-or-v1-[...]" > packages/evals/.env.local
 
				 Start the evals service:
			
 
				 
			
 
				 ```sh
			
 
				-docker compose -f packages/evals/docker-compose.yml --profile server --profile runner up --build --scale runner=0
			
 
				+pnpm evals
			
 
				 ```
			
 
				 
			
 
				-The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on [localhost:3000](http://localhost:3000/):
			
 
				-<img width="1182" alt="Screenshot 2025-06-05 at 12 05 38 PM" src="https://github.com/user-attachments/assets/34f25a59-1362-458c-aafa-25e13cdb2a7a" />
			
 
				+The initial build process can take a minute or two. Upon success you should see output indicating that a web service is running on localhost:3000:
			
 
				+<img width="1182" src="https://github.com/user-attachments/assets/34f25a59-1362-458c-aafa-25e13cdb2a7a" />
			
 
				 
			
 
				 Additionally, you'll find in Docker Desktop that the database and Redis services are running:
			
 
				-<img width="1283" alt="Screenshot 2025-06-05 at 12 07 09 PM" src="https://github.com/user-attachments/assets/ad75d791-9cc7-41e3-8168-df7b21b49da2" />
			
 
				+<img width="1283" src="https://github.com/user-attachments/assets/ad75d791-9cc7-41e3-8168-df7b21b49da2" />
			
 
				 
			
 
				 Navigate to [localhost:3446](http://localhost:3446/) in your browser and click the 🚀 button.
			
 
				 
			
 
				 By default an evals run will run all programming exercises in the [Roo Code Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repository with the Claude Sonnet 4 model and default settings. For basic configuration you can specify the LLM to use and any subset of the exercises you'd like. For advanced configuration you can import a Roo Code settings file which will allow you to run the evals with Roo Code configured any way you'd like (this includes custom modes, a footgun prompt, etc).
			
 
				 
			
 
				-<img width="1053" alt="Screenshot 2025-06-05 at 12 08 06 PM" src="https://github.com/user-attachments/assets/2367eef4-6ae9-4ac2-8ee4-80f981046486" />
			
 
				+<img width="1053" src="https://github.com/user-attachments/assets/2367eef4-6ae9-4ac2-8ee4-80f981046486" />
			
 
				 
			
 
				 After clicking "Launch" you should find that a "controller" container has spawned as well as `N` "task" containers where `N` is the value you chose for concurrency:
			
 
				-<img width="1283" alt="Screenshot 2025-06-05 at 12 13 29 PM" src="https://github.com/user-attachments/assets/024413e2-c886-4272-ab59-909b4b114e7c" />
			
 
				+<img width="1283" src="https://github.com/user-attachments/assets/024413e2-c886-4272-ab59-909b4b114e7c" />
			
 
				 
			
 
				 The web app's UI should update in realtime with the results of the eval run:
			
 
				-<img width="1053" alt="Screenshot 2025-06-05 at 12 14 52 PM" src="https://github.com/user-attachments/assets/6fe3b651-0898-4f14-a231-3cc8d66f0e1f" />
			
 
				+<img width="1053" src="https://github.com/user-attachments/assets/6fe3b651-0898-4f14-a231-3cc8d66f0e1f" />
			
 
				 
			
 
				 ## Resource Usage
			
 
				 
			
@@ -60,7 +60,7 @@ CPU Limit = 2 * concurrency
 
				 
			
 
				 The memory and CPU limits can be set from the "Resources" section of the Docker Desktop settings:
			
 
				 
			
 
				-<img width="996" alt="Screenshot 2025-06-06 at 8 54 24 AM" src="https://github.com/user-attachments/assets/a1cbb27d-b09c-450c-9fa8-b662c0537d48" />
			
 
				+<img width="996" src="https://github.com/user-attachments/assets/a1cbb27d-b09c-450c-9fa8-b662c0537d48" />
			
 
				 
			
 
				 ## Stopping
			
 
				 
			
--- a/packages/evals/docker-compose.yml
+++ b/packages/evals/docker-compose.yml
@@ -17,8 +17,6 @@ services:
 
				     db:
			
 
				         container_name: evals-db
			
 
				         image: postgres:15.4
			
 
				-        # expose:
			
 
				-        #     - 5432
			
 
				         ports:
			
 
				             - "${EVALS_DB_PORT:-5432}:5432"
			
 
				         volumes:
			
@@ -40,8 +38,6 @@ services:
 
				     redis:
			
 
				         container_name: evals-redis
			
 
				         image: redis:7-alpine
			
 
				-        # expose:
			
 
				-        #     - 6379
			
 
				         ports:
			
 
				             - "${EVALS_REDIS_PORT:-6379}:6379"
			
 
				         volumes:
			
--- a/packages/evals/src/cli/runEvals.ts
+++ b/packages/evals/src/cli/runEvals.ts
@@ -20,16 +20,16 @@ export const runEvals = async (runId: number) => {
 
				 		throw new Error(`Run ${run.id} has no tasks.`)
			
 
				 	}
			
 
				 
			
 
				+	const containerized = isDockerContainer()
			
 
				+
			
 
				 	const logger = new Logger({
			
 
				-		logDir: `/var/log/evals/runs/${run.id}`,
			
 
				+		logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
			
 
				 		filename: `controller.log`,
			
 
				 		tag: getTag("runEvals", { run }),
			
 
				 	})
			
 
				 
			
 
				 	logger.info(`running ${tasks.length} task(s)`)
			
 
				 
			
 
				-	const containerized = isDockerContainer()
			
 
				-
			
 
				 	if (!containerized) {
			
 
				 		await resetEvalsRepo({ run, cwd: EVALS_REPO_PATH })
			
 
				 	}
			
--- a/packages/evals/src/cli/runTask.ts
+++ b/packages/evals/src/cli/runTask.ts
@@ -44,10 +44,12 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?:
 
				 	const run = await findRun(task.runId)
			
 
				 	await registerRunner({ runId: run.id, taskId })
			
 
				 
			
 
				+	const containerized = isDockerContainer()
			
 
				+
			
 
				 	logger =
			
 
				 		logger ||
			
 
				 		new Logger({
			
 
				-			logDir: `/var/log/evals/runs/${run.id}`,
			
 
				+			logDir: containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}`,
			
 
				 			filename: `${language}-${exercise}.log`,
			
 
				 			tag: getTag("runTask", { run, task }),
			
 
				 		})
			
@@ -298,7 +300,6 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
 
				 				...run.settings, // Allow the provided settings to override `openRouterApiKey`.
			
 
				 			},
			
 
				 			text: prompt,
			
 
				-			newTab: true,
			
 
				 		},
			
 
				 	})
			
 
				 
			
--- a/packages/types/src/global-settings.ts
+++ b/packages/types/src/global-settings.ts
@@ -177,7 +177,7 @@ export const EVALS_SETTINGS: RooCodeSettings = {
 
				 	apiProvider: "openrouter",
			
 
				 	openRouterUseMiddleOutTransform: false,
			
 
				 
			
 
				-	lastShownAnnouncementId: "may-29-2025-3-19",
			
 
				+	lastShownAnnouncementId: "jul-09-2025-3-23-0",
			
 
				 
			
 
				 	pinnedApiConfigs: {},