
Allow users to toggle Gemini caching on / off for OpenRouter (#2927)

Chris Estreich, 8 months ago
Parent
Commit
0dfbae64f3

+ 5 - 0
.changeset/thin-tigers-yawn.md

@@ -0,0 +1,5 @@
+---
+"roo-cline": patch
+---
+
+Allow users to turn prompt caching on / off for Gemini 2.5 on OpenRouter

+ 1 - 8
src/api/providers/__tests__/gemini.test.ts

@@ -74,14 +74,7 @@ describe("GeminiHandler", () => {
 			expect(chunks.length).toBe(3)
 			expect(chunks[0]).toEqual({ type: "text", text: "Hello" })
 			expect(chunks[1]).toEqual({ type: "text", text: " world!" })
-			expect(chunks[2]).toEqual({
-				type: "usage",
-				inputTokens: 10,
-				outputTokens: 5,
-				cacheReadTokens: undefined,
-				cacheWriteTokens: undefined,
-				thinkingTokens: undefined,
-			})
+			expect(chunks[2]).toEqual({ type: "usage", inputTokens: 10, outputTokens: 5 })
 
 			// Verify the call to generateContentStream
 			expect(handler["client"].models.generateContentStream).toHaveBeenCalledWith(

+ 6 - 2
src/api/providers/__tests__/openrouter.test.ts

@@ -54,10 +54,14 @@ describe("OpenRouterHandler", () => {
 				id: mockOptions.openRouterModelId,
 				info: mockOptions.openRouterModelInfo,
 				maxTokens: 1000,
-				reasoning: undefined,
-				temperature: 0,
 				thinking: undefined,
+				temperature: 0,
+				reasoningEffort: undefined,
 				topP: undefined,
+				promptCache: {
+					supported: false,
+					optional: false,
+				},
 			})
 		})
 

File diff suppressed because it is too large
+ 0 - 0
src/api/providers/fetchers/__tests__/fixtures/openrouter-models.json


+ 26 - 1
src/api/providers/fetchers/__tests__/openrouter.test.ts

@@ -9,7 +9,7 @@ import { PROMPT_CACHING_MODELS } from "../../../../shared/api"
 import { getOpenRouterModels } from "../openrouter"
 
 nockBack.fixtures = path.join(__dirname, "fixtures")
-nockBack.setMode("dryrun")
+nockBack.setMode("lockdown")
 
 describe("OpenRouter API", () => {
 	describe("getOpenRouterModels", () => {
@@ -66,6 +66,31 @@ describe("OpenRouter API", () => {
 				supportsComputerUse: true,
 			})
 
+			expect(
+				Object.entries(models)
+					.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
+					.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
+					.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
+			).toEqual([
+				{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-opus", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-opus:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-sonnet", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-sonnet:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3.5-haiku", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku-20241022", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku-20241022:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet-20240620", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet-20240620:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet:thinking", maxTokens: 128000 },
+			])
+
 			nockDone()
 		})
 	})
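
A note on the `nockBack` mode switch above: `"dryrun"` plays back recorded fixtures but still lets unrecorded requests reach the network, while `"lockdown"` replays fixtures only and fails on anything unrecorded, so this test is fully pinned to the committed `openrouter-models.json` fixture. A minimal sketch of the same wiring (the helper name and the axios call are illustrative, not copied from the repo):

```ts
import path from "path"
import axios from "axios"
import { back as nockBack } from "nock"

// "lockdown" replays the committed fixture and rejects any request that is
// not recorded in it, so the test never hits the real OpenRouter API.
nockBack.fixtures = path.join(__dirname, "fixtures")
nockBack.setMode("lockdown")

// Illustrative helper: load the model list from the recorded fixture.
async function loadRecordedModels() {
	const { nockDone } = await nockBack("openrouter-models.json")
	const { data } = await axios.get("https://openrouter.ai/api/v1/models")
	nockDone()
	return data.data
}
```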

+ 28 - 26
src/api/providers/fetchers/openrouter.ts

@@ -1,7 +1,13 @@
 import axios from "axios"
 import { z } from "zod"
 
-import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
+import {
+	ApiHandlerOptions,
+	ModelInfo,
+	anthropicModels,
+	COMPUTER_USE_MODELS,
+	OPTIONAL_PROMPT_CACHING_MODELS,
+} from "../../../shared/api"
 import { parseApiPrice } from "../../../utils/cost"
 
 // https://openrouter.ai/api/v1/models
@@ -62,8 +68,8 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 				? parseApiPrice(rawModel.pricing?.input_cache_read)
 				: undefined
 
-			// Disable prompt caching for Gemini models for now.
-			const supportsPromptCache = !!cacheWritesPrice && !!cacheReadsPrice && !rawModel.id.startsWith("google")
+			const supportsPromptCache =
+				typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"
 
 			const modelInfo: ModelInfo = {
 				maxTokens: rawModel.top_provider?.max_completion_tokens,
@@ -78,29 +84,25 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 				thinking: rawModel.id === "anthropic/claude-3.7-sonnet:thinking",
 			}
 
-			// Then OpenRouter model definition doesn't give us any hints about computer use,
-			// so we need to set that manually.
-			// The ideal `maxTokens` values are model dependent, but we should probably DRY
-			// this up and use the values defined for the Anthropic providers.
-			switch (true) {
-				case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
-					modelInfo.supportsComputerUse = true
-					modelInfo.maxTokens = rawModel.id === "anthropic/claude-3.7-sonnet:thinking" ? 128_000 : 8192
-					break
-				case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
-					modelInfo.maxTokens = 8192
-					break
-				case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
-					modelInfo.supportsComputerUse = true
-					modelInfo.maxTokens = 8192
-					break
-				case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
-				case rawModel.id.startsWith("anthropic/claude-3-opus"):
-				case rawModel.id.startsWith("anthropic/claude-3-haiku"):
-					modelInfo.maxTokens = 8192
-					break
-				default:
-					break
+			// The OpenRouter model definition doesn't give us any hints about
+			// computer use, so we need to set that manually.
+			if (COMPUTER_USE_MODELS.has(rawModel.id)) {
+				modelInfo.supportsComputerUse = true
+			}
+
+			// We want to treat prompt caching as "experimental" for these models.
+			if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
+				modelInfo.isPromptCacheOptional = true
+			}
+
+			// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
+			// values can be configured. For the non-thinking variant we want to
+			// use 8k. The `thinking` variant can be run in 64k and 128k modes,
+			// and we want to use 128k.
+			if (rawModel.id.startsWith("anthropic/claude-3.7-sonnet")) {
+				modelInfo.maxTokens = rawModel.id.includes("thinking")
+					? anthropicModels["claude-3-7-sonnet-20250219:thinking"].maxTokens
+					: anthropicModels["claude-3-7-sonnet-20250219"].maxTokens
 			}
 
 			models[rawModel.id] = modelInfo
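
The practical effect of the fetcher changes above, for one of the newly cacheable Gemini models: caching support is now inferred purely from the presence of cache pricing (the blanket exclusion of `google/*` ids is gone), and the model is additionally flagged as optional so caching stays off unless the user enables it. A rough sketch with illustrative prices (the real numbers come from OpenRouter's `/models` response):

```ts
import { OPTIONAL_PROMPT_CACHING_MODELS } from "../../../shared/api"

// Illustrative prices only, already parsed from the raw API strings.
const cacheWritesPrice: number | undefined = 0.3125
const cacheReadsPrice: number | undefined = 0.025

const modelInfo = {
	// Supported whenever both cache prices are present in the pricing data.
	supportsPromptCache: cacheWritesPrice !== undefined && cacheReadsPrice !== undefined,
	cacheWritesPrice,
	cacheReadsPrice,
	// "Experimental" caching: listed models default to off and are toggleable in settings.
	isPromptCacheOptional: OPTIONAL_PROMPT_CACHING_MODELS.has("google/gemini-2.5-pro-preview-03-25"),
}
```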

+ 37 - 27
src/api/providers/gemini.ts

@@ -54,7 +54,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 
 		let uncachedContent: Content[] | undefined = undefined
 		let cachedContent: string | undefined = undefined
-		let cacheWriteTokens: number | undefined = undefined
 
 		// The minimum input token count for context caching is 4,096.
 		// For a basic approximation we assume 4 characters per token.
@@ -67,6 +66,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			cacheKey &&
 			contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM
 
+		let cacheWrite = false
+
 		if (isCacheAvailable) {
 			const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)
 
@@ -97,9 +98,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 
 						if (name) {
 							this.contentCaches.set<CacheEntry>(cacheKey, { key: name, count: contents.length })
-							cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0
 							console.log(
-								`[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`,
+								`[GeminiHandler] cached ${contents.length} messages (${usageMetadata?.totalTokenCount ?? "-"} tokens) in ${Date.now() - timestamp}ms`,
 							)
 						}
 					})
@@ -109,6 +109,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 					.finally(() => {
 						this.isCacheBusy = false
 					})
+
+				cacheWrite = true
 			}
 		}
 
@@ -146,19 +148,10 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		if (lastUsageMetadata) {
 			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
 			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
+			const cacheWriteTokens = cacheWrite ? inputTokens : undefined
 			const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
 			const reasoningTokens = lastUsageMetadata.thoughtsTokenCount
 
-			const totalCost = isCacheUsed
-				? this.calculateCost({
-						info,
-						inputTokens,
-						outputTokens,
-						cacheWriteTokens,
-						cacheReadTokens,
-					})
-				: undefined
-
 			yield {
 				type: "usage",
 				inputTokens,
@@ -166,7 +159,13 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 				cacheWriteTokens,
 				cacheReadTokens,
 				reasoningTokens,
-				totalCost,
+				totalCost: this.calculateCost({
+					info,
+					inputTokens,
+					outputTokens,
+					cacheWriteTokens,
+					cacheReadTokens,
+				}),
 			}
 		}
 	}
@@ -250,8 +249,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		info,
 		inputTokens,
 		outputTokens,
-		cacheWriteTokens,
-		cacheReadTokens,
+		cacheWriteTokens = 0,
+		cacheReadTokens = 0,
 	}: {
 		info: ModelInfo
 		inputTokens: number
@@ -281,21 +280,32 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			}
 		}
 
-		let inputTokensCost = inputPrice * (inputTokens / 1_000_000)
-		let outputTokensCost = outputPrice * (outputTokens / 1_000_000)
-		let cacheWriteCost = 0
-		let cacheReadCost = 0
+		// Subtract the cached input tokens from the total input tokens.
+		const uncachedInputTokens = inputTokens - cacheReadTokens
 
-		if (cacheWriteTokens) {
-			cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
+		let cacheWriteCost =
+			cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
+		let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0
+
+		const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
+		const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
+		const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+
+		const trace: Record<string, { price: number; tokens: number; cost: number }> = {
+			input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
+			output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
 		}
 
-		if (cacheReadTokens) {
-			const uncachedReadTokens = inputTokens - cacheReadTokens
-			cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000)
-			inputTokensCost = inputPrice * (uncachedReadTokens / 1_000_000)
+		if (cacheWriteTokens > 0) {
+			trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
 		}
 
-		return inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+		if (cacheReadTokens > 0) {
+			trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
+		}
+
+		// console.log(`[GeminiHandler] calculateCost -> ${totalCost}`, trace)
+
+		return totalCost
 	}
 }
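
A rough worked example of the reworked cost math, using made-up prices and token counts (Gemini's real per-million prices live on `ModelInfo`): cached input tokens are billed at the cache-read rate and subtracted from the regular input line item, and a cache write is billed for the full prompt, pro-rated by the cache TTL (`CACHE_TTL` appears to be in minutes, given the division by 60).

```ts
// Hypothetical numbers chosen only to illustrate the formula.
const inputPrice = 1.25 // USD per 1M input tokens
const outputPrice = 10 // USD per 1M output tokens
const cacheWritesPrice = 1.625 // USD per 1M tokens per hour of cache lifetime
const cacheReadsPrice = 0.31 // USD per 1M cached tokens read
const CACHE_TTL = 5 // minutes

const inputTokens = 12_000
const outputTokens = 1_000
const cacheReadTokens = 8_000 // served from the context cache
const cacheWriteTokens = 12_000 // full prompt, since cacheWrite was true this turn

// Cached tokens are billed at the cache-read rate, so only the uncached
// remainder is charged at the normal input price.
const uncachedInputTokens = inputTokens - cacheReadTokens // 4_000

const inputCost = inputPrice * (uncachedInputTokens / 1_000_000) // 0.005
const outputCost = outputPrice * (outputTokens / 1_000_000) // 0.01
const cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) // 0.001625
const cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000) // 0.00248
const totalCost = inputCost + outputCost + cacheWriteCost + cacheReadCost // ≈ 0.0191
```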

+ 10 - 2
src/api/providers/openrouter.ts

@@ -7,6 +7,7 @@ import {
 	openRouterDefaultModelId,
 	openRouterDefaultModelInfo,
 	PROMPT_CACHING_MODELS,
+	OPTIONAL_PROMPT_CACHING_MODELS,
 } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStreamChunk } from "../transform/stream"
@@ -65,7 +66,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
 	): AsyncGenerator<ApiStreamChunk> {
-		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, info } = this.getModel()
+		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, promptCache } = this.getModel()
 
 		// Convert Anthropic messages to OpenAI format.
 		let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -78,11 +79,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
 		}
 
+		const isCacheAvailable = promptCache.supported && (!promptCache.optional || this.options.promptCachingEnabled)
+
 		// Prompt caching: https://openrouter.ai/docs/prompt-caching
 		// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
 		// Note that we don't check the `ModelInfo` object because it is cached
 		// in the settings for OpenRouter and the value could be stale.
-		if (PROMPT_CACHING_MODELS.has(modelId)) {
+		if (isCacheAvailable) {
 			openAiMessages[0] = {
 				role: "system",
 				// @ts-ignore-next-line
@@ -193,8 +196,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		return {
 			id,
 			info,
+			// maxTokens, thinking, temperature, reasoningEffort
 			...getModelParams({ options: this.options, model: info, defaultTemperature }),
 			topP,
+			promptCache: {
+				supported: PROMPT_CACHING_MODELS.has(id),
+				optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
+			},
 		}
 	}
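
The toggle semantics introduced above, restated as a small sketch (the helper name is illustrative; `promptCachingEnabled` is the user-facing setting): caching is always applied for models where it is supported and not optional, applied only when the user opts in for the optional Gemini models, and never applied otherwise.

```ts
interface PromptCacheInfo {
	supported: boolean
	optional: boolean
}

// Mirrors the isCacheAvailable check in createMessage():
//   supported && !optional -> always cache (e.g. the Claude models)
//   supported && optional  -> cache only if the user enabled the setting (Gemini)
//   !supported             -> never cache
function shouldUseCache(promptCache: PromptCacheInfo, promptCachingEnabled?: boolean): boolean {
	return promptCache.supported && (!promptCache.optional || !!promptCachingEnabled)
}
```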
 

+ 24 - 3
src/shared/api.ts

@@ -1401,8 +1401,10 @@ export const vscodeLlmModels = {
  * Constants
  */
 
+// These models support reasoning efforts.
 export const REASONING_MODELS = new Set(["x-ai/grok-3-mini-beta", "grok-3-mini-beta", "grok-3-mini-fast-beta"])
 
+// These models support prompt caching.
 export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3-haiku",
 	"anthropic/claude-3-haiku:beta",
@@ -1421,7 +1423,26 @@ export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3.7-sonnet",
 	"anthropic/claude-3.7-sonnet:beta",
 	"anthropic/claude-3.7-sonnet:thinking",
-	// "google/gemini-2.0-flash-001",
-	// "google/gemini-flash-1.5",
-	// "google/gemini-flash-1.5-8b",
+	"google/gemini-2.5-pro-preview-03-25",
+	"google/gemini-2.0-flash-001",
+	"google/gemini-flash-1.5",
+	"google/gemini-flash-1.5-8b",
+])
+
+// These models don't have prompt caching enabled by default (you can turn it on
+// in settings).
+export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
+	"google/gemini-2.5-pro-preview-03-25",
+	"google/gemini-2.0-flash-001",
+	"google/gemini-flash-1.5",
+	"google/gemini-flash-1.5-8b",
+])
+
+// https://www.anthropic.com/news/3-5-models-and-computer-use
+export const COMPUTER_USE_MODELS = new Set([
+	"anthropic/claude-3.5-sonnet",
+	"anthropic/claude-3.5-sonnet:beta",
+	"anthropic/claude-3.7-sonnet",
+	"anthropic/claude-3.7-sonnet:beta",
+	"anthropic/claude-3.7-sonnet:thinking",
 ])
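
The settings-side consumer is not part of the hunks shown here, but a plausible use of the two sets is to surface the caching toggle only for models where caching is supported yet not always-on. A hypothetical helper:

```ts
import { PROMPT_CACHING_MODELS, OPTIONAL_PROMPT_CACHING_MODELS } from "./api"

// Hypothetical: show an "Enable prompt caching" toggle only for models where
// caching is supported but opt-in (currently the Gemini entries listed above).
export function shouldShowPromptCachingToggle(modelId: string): boolean {
	return PROMPT_CACHING_MODELS.has(modelId) && OPTIONAL_PROMPT_CACHING_MODELS.has(modelId)
}
```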

Some files were not shown because too many files changed in this diff