
Gemini implicit caching (#3515)

Chris Estreich 7 months ago
parent
commit
20fab97b4d

+ 0 - 3
evals/packages/types/src/roo-code.ts

@@ -102,7 +102,6 @@ export const modelInfoSchema = z.object({
 	supportsImages: z.boolean().optional(),
 	supportsComputerUse: z.boolean().optional(),
 	supportsPromptCache: z.boolean(),
-	isPromptCacheOptional: z.boolean().optional(),
 	inputPrice: z.number().optional(),
 	outputPrice: z.number().optional(),
 	cacheWritesPrice: z.number().optional(),
@@ -336,7 +335,6 @@ export type ProviderSettingsEntry = z.infer<typeof providerSettingsEntrySchema>
 const genericProviderSettingsSchema = z.object({
 	includeMaxTokens: z.boolean().optional(),
 	reasoningEffort: reasoningEffortsSchema.optional(),
-	promptCachingDisabled: z.boolean().optional(),
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
 	modelTemperature: z.number().nullish(),
@@ -699,7 +697,6 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	// Generic
 	includeMaxTokens: undefined,
 	reasoningEffort: undefined,
-	promptCachingDisabled: undefined,
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
 	modelTemperature: undefined,

+ 1 - 1
src/api/index.ts

@@ -32,7 +32,7 @@ export interface SingleCompletionHandler {
 }
 
 export interface ApiHandler {
-	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[], cacheKey?: string): ApiStream
+	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream
 
 	getModel(): { id: string; info: ModelInfo }
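
	For reference, a minimal sketch of a call site against the simplified two-argument interface; `handler`, `systemPrompt`, and `messages` are placeholders, and the chunk shapes follow the usage events exercised in the tests below:

	// Hypothetical call site: the cacheKey argument is gone, so callers just
	// stream text and read cache metrics from the usage chunk.
	const stream = handler.createMessage(systemPrompt, messages)

	for await (const chunk of stream) {
		if (chunk.type === "text") {
			process.stdout.write(chunk.text)
		} else if (chunk.type === "usage") {
			// With implicit caching, the provider reports cacheReadTokens itself.
			console.log(chunk.inputTokens, chunk.outputTokens, chunk.cacheReadTokens)
		}
	}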
 

+ 1 - 382
src/api/providers/__tests__/gemini.test.ts

@@ -219,7 +219,7 @@ describe("GeminiHandler", () => {
 				mockInfo.cacheWritesPrice! * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
 			const expectedCost = expectedInputCost + expectedOutputCost + expectedCacheWriteCost
 
-			const cost = handler.calculateCost({ info: mockInfo, inputTokens, outputTokens, cacheWriteTokens })
+			const cost = handler.calculateCost({ info: mockInfo, inputTokens, outputTokens })
 			expect(cost).toBeCloseTo(expectedCost)
 		})
 
@@ -247,384 +247,3 @@ describe("GeminiHandler", () => {
 		})
 	})
 })
-
-describe("Caching Logic", () => {
-	const systemPrompt = "System prompt"
-	const longContent = "a".repeat(5 * 4096) // Ensure content is long enough for caching
-	const mockMessagesLong: Anthropic.Messages.MessageParam[] = [
-		{ role: "user", content: longContent },
-		{ role: "assistant", content: "OK" },
-	]
-	const cacheKey = "test-cache-key"
-	const mockCacheName = "generated/caches/mock-cache-name"
-	const mockCacheTokens = 5000
-
-	let handlerWithCache: GeminiHandler
-	let mockGenerateContentStream: jest.Mock
-	let mockCreateCache: jest.Mock
-	let mockDeleteCache: jest.Mock
-	let mockCacheGet: jest.Mock
-	let mockCacheSet: jest.Mock
-
-	beforeEach(() => {
-		mockGenerateContentStream = jest.fn().mockResolvedValue({
-			[Symbol.asyncIterator]: async function* () {
-				yield { text: "Response" }
-				yield {
-					usageMetadata: {
-						promptTokenCount: 100, // Uncached input
-						candidatesTokenCount: 50, // Output
-						cachedContentTokenCount: 0, // Default, override in tests
-					},
-				}
-			},
-		})
-		mockCreateCache = jest.fn().mockResolvedValue({
-			name: mockCacheName,
-			usageMetadata: { totalTokenCount: mockCacheTokens },
-		})
-		mockDeleteCache = jest.fn().mockResolvedValue({})
-		mockCacheGet = jest.fn().mockReturnValue(undefined) // Default: cache miss
-		mockCacheSet = jest.fn()
-
-		handlerWithCache = new GeminiHandler({
-			apiKey: "test-key",
-			apiModelId: "gemini-1.5-flash-latest", // Use a model that supports caching
-			geminiApiKey: "test-key",
-			promptCachingDisabled: false,
-		})
-
-		handlerWithCache["client"] = {
-			models: {
-				generateContentStream: mockGenerateContentStream,
-			},
-			caches: {
-				create: mockCreateCache,
-				delete: mockDeleteCache,
-			},
-		} as any
-		handlerWithCache["contentCaches"] = {
-			get: mockCacheGet,
-			set: mockCacheSet,
-		} as any
-	})
-
-	it("should not use cache if promptCachingDisabled is true", async () => {
-		handlerWithCache["options"].promptCachingDisabled = true
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-
-		for await (const _ of stream) {
-		}
-
-		expect(mockCacheGet).not.toHaveBeenCalled()
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				config: expect.objectContaining({
-					cachedContent: undefined,
-					systemInstruction: systemPrompt,
-				}),
-			}),
-		)
-		expect(mockCreateCache).not.toHaveBeenCalled()
-	})
-
-	it("should not use cache if content length is below threshold", async () => {
-		const shortMessages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "short" }]
-		const stream = handlerWithCache.createMessage(systemPrompt, shortMessages, cacheKey)
-		for await (const _ of stream) {
-			/* consume stream */
-		}
-
-		expect(mockCacheGet).not.toHaveBeenCalled() // Doesn't even check cache if too short
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				config: expect.objectContaining({
-					cachedContent: undefined,
-					systemInstruction: systemPrompt,
-				}),
-			}),
-		)
-		expect(mockCreateCache).not.toHaveBeenCalled()
-	})
-
-	it("should perform cache write on miss when conditions met", async () => {
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-		const chunks = []
-
-		for await (const chunk of stream) {
-			chunks.push(chunk)
-		}
-
-		expect(mockCacheGet).toHaveBeenCalledWith(cacheKey)
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				config: expect.objectContaining({
-					cachedContent: undefined,
-					systemInstruction: systemPrompt,
-				}),
-			}),
-		)
-
-		await new Promise(process.nextTick) // Allow microtasks (like the async writeCache) to run
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1)
-		expect(mockCreateCache).toHaveBeenCalledWith(
-			expect.objectContaining({
-				model: expect.stringContaining("gemini-2.0-flash-001"), // Adjusted expectation based on test run
-				config: expect.objectContaining({
-					systemInstruction: systemPrompt,
-					contents: expect.any(Array), // Verify contents structure if needed
-					ttl: expect.stringContaining("300s"),
-				}),
-			}),
-		)
-		expect(mockCacheSet).toHaveBeenCalledWith(
-			cacheKey,
-			expect.objectContaining({
-				key: mockCacheName,
-				count: mockMessagesLong.length,
-				tokens: mockCacheTokens,
-			}),
-		)
-		expect(mockDeleteCache).not.toHaveBeenCalled() // No previous cache to delete
-
-		const usageChunk = chunks.find((c) => c.type === "usage")
-
-		expect(usageChunk).toEqual(
-			expect.objectContaining({
-				cacheWriteTokens: 100, // Should match promptTokenCount when write is queued
-				cacheReadTokens: 0,
-			}),
-		)
-	})
-
-	it("should use cache on hit and not send system prompt", async () => {
-		const cachedMessagesCount = 1
-		const cacheReadTokensCount = 4000
-		mockCacheGet.mockReturnValue({ key: mockCacheName, count: cachedMessagesCount, tokens: cacheReadTokensCount })
-
-		mockGenerateContentStream.mockResolvedValue({
-			[Symbol.asyncIterator]: async function* () {
-				yield { text: "Response" }
-				yield {
-					usageMetadata: {
-						promptTokenCount: 10, // Uncached input tokens
-						candidatesTokenCount: 50,
-						cachedContentTokenCount: cacheReadTokensCount, // Simulate cache hit reporting
-					},
-				}
-			},
-		})
-
-		// Only send the second message (index 1) as uncached
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-		const chunks = []
-
-		for await (const chunk of stream) {
-			chunks.push(chunk)
-		}
-
-		expect(mockCacheGet).toHaveBeenCalledWith(cacheKey)
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				contents: expect.any(Array), // Should contain only the *uncached* messages
-				config: expect.objectContaining({
-					cachedContent: mockCacheName, // Cache name provided
-					systemInstruction: undefined, // System prompt NOT sent on hit
-				}),
-			}),
-		)
-
-		// Check that the contents sent are only the *new* messages
-		const calledContents = mockGenerateContentStream.mock.calls[0][0].contents
-		expect(calledContents.length).toBe(mockMessagesLong.length - cachedMessagesCount) // Only new messages sent
-
-		// Wait for potential async cache write (shouldn't happen here)
-		await new Promise(process.nextTick)
-		expect(mockCreateCache).not.toHaveBeenCalled()
-		expect(mockCacheSet).not.toHaveBeenCalled() // No write occurred
-
-		// Check usage data for cache read tokens
-		const usageChunk = chunks.find((c) => c.type === "usage")
-		expect(usageChunk).toEqual(
-			expect.objectContaining({
-				inputTokens: 10, // Uncached tokens
-				outputTokens: 50,
-				cacheWriteTokens: undefined, // No write queued
-				cacheReadTokens: cacheReadTokensCount, // Read tokens reported
-			}),
-		)
-	})
-
-	it("should trigger cache write and delete old cache on hit with enough new messages", async () => {
-		const previousCacheName = "generated/caches/old-cache-name"
-		const previousCacheTokens = 3000
-		const previousMessageCount = 1
-
-		mockCacheGet.mockReturnValue({
-			key: previousCacheName,
-			count: previousMessageCount,
-			tokens: previousCacheTokens,
-		})
-
-		// Simulate enough new messages to trigger write (>= CACHE_WRITE_FREQUENCY)
-		const newMessagesCount = 10
-
-		const messagesForCacheWrite = [
-			mockMessagesLong[0], // Will be considered cached
-			...Array(newMessagesCount).fill({ role: "user", content: "new message" }),
-		] as Anthropic.Messages.MessageParam[]
-
-		// Mock generateContentStream to report some uncached tokens
-		mockGenerateContentStream.mockResolvedValue({
-			[Symbol.asyncIterator]: async function* () {
-				yield { text: "Response" }
-				yield {
-					usageMetadata: {
-						promptTokenCount: 500, // Uncached input tokens for the 10 new messages
-						candidatesTokenCount: 50,
-						cachedContentTokenCount: previousCacheTokens,
-					},
-				}
-			},
-		})
-
-		const stream = handlerWithCache.createMessage(systemPrompt, messagesForCacheWrite, cacheKey)
-		const chunks = []
-
-		for await (const chunk of stream) {
-			chunks.push(chunk)
-		}
-
-		expect(mockCacheGet).toHaveBeenCalledWith(cacheKey)
-
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				contents: expect.any(Array), // Should contain only the *new* messages
-				config: expect.objectContaining({
-					cachedContent: previousCacheName, // Old cache name used for reading
-					systemInstruction: undefined, // System prompt NOT sent
-				}),
-			}),
-		)
-		const calledContents = mockGenerateContentStream.mock.calls[0][0].contents
-		expect(calledContents.length).toBe(newMessagesCount) // Only new messages sent
-
-		// Wait for async cache write and delete
-		await new Promise(process.nextTick)
-		await new Promise(process.nextTick) // Needs extra tick for delete promise chain?
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1)
-		expect(mockCreateCache).toHaveBeenCalledWith(
-			expect.objectContaining({
-				// New cache uses *all* messages
-				config: expect.objectContaining({
-					contents: expect.any(Array), // Should contain *all* messagesForCacheWrite
-					systemInstruction: systemPrompt, // System prompt included in *new* cache
-				}),
-			}),
-		)
-		const createCallContents = mockCreateCache.mock.calls[0][0].config.contents
-		expect(createCallContents.length).toBe(messagesForCacheWrite.length) // All messages in new cache
-
-		expect(mockCacheSet).toHaveBeenCalledWith(
-			cacheKey,
-			expect.objectContaining({
-				key: mockCacheName, // New cache name
-				count: messagesForCacheWrite.length, // New count
-				tokens: mockCacheTokens,
-			}),
-		)
-
-		expect(mockDeleteCache).toHaveBeenCalledTimes(1)
-		expect(mockDeleteCache).toHaveBeenCalledWith({ name: previousCacheName }) // Old cache deleted
-
-		const usageChunk = chunks.find((c) => c.type === "usage")
-
-		expect(usageChunk).toEqual(
-			expect.objectContaining({
-				inputTokens: 500, // Uncached tokens
-				outputTokens: 50,
-				cacheWriteTokens: 500, // Write tokens match uncached input when write is queued on hit? No, should be total tokens for the *new* cache. Let's adjust mockCreateCache.
-				cacheReadTokens: previousCacheTokens,
-			}),
-		)
-
-		// Re-run with adjusted expectation after fixing mockCreateCache if needed
-		// Let's assume mockCreateCache returns the *total* tokens for the *new* cache (system + all messages)
-		const expectedNewCacheTotalTokens = 6000 // Example total tokens for the new cache
-
-		mockCreateCache.mockResolvedValue({
-			name: mockCacheName,
-			usageMetadata: { totalTokenCount: expectedNewCacheTotalTokens },
-		})
-
-		// Re-run the stream consumption and checks if necessary, or adjust expectation:
-		// The cacheWriteTokens in usage should reflect the *input* tokens that triggered the write,
-		// which are the *uncached* tokens in this hit scenario.
-		// The cost calculation uses the token count from the *create* response though.
-		// Let's stick to the current implementation: cacheWriteTokens = inputTokens when write is queued.
-		expect(usageChunk?.cacheWriteTokens).toBe(500) // Matches the uncached promptTokenCount
-	})
-
-	it("should handle cache create error gracefully", async () => {
-		const consoleErrorSpy = jest.spyOn(console, "error").mockImplementation(() => {})
-		const createError = new Error("Failed to create cache")
-		mockCreateCache.mockRejectedValue(createError)
-
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-
-		for await (const _ of stream) {
-		}
-
-		// Wait for async cache write attempt
-		await new Promise(process.nextTick)
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1)
-		expect(mockCacheSet).not.toHaveBeenCalled() // Set should not be called on error
-		expect(consoleErrorSpy).toHaveBeenCalledWith(
-			expect.stringContaining("[GeminiHandler] caches.create error"),
-			createError,
-		)
-		consoleErrorSpy.mockRestore()
-	})
-
-	it("should handle cache delete error gracefully", async () => {
-		const consoleErrorSpy = jest.spyOn(console, "error").mockImplementation(() => {})
-		const deleteError = new Error("Failed to delete cache")
-		mockDeleteCache.mockRejectedValue(deleteError)
-
-		// Setup for cache hit + write scenario to trigger delete
-		const previousCacheName = "generated/caches/old-cache-name"
-		mockCacheGet.mockReturnValue({ key: previousCacheName, count: 1, tokens: 3000 })
-
-		const newMessagesCount = 10
-
-		const messagesForCacheWrite = [
-			mockMessagesLong[0],
-			...Array(newMessagesCount).fill({ role: "user", content: "new message" }),
-		] as Anthropic.Messages.MessageParam[]
-
-		const stream = handlerWithCache.createMessage(systemPrompt, messagesForCacheWrite, cacheKey)
-
-		for await (const _ of stream) {
-		}
-
-		// Wait for async cache write and delete attempt
-		await new Promise(process.nextTick)
-		await new Promise(process.nextTick)
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1) // Create still happens
-		expect(mockCacheSet).toHaveBeenCalledTimes(1) // Set still happens
-		expect(mockDeleteCache).toHaveBeenCalledTimes(1) // Delete was attempted
-
-		// Expect a single string argument containing both parts
-		expect(consoleErrorSpy).toHaveBeenCalledWith(
-			expect.stringContaining(
-				`[GeminiHandler] failed to delete stale cache entry ${previousCacheName} -> ${deleteError.message}`,
-			),
-		)
-
-		consoleErrorSpy.mockRestore()
-	})
-})

+ 0 - 1
src/api/providers/__tests__/openrouter.test.ts

@@ -78,7 +78,6 @@ describe("OpenRouterHandler", () => {
 				topP: undefined,
 				promptCache: {
 					supported: true,
-					optional: false,
 				},
 			})
 		})

+ 1 - 5
src/api/providers/__tests__/vertex.test.ts

@@ -56,11 +56,7 @@ describe("VertexHandler", () => {
 				yield { type: "usage", inputTokens: 0, outputTokens: 5 }
 			})
 
-			const mockCacheKey = "cacheKey"
-			// Since we're directly mocking createMessage, we don't need to spy on it
-			// We just need to call it and verify the results
-
-			const stream = handler.createMessage(systemPrompt, mockMessages, mockCacheKey)
+			const stream = handler.createMessage(systemPrompt, mockMessages)
 
 			const chunks: ApiStreamChunk[] = []
 

Diff is too large to display
+ 0 - 0
src/api/providers/fetchers/__tests__/fixtures/openrouter-models.json


+ 7 - 6
src/api/providers/fetchers/__tests__/openrouter.test.ts

@@ -13,6 +13,7 @@ nockBack.setMode("lockdown")
 
 describe("OpenRouter API", () => {
 	describe("getOpenRouterModels", () => {
+		// This flakes in CI (probably related to Nock). Need to figure out why.
 		it.skip("fetches models and validates schema", async () => {
 			const { nockDone } = await nockBack("openrouter-models.json")
 
@@ -66,12 +67,12 @@ describe("OpenRouter API", () => {
 				supportsComputerUse: true,
 			})
 
-			expect(
-				Object.entries(models)
-					.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
-					.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
-					.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
-			).toEqual([
+			const anthropicModels = Object.entries(models)
+				.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
+				.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
+				.sort(({ id: a }, { id: b }) => a.localeCompare(b))
+
+			expect(anthropicModels).toEqual([
 				{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
 				{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
 				{ id: "anthropic/claude-3-opus", maxTokens: 4096 },

+ 2 - 13
src/api/providers/fetchers/openrouter.ts

@@ -1,13 +1,7 @@
 import axios from "axios"
 import { z } from "zod"
 
-import {
-	ApiHandlerOptions,
-	ModelInfo,
-	anthropicModels,
-	COMPUTER_USE_MODELS,
-	OPTIONAL_PROMPT_CACHING_MODELS,
-} from "../../../shared/api"
+import { ApiHandlerOptions, ModelInfo, anthropicModels, COMPUTER_USE_MODELS } from "../../../shared/api"
 import { parseApiPrice } from "../../../utils/cost"
 
 // https://openrouter.ai/api/v1/models
@@ -72,7 +66,7 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions): Promise<
 				typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"
 
 			const modelInfo: ModelInfo = {
-				maxTokens: 0,
+				maxTokens: rawModel.id.startsWith("anthropic/") ? rawModel.top_provider?.max_completion_tokens : 0,
 				contextWindow: rawModel.context_length,
 				supportsImages: rawModel.architecture?.modality?.includes("image"),
 				supportsPromptCache,
@@ -90,11 +84,6 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions): Promise<
 				modelInfo.supportsComputerUse = true
 			}
 
-			// We want to treat prompt caching as "experimental" for these models.
-			if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
-				modelInfo.isPromptCacheOptional = true
-			}
-
 			// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
 			// values can be configured. For the non-thinking variant we want to
 			// use 8k. The `thinking` variant can be run in 64k and 128k modes,
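
	A quick illustration of the new maxTokens mapping, using made-up raw model records shaped like the OpenRouter payload:

	// Hypothetical raw records; only Anthropic ids pick up top_provider.max_completion_tokens.
	const claude = { id: "anthropic/claude-3-haiku", top_provider: { max_completion_tokens: 4096 } }
	const other = { id: "google/gemini-2.0-flash-001", top_provider: { max_completion_tokens: 8192 } }

	const maxTokensFor = (rawModel: typeof claude) =>
		rawModel.id.startsWith("anthropic/") ? rawModel.top_provider?.max_completion_tokens : 0

	maxTokensFor(claude) // 4096, matching the updated openrouter.test.ts expectations
	maxTokensFor(other) // 0; non-Anthropic models keep the previous default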

+ 13 - 174
src/api/providers/gemini.ts

@@ -3,33 +3,18 @@ import {
 	GoogleGenAI,
 	type GenerateContentResponseUsageMetadata,
 	type GenerateContentParameters,
-	type Content,
+	type GenerateContentConfig,
 } from "@google/genai"
 import type { JWTInput } from "google-auth-library"
-import NodeCache from "node-cache"
 
 import { ApiHandlerOptions, ModelInfo, GeminiModelId, geminiDefaultModelId, geminiModels } from "../../shared/api"
 import { safeJsonParse } from "../../shared/safeJsonParse"
 
 import { SingleCompletionHandler } from "../index"
-import {
-	convertAnthropicContentToGemini,
-	convertAnthropicMessageToGemini,
-	getMessagesLength,
-} from "../transform/gemini-format"
+import { convertAnthropicContentToGemini, convertAnthropicMessageToGemini } from "../transform/gemini-format"
 import type { ApiStream } from "../transform/stream"
 import { BaseProvider } from "./base-provider"
 
-const CACHE_TTL = 5
-const CACHE_WRITE_FREQUENCY = 10
-const CONTEXT_CACHE_TOKEN_MINIMUM = 4096
-
-type CacheEntry = {
-	key: string
-	count: number
-	tokens?: number
-}
-
 type GeminiHandlerOptions = ApiHandlerOptions & {
 	isVertex?: boolean
 }
@@ -38,8 +23,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 	protected options: ApiHandlerOptions
 
 	private client: GoogleGenAI
-	private contentCaches: NodeCache
-	private isCacheBusy = false
 
 	constructor({ isVertex, ...options }: GeminiHandlerOptions) {
 		super()
@@ -69,78 +52,25 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 				: isVertex
 					? new GoogleGenAI({ vertexai: true, project, location })
 					: new GoogleGenAI({ apiKey })
-
-		this.contentCaches = new NodeCache({ stdTTL: 5 * 60, checkperiod: 5 * 60 })
 	}
 
-	async *createMessage(
-		systemInstruction: string,
-		messages: Anthropic.Messages.MessageParam[],
-		cacheKey?: string,
-	): ApiStream {
+	async *createMessage(systemInstruction: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel()
 
 		const contents = messages.map(convertAnthropicMessageToGemini)
-		const contentsLength = systemInstruction.length + getMessagesLength(contents)
-
-		let uncachedContent: Content[] | undefined = undefined
-		let cachedContent: string | undefined = undefined
-
-		// The minimum input token count for context caching is 4,096.
-		// For a basic approximation we assume 4 characters per token.
-		// We can use tiktoken eventually to get a more accurat token count.
-		// https://ai.google.dev/gemini-api/docs/caching?lang=node
-		// https://ai.google.dev/gemini-api/docs/tokens?lang=node
-		const isCacheAvailable =
-			info.supportsPromptCache &&
-			!this.options.promptCachingDisabled &&
-			cacheKey &&
-			contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM
-
-		let isCacheWriteQueued = false
-
-		if (isCacheAvailable) {
-			const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)
-
-			if (cacheEntry) {
-				uncachedContent = contents.slice(cacheEntry.count, contents.length)
-				cachedContent = cacheEntry.key
-				// console.log(
-				// 	`[GeminiHandler] using cache entry ${cacheEntry.key} -> ${cacheEntry.count} messages, ${cacheEntry.tokens} tokens (+${uncachedContent.length} uncached messages)`,
-				// )
-			}
 
-			// If `CACHE_WRITE_FREQUENCY` messages have been appended since the
-			// last cache write then write a new cache entry.
-			// TODO: Use a token count instead.
-			if (!cacheEntry || (uncachedContent && uncachedContent.length >= CACHE_WRITE_FREQUENCY)) {
-				isCacheWriteQueued = true
-			}
+		const config: GenerateContentConfig = {
+			systemInstruction,
+			httpOptions: this.options.googleGeminiBaseUrl ? { baseUrl: this.options.googleGeminiBaseUrl } : undefined,
+			thinkingConfig,
+			maxOutputTokens,
+			temperature: this.options.modelTemperature ?? 0,
 		}
 
-		const isCacheUsed = !!cachedContent
-
-		const params: GenerateContentParameters = {
-			model,
-			contents: uncachedContent ?? contents,
-			config: {
-				cachedContent,
-				systemInstruction: isCacheUsed ? undefined : systemInstruction,
-				httpOptions: this.options.googleGeminiBaseUrl
-					? { baseUrl: this.options.googleGeminiBaseUrl }
-					: undefined,
-				thinkingConfig,
-				maxOutputTokens,
-				temperature: this.options.modelTemperature ?? 0,
-			},
-		}
+		const params: GenerateContentParameters = { model, contents, config }
 
 		const result = await this.client.models.generateContentStream(params)
 
-		if (cacheKey && isCacheWriteQueued) {
-			this.writeCache({ cacheKey, model, systemInstruction, contents })
-		}
-
 		let lastUsageMetadata: GenerateContentResponseUsageMetadata | undefined
 
 		for await (const chunk of result) {
@@ -156,7 +86,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		if (lastUsageMetadata) {
 			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
 			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
-			const cacheWriteTokens = isCacheWriteQueued ? inputTokens : undefined
 			const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
 			const reasoningTokens = lastUsageMetadata.thoughtsTokenCount
 
@@ -164,16 +93,9 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 				type: "usage",
 				inputTokens,
 				outputTokens,
-				cacheWriteTokens,
 				cacheReadTokens,
 				reasoningTokens,
-				totalCost: this.calculateCost({
-					info,
-					inputTokens,
-					outputTokens,
-					cacheWriteTokens,
-					cacheReadTokens,
-				}),
+				totalCost: this.calculateCost({ info, inputTokens, outputTokens, cacheReadTokens }),
 			}
 		}
 	}
@@ -257,22 +179,19 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		info,
 		inputTokens,
 		outputTokens,
-		cacheWriteTokens = 0,
 		cacheReadTokens = 0,
 	}: {
 		info: ModelInfo
 		inputTokens: number
 		outputTokens: number
-		cacheWriteTokens?: number
 		cacheReadTokens?: number
 	}) {
-		if (!info.inputPrice || !info.outputPrice || !info.cacheWritesPrice || !info.cacheReadsPrice) {
+		if (!info.inputPrice || !info.outputPrice || !info.cacheReadsPrice) {
 			return undefined
 		}
 
 		let inputPrice = info.inputPrice
 		let outputPrice = info.outputPrice
-		let cacheWritesPrice = info.cacheWritesPrice
 		let cacheReadsPrice = info.cacheReadsPrice
 
 		// If there's tiered pricing then adjust the input and output token prices
@@ -283,7 +202,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			if (tier) {
 				inputPrice = tier.inputPrice ?? inputPrice
 				outputPrice = tier.outputPrice ?? outputPrice
-				cacheWritesPrice = tier.cacheWritesPrice ?? cacheWritesPrice
 				cacheReadsPrice = tier.cacheReadsPrice ?? cacheReadsPrice
 			}
 		}
@@ -291,23 +209,17 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		// Subtract the cached input tokens from the total input tokens.
 		const uncachedInputTokens = inputTokens - cacheReadTokens
 
-		let cacheWriteCost =
-			cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
 		let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0
 
 		const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
 		const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
-		const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+		const totalCost = inputTokensCost + outputTokensCost + cacheReadCost
 
 		const trace: Record<string, { price: number; tokens: number; cost: number }> = {
 			input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
 			output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
 		}
 
-		if (cacheWriteTokens > 0) {
-			trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
-		}
-
 		if (cacheReadTokens > 0) {
 			trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
 		}
@@ -316,77 +228,4 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 
 		return totalCost
 	}
-
-	private writeCache({
-		cacheKey,
-		model,
-		systemInstruction,
-		contents,
-	}: {
-		cacheKey: string
-		model: string
-		systemInstruction: string
-		contents: Content[]
-	}) {
-		// TODO: https://www.npmjs.com/package/p-queue
-		if (this.isCacheBusy) {
-			return
-		}
-
-		this.isCacheBusy = true
-		// const timestamp = Date.now()
-
-		const previousCacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)
-
-		this.client.caches
-			.create({
-				model,
-				config: {
-					contents,
-					systemInstruction,
-					ttl: `${CACHE_TTL * 60}s`,
-					httpOptions: { timeout: 120_000 },
-				},
-			})
-			.then((result) => {
-				const { name, usageMetadata } = result
-
-				if (name) {
-					const newCacheEntry: CacheEntry = {
-						key: name,
-						count: contents.length,
-						tokens: usageMetadata?.totalTokenCount,
-					}
-
-					this.contentCaches.set<CacheEntry>(cacheKey, newCacheEntry)
-
-					// console.log(
-					// 	`[GeminiHandler] created cache entry ${newCacheEntry.key} -> ${newCacheEntry.count} messages, ${newCacheEntry.tokens} tokens (${Date.now() - timestamp}ms)`,
-					// )
-
-					if (previousCacheEntry) {
-						// const timestamp = Date.now()
-
-						this.client.caches
-							.delete({ name: previousCacheEntry.key })
-							.then(() => {
-								// console.log(
-								// 	`[GeminiHandler] deleted cache entry ${previousCacheEntry.key} -> ${previousCacheEntry.count} messages, ${previousCacheEntry.tokens} tokens (${Date.now() - timestamp}ms)`,
-								// )
-							})
-							.catch((error) => {
-								console.error(
-									`[GeminiHandler] failed to delete stale cache entry ${previousCacheEntry.key} -> ${error instanceof Error ? error.message : String(error)}`,
-								)
-							})
-					}
-				}
-			})
-			.catch((error) => {
-				console.error(`[GeminiHandler] caches.create error`, error)
-			})
-			.finally(() => {
-				this.isCacheBusy = false
-			})
-	}
 }
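
	A worked example of the simplified cost path, using the gemini-2.0-flash-001 prices from this diff (inputPrice 0.1, outputPrice 0.4, cacheReadsPrice 0.025 per 1M tokens); the token counts are invented for illustration:

	const inputTokens = 100_000 // promptTokenCount, which includes the cached portion
	const cacheReadTokens = 60_000 // cachedContentTokenCount reported by implicit caching
	const outputTokens = 2_000

	const uncachedInputTokens = inputTokens - cacheReadTokens // 40_000
	const totalCost =
		0.1 * (uncachedInputTokens / 1_000_000) + // 0.0040
		0.4 * (outputTokens / 1_000_000) + // 0.0008
		0.025 * (cacheReadTokens / 1_000_000) // 0.0015
	// totalCost === 0.0063; there is no longer a cache-write line item or TTL factor.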

+ 1 - 3
src/api/providers/openrouter.ts

@@ -8,7 +8,6 @@ import {
 	openRouterDefaultModelId,
 	openRouterDefaultModelInfo,
 	PROMPT_CACHING_MODELS,
-	OPTIONAL_PROMPT_CACHING_MODELS,
 	REASONING_MODELS,
 } from "../../shared/api"
 
@@ -94,7 +93,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
 		}
 
-		const isCacheAvailable = promptCache.supported && (!promptCache.optional || !this.options.promptCachingDisabled)
+		const isCacheAvailable = promptCache.supported
 
 		// https://openrouter.ai/docs/features/prompt-caching
 		if (isCacheAvailable) {
@@ -191,7 +190,6 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			topP: isDeepSeekR1 ? 0.95 : undefined,
 			promptCache: {
 				supported: PROMPT_CACHING_MODELS.has(id),
-				optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
 			},
 		}
 	}

+ 0 - 6
src/api/transform/gemini-format.ts

@@ -76,9 +76,3 @@ export function convertAnthropicMessageToGemini(message: Anthropic.Messages.Mess
 		parts: convertAnthropicContentToGemini(message.content),
 	}
 }
-
-const getContentLength = ({ parts }: Content): number =>
-	parts?.reduce((length, { text }) => length + (text?.length ?? 0), 0) ?? 0
-
-export const getMessagesLength = (contents: Content[]): number =>
-	contents.reduce((length, content) => length + getContentLength(content), 0)

+ 1 - 5
src/core/task/Task.ts

@@ -132,7 +132,6 @@ export class Task extends EventEmitter<ClineEvents> {
 	// API
 	readonly apiConfiguration: ProviderSettings
 	api: ApiHandler
-	private promptCacheKey: string
 	private lastApiRequestTime?: number
 
 	toolRepetitionDetector: ToolRepetitionDetector
@@ -225,7 +224,6 @@ export class Task extends EventEmitter<ClineEvents> {
 
 		this.apiConfiguration = apiConfiguration
 		this.api = buildApiHandler(apiConfiguration)
-		this.promptCacheKey = crypto.randomUUID()
 
 		this.urlContentFetcher = new UrlContentFetcher(provider.context)
 		this.browserSession = new BrowserSession(provider.context)
@@ -324,8 +322,6 @@ export class Task extends EventEmitter<ClineEvents> {
 	}
 
 	public async overwriteClineMessages(newMessages: ClineMessage[]) {
-		// Reset the the prompt cache key since we've altered the conversation history.
-		this.promptCacheKey = crypto.randomUUID()
 		this.clineMessages = newMessages
 		await this.saveClineMessages()
 	}
@@ -1493,7 +1489,7 @@ export class Task extends EventEmitter<ClineEvents> {
 			return { role, content }
 		})
 
-		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory, this.promptCacheKey)
+		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory)
 		const iterator = stream[Symbol.asyncIterator]()
 
 		try {

+ 0 - 2
src/exports/roo-code.d.ts

@@ -198,7 +198,6 @@ type ProviderSettings = {
 		| undefined
 	includeMaxTokens?: boolean | undefined
 	reasoningEffort?: ("low" | "medium" | "high") | undefined
-	promptCachingDisabled?: boolean | undefined
 	diffEnabled?: boolean | undefined
 	fuzzyMatchThreshold?: number | undefined
 	modelTemperature?: (number | null) | undefined
@@ -242,7 +241,6 @@ type ProviderSettings = {
 				supportsImages?: boolean | undefined
 				supportsComputerUse?: boolean | undefined
 				supportsPromptCache: boolean
-				isPromptCacheOptional?: boolean | undefined
 				inputPrice?: number | undefined
 				outputPrice?: number | undefined
 				cacheWritesPrice?: number | undefined

+ 0 - 2
src/exports/types.ts

@@ -201,7 +201,6 @@ type ProviderSettings = {
 		| undefined
 	includeMaxTokens?: boolean | undefined
 	reasoningEffort?: ("low" | "medium" | "high") | undefined
-	promptCachingDisabled?: boolean | undefined
 	diffEnabled?: boolean | undefined
 	fuzzyMatchThreshold?: number | undefined
 	modelTemperature?: (number | null) | undefined
@@ -245,7 +244,6 @@ type ProviderSettings = {
 				supportsImages?: boolean | undefined
 				supportsComputerUse?: boolean | undefined
 				supportsPromptCache: boolean
-				isPromptCacheOptional?: boolean | undefined
 				inputPrice?: number | undefined
 				outputPrice?: number | undefined
 				cacheWritesPrice?: number | undefined

+ 0 - 3
src/schemas/index.ts

@@ -109,7 +109,6 @@ export const modelInfoSchema = z.object({
 	supportsImages: z.boolean().optional(),
 	supportsComputerUse: z.boolean().optional(),
 	supportsPromptCache: z.boolean(),
-	isPromptCacheOptional: z.boolean().optional(),
 	inputPrice: z.number().optional(),
 	outputPrice: z.number().optional(),
 	cacheWritesPrice: z.number().optional(),
@@ -348,7 +347,6 @@ export type ProviderSettingsEntry = z.infer<typeof providerSettingsEntrySchema>
 const baseProviderSettingsSchema = z.object({
 	includeMaxTokens: z.boolean().optional(),
 	reasoningEffort: reasoningEffortsSchema.optional(),
-	promptCachingDisabled: z.boolean().optional(),
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
 	modelTemperature: z.number().nullish(),
@@ -629,7 +627,6 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	// Generic
 	includeMaxTokens: undefined,
 	reasoningEffort: undefined,
-	promptCachingDisabled: undefined,
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
 	modelTemperature: undefined,

+ 3 - 20
src/shared/api.ts

@@ -496,7 +496,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5,
 		outputPrice: 15,
 	},
@@ -505,7 +504,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5,
 		outputPrice: 15,
 	},
@@ -530,7 +528,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.15,
 		outputPrice: 0.6,
 	},
@@ -555,7 +552,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.075,
 		outputPrice: 0.3,
 	},
@@ -690,7 +686,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5, // This is the pricing for prompts above 200k tokens.
 		outputPrice: 15,
 		cacheReadsPrice: 0.625,
@@ -715,7 +710,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5, // This is the pricing for prompts above 200k tokens.
 		outputPrice: 15,
 		cacheReadsPrice: 0.625,
@@ -740,7 +734,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.1,
 		outputPrice: 0.4,
 		cacheReadsPrice: 0.025,
@@ -791,7 +784,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.15, // This is the pricing for prompts above 128k tokens.
 		outputPrice: 0.6,
 		cacheReadsPrice: 0.0375,
@@ -1717,18 +1709,9 @@ export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3.7-sonnet",
 	"anthropic/claude-3.7-sonnet:beta",
 	"anthropic/claude-3.7-sonnet:thinking",
-	"google/gemini-2.5-pro-preview-03-25",
-	"google/gemini-2.5-pro-preview-05-06",
-	"google/gemini-2.0-flash-001",
-	"google/gemini-flash-1.5",
-	"google/gemini-flash-1.5-8b",
-])
-
-// These models don't have prompt caching enabled by default (you can turn it on
-// in settings).
-export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
-	"google/gemini-2.5-pro-preview-03-25",
-	"google/gemini-2.5-pro-preview-05-06",
+	"google/gemini-2.5-pro-preview",
+	"google/gemini-2.5-flash-preview",
+	"google/gemini-2.5-flash-preview:thinking",
 	"google/gemini-2.0-flash-001",
 	"google/gemini-flash-1.5",
 	"google/gemini-flash-1.5-8b",

+ 0 - 8
webview-ui/src/components/settings/ApiOptions.tsx

@@ -48,7 +48,6 @@ import { ModelInfoView } from "./ModelInfoView"
 import { ApiErrorMessage } from "./ApiErrorMessage"
 import { ThinkingBudget } from "./ThinkingBudget"
 import { ReasoningEffort } from "./ReasoningEffort"
-import { PromptCachingControl } from "./PromptCachingControl"
 import { DiffSettingsControl } from "./DiffSettingsControl"
 import { TemperatureControl } from "./TemperatureControl"
 import { RateLimitSecondsControl } from "./RateLimitSecondsControl"
@@ -469,13 +468,6 @@ const ApiOptions = ({
 				/>
 			)}
 
-			{selectedModelInfo && selectedModelInfo.supportsPromptCache && selectedModelInfo.isPromptCacheOptional && (
-				<PromptCachingControl
-					apiConfiguration={apiConfiguration}
-					setApiConfigurationField={setApiConfigurationField}
-				/>
-			)}
-
 			{!fromWelcomeView && (
 				<>
 					<DiffSettingsControl

+ 0 - 29
webview-ui/src/components/settings/PromptCachingControl.tsx

@@ -1,29 +0,0 @@
-import { VSCodeCheckbox } from "@vscode/webview-ui-toolkit/react"
-
-import { ProviderSettings } from "@roo/shared/api"
-
-import { useAppTranslation } from "@src/i18n/TranslationContext"
-
-interface PromptCachingControlProps {
-	apiConfiguration: ProviderSettings
-	setApiConfigurationField: <K extends keyof ProviderSettings>(field: K, value: ProviderSettings[K]) => void
-}
-
-export const PromptCachingControl = ({ apiConfiguration, setApiConfigurationField }: PromptCachingControlProps) => {
-	const { t } = useAppTranslation()
-
-	return (
-		<>
-			<div>
-				<VSCodeCheckbox
-					checked={apiConfiguration.promptCachingDisabled}
-					onChange={(e: any) => setApiConfigurationField("promptCachingDisabled", e.target.checked)}>
-					<label className="block font-medium mb-1">{t("settings:promptCaching.label")}</label>
-				</VSCodeCheckbox>
-				<div className="text-sm text-vscode-descriptionForeground mt-1">
-					{t("settings:promptCaching.description")}
-				</div>
-			</div>
-		</>
-	)
-}

Some files were not shown because too many files were changed