
Gemini implicit caching (#3515)

Chris Estreich 7 months ago
parent
commit
20fab97b4d

+ 0 - 3
evals/packages/types/src/roo-code.ts

@@ -102,7 +102,6 @@ export const modelInfoSchema = z.object({
 	supportsImages: z.boolean().optional(),
 	supportsComputerUse: z.boolean().optional(),
 	supportsPromptCache: z.boolean(),
-	isPromptCacheOptional: z.boolean().optional(),
 	inputPrice: z.number().optional(),
 	outputPrice: z.number().optional(),
 	cacheWritesPrice: z.number().optional(),
@@ -336,7 +335,6 @@ export type ProviderSettingsEntry = z.infer<typeof providerSettingsEntrySchema>
 const genericProviderSettingsSchema = z.object({
 	includeMaxTokens: z.boolean().optional(),
 	reasoningEffort: reasoningEffortsSchema.optional(),
-	promptCachingDisabled: z.boolean().optional(),
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
 	modelTemperature: z.number().nullish(),
@@ -699,7 +697,6 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	// Generic
 	includeMaxTokens: undefined,
 	reasoningEffort: undefined,
-	promptCachingDisabled: undefined,
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
 	modelTemperature: undefined,

+ 1 - 1
src/api/index.ts

@@ -32,7 +32,7 @@ export interface SingleCompletionHandler {
 }
 
 export interface ApiHandler {
-	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[], cacheKey?: string): ApiStream
+	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream
 
 	getModel(): { id: string; info: ModelInfo }
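
	For reference, a minimal sketch of a call site against the simplified two-argument interface; `handler`, `systemPrompt`, and `messages` are placeholders, and the chunk shapes follow the usage events exercised in the tests below:

	// Hypothetical call site: the cacheKey argument is gone, so callers just
	// stream text and read cache metrics from the usage chunk.
	const stream = handler.createMessage(systemPrompt, messages)

	for await (const chunk of stream) {
		if (chunk.type === "text") {
			process.stdout.write(chunk.text)
		} else if (chunk.type === "usage") {
			// With implicit caching, the provider reports cacheReadTokens itself.
			console.log(chunk.inputTokens, chunk.outputTokens, chunk.cacheReadTokens)
		}
	}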
 

+ 1 - 382
src/api/providers/__tests__/gemini.test.ts

@@ -219,7 +219,7 @@ describe("GeminiHandler", () => {
 				mockInfo.cacheWritesPrice! * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
 			const expectedCost = expectedInputCost + expectedOutputCost + expectedCacheWriteCost
 
-			const cost = handler.calculateCost({ info: mockInfo, inputTokens, outputTokens, cacheWriteTokens })
+			const cost = handler.calculateCost({ info: mockInfo, inputTokens, outputTokens })
 			expect(cost).toBeCloseTo(expectedCost)
 		})
 
@@ -247,384 +247,3 @@ describe("GeminiHandler", () => {
 		})
 	})
 })
-
-describe("Caching Logic", () => {
-	const systemPrompt = "System prompt"
-	const longContent = "a".repeat(5 * 4096) // Ensure content is long enough for caching
-	const mockMessagesLong: Anthropic.Messages.MessageParam[] = [
-		{ role: "user", content: longContent },
-		{ role: "assistant", content: "OK" },
-	]
-	const cacheKey = "test-cache-key"
-	const mockCacheName = "generated/caches/mock-cache-name"
-	const mockCacheTokens = 5000
-
-	let handlerWithCache: GeminiHandler
-	let mockGenerateContentStream: jest.Mock
-	let mockCreateCache: jest.Mock
-	let mockDeleteCache: jest.Mock
-	let mockCacheGet: jest.Mock
-	let mockCacheSet: jest.Mock
-
-	beforeEach(() => {
-		mockGenerateContentStream = jest.fn().mockResolvedValue({
-			[Symbol.asyncIterator]: async function* () {
-				yield { text: "Response" }
-				yield {
-					usageMetadata: {
-						promptTokenCount: 100, // Uncached input
-						candidatesTokenCount: 50, // Output
-						cachedContentTokenCount: 0, // Default, override in tests
-					},
-				}
-			},
-		})
-		mockCreateCache = jest.fn().mockResolvedValue({
-			name: mockCacheName,
-			usageMetadata: { totalTokenCount: mockCacheTokens },
-		})
-		mockDeleteCache = jest.fn().mockResolvedValue({})
-		mockCacheGet = jest.fn().mockReturnValue(undefined) // Default: cache miss
-		mockCacheSet = jest.fn()
-
-		handlerWithCache = new GeminiHandler({
-			apiKey: "test-key",
-			apiModelId: "gemini-1.5-flash-latest", // Use a model that supports caching
-			geminiApiKey: "test-key",
-			promptCachingDisabled: false,
-		})
-
-		handlerWithCache["client"] = {
-			models: {
-				generateContentStream: mockGenerateContentStream,
-			},
-			caches: {
-				create: mockCreateCache,
-				delete: mockDeleteCache,
-			},
-		} as any
-		handlerWithCache["contentCaches"] = {
-			get: mockCacheGet,
-			set: mockCacheSet,
-		} as any
-	})
-
-	it("should not use cache if promptCachingDisabled is true", async () => {
-		handlerWithCache["options"].promptCachingDisabled = true
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-
-		for await (const _ of stream) {
-		}
-
-		expect(mockCacheGet).not.toHaveBeenCalled()
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				config: expect.objectContaining({
-					cachedContent: undefined,
-					systemInstruction: systemPrompt,
-				}),
-			}),
-		)
-		expect(mockCreateCache).not.toHaveBeenCalled()
-	})
-
-	it("should not use cache if content length is below threshold", async () => {
-		const shortMessages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "short" }]
-		const stream = handlerWithCache.createMessage(systemPrompt, shortMessages, cacheKey)
-		for await (const _ of stream) {
-			/* consume stream */
-		}
-
-		expect(mockCacheGet).not.toHaveBeenCalled() // Doesn't even check cache if too short
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				config: expect.objectContaining({
-					cachedContent: undefined,
-					systemInstruction: systemPrompt,
-				}),
-			}),
-		)
-		expect(mockCreateCache).not.toHaveBeenCalled()
-	})
-
-	it("should perform cache write on miss when conditions met", async () => {
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-		const chunks = []
-
-		for await (const chunk of stream) {
-			chunks.push(chunk)
-		}
-
-		expect(mockCacheGet).toHaveBeenCalledWith(cacheKey)
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				config: expect.objectContaining({
-					cachedContent: undefined,
-					systemInstruction: systemPrompt,
-				}),
-			}),
-		)
-
-		await new Promise(process.nextTick) // Allow microtasks (like the async writeCache) to run
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1)
-		expect(mockCreateCache).toHaveBeenCalledWith(
-			expect.objectContaining({
-				model: expect.stringContaining("gemini-2.0-flash-001"), // Adjusted expectation based on test run
-				config: expect.objectContaining({
-					systemInstruction: systemPrompt,
-					contents: expect.any(Array), // Verify contents structure if needed
-					ttl: expect.stringContaining("300s"),
-				}),
-			}),
-		)
-		expect(mockCacheSet).toHaveBeenCalledWith(
-			cacheKey,
-			expect.objectContaining({
-				key: mockCacheName,
-				count: mockMessagesLong.length,
-				tokens: mockCacheTokens,
-			}),
-		)
-		expect(mockDeleteCache).not.toHaveBeenCalled() // No previous cache to delete
-
-		const usageChunk = chunks.find((c) => c.type === "usage")
-
-		expect(usageChunk).toEqual(
-			expect.objectContaining({
-				cacheWriteTokens: 100, // Should match promptTokenCount when write is queued
-				cacheReadTokens: 0,
-			}),
-		)
-	})
-
-	it("should use cache on hit and not send system prompt", async () => {
-		const cachedMessagesCount = 1
-		const cacheReadTokensCount = 4000
-		mockCacheGet.mockReturnValue({ key: mockCacheName, count: cachedMessagesCount, tokens: cacheReadTokensCount })
-
-		mockGenerateContentStream.mockResolvedValue({
-			[Symbol.asyncIterator]: async function* () {
-				yield { text: "Response" }
-				yield {
-					usageMetadata: {
-						promptTokenCount: 10, // Uncached input tokens
-						candidatesTokenCount: 50,
-						cachedContentTokenCount: cacheReadTokensCount, // Simulate cache hit reporting
-					},
-				}
-			},
-		})
-
-		// Only send the second message (index 1) as uncached
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-		const chunks = []
-
-		for await (const chunk of stream) {
-			chunks.push(chunk)
-		}
-
-		expect(mockCacheGet).toHaveBeenCalledWith(cacheKey)
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				contents: expect.any(Array), // Should contain only the *uncached* messages
-				config: expect.objectContaining({
-					cachedContent: mockCacheName, // Cache name provided
-					systemInstruction: undefined, // System prompt NOT sent on hit
-				}),
-			}),
-		)
-
-		// Check that the contents sent are only the *new* messages
-		const calledContents = mockGenerateContentStream.mock.calls[0][0].contents
-		expect(calledContents.length).toBe(mockMessagesLong.length - cachedMessagesCount) // Only new messages sent
-
-		// Wait for potential async cache write (shouldn't happen here)
-		await new Promise(process.nextTick)
-		expect(mockCreateCache).not.toHaveBeenCalled()
-		expect(mockCacheSet).not.toHaveBeenCalled() // No write occurred
-
-		// Check usage data for cache read tokens
-		const usageChunk = chunks.find((c) => c.type === "usage")
-		expect(usageChunk).toEqual(
-			expect.objectContaining({
-				inputTokens: 10, // Uncached tokens
-				outputTokens: 50,
-				cacheWriteTokens: undefined, // No write queued
-				cacheReadTokens: cacheReadTokensCount, // Read tokens reported
-			}),
-		)
-	})
-
-	it("should trigger cache write and delete old cache on hit with enough new messages", async () => {
-		const previousCacheName = "generated/caches/old-cache-name"
-		const previousCacheTokens = 3000
-		const previousMessageCount = 1
-
-		mockCacheGet.mockReturnValue({
-			key: previousCacheName,
-			count: previousMessageCount,
-			tokens: previousCacheTokens,
-		})
-
-		// Simulate enough new messages to trigger write (>= CACHE_WRITE_FREQUENCY)
-		const newMessagesCount = 10
-
-		const messagesForCacheWrite = [
-			mockMessagesLong[0], // Will be considered cached
-			...Array(newMessagesCount).fill({ role: "user", content: "new message" }),
-		] as Anthropic.Messages.MessageParam[]
-
-		// Mock generateContentStream to report some uncached tokens
-		mockGenerateContentStream.mockResolvedValue({
-			[Symbol.asyncIterator]: async function* () {
-				yield { text: "Response" }
-				yield {
-					usageMetadata: {
-						promptTokenCount: 500, // Uncached input tokens for the 10 new messages
-						candidatesTokenCount: 50,
-						cachedContentTokenCount: previousCacheTokens,
-					},
-				}
-			},
-		})
-
-		const stream = handlerWithCache.createMessage(systemPrompt, messagesForCacheWrite, cacheKey)
-		const chunks = []
-
-		for await (const chunk of stream) {
-			chunks.push(chunk)
-		}
-
-		expect(mockCacheGet).toHaveBeenCalledWith(cacheKey)
-
-		expect(mockGenerateContentStream).toHaveBeenCalledWith(
-			expect.objectContaining({
-				contents: expect.any(Array), // Should contain only the *new* messages
-				config: expect.objectContaining({
-					cachedContent: previousCacheName, // Old cache name used for reading
-					systemInstruction: undefined, // System prompt NOT sent
-				}),
-			}),
-		)
-		const calledContents = mockGenerateContentStream.mock.calls[0][0].contents
-		expect(calledContents.length).toBe(newMessagesCount) // Only new messages sent
-
-		// Wait for async cache write and delete
-		await new Promise(process.nextTick)
-		await new Promise(process.nextTick) // Needs extra tick for delete promise chain?
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1)
-		expect(mockCreateCache).toHaveBeenCalledWith(
-			expect.objectContaining({
-				// New cache uses *all* messages
-				config: expect.objectContaining({
-					contents: expect.any(Array), // Should contain *all* messagesForCacheWrite
-					systemInstruction: systemPrompt, // System prompt included in *new* cache
-				}),
-			}),
-		)
-		const createCallContents = mockCreateCache.mock.calls[0][0].config.contents
-		expect(createCallContents.length).toBe(messagesForCacheWrite.length) // All messages in new cache
-
-		expect(mockCacheSet).toHaveBeenCalledWith(
-			cacheKey,
-			expect.objectContaining({
-				key: mockCacheName, // New cache name
-				count: messagesForCacheWrite.length, // New count
-				tokens: mockCacheTokens,
-			}),
-		)
-
-		expect(mockDeleteCache).toHaveBeenCalledTimes(1)
-		expect(mockDeleteCache).toHaveBeenCalledWith({ name: previousCacheName }) // Old cache deleted
-
-		const usageChunk = chunks.find((c) => c.type === "usage")
-
-		expect(usageChunk).toEqual(
-			expect.objectContaining({
-				inputTokens: 500, // Uncached tokens
-				outputTokens: 50,
-				cacheWriteTokens: 500, // Write tokens match uncached input when write is queued on hit? No, should be total tokens for the *new* cache. Let's adjust mockCreateCache.
-				cacheReadTokens: previousCacheTokens,
-			}),
-		)
-
-		// Re-run with adjusted expectation after fixing mockCreateCache if needed
-		// Let's assume mockCreateCache returns the *total* tokens for the *new* cache (system + all messages)
-		const expectedNewCacheTotalTokens = 6000 // Example total tokens for the new cache
-
-		mockCreateCache.mockResolvedValue({
-			name: mockCacheName,
-			usageMetadata: { totalTokenCount: expectedNewCacheTotalTokens },
-		})
-
-		// Re-run the stream consumption and checks if necessary, or adjust expectation:
-		// The cacheWriteTokens in usage should reflect the *input* tokens that triggered the write,
-		// which are the *uncached* tokens in this hit scenario.
-		// The cost calculation uses the token count from the *create* response though.
-		// Let's stick to the current implementation: cacheWriteTokens = inputTokens when write is queued.
-		expect(usageChunk?.cacheWriteTokens).toBe(500) // Matches the uncached promptTokenCount
-	})
-
-	it("should handle cache create error gracefully", async () => {
-		const consoleErrorSpy = jest.spyOn(console, "error").mockImplementation(() => {})
-		const createError = new Error("Failed to create cache")
-		mockCreateCache.mockRejectedValue(createError)
-
-		const stream = handlerWithCache.createMessage(systemPrompt, mockMessagesLong, cacheKey)
-
-		for await (const _ of stream) {
-		}
-
-		// Wait for async cache write attempt
-		await new Promise(process.nextTick)
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1)
-		expect(mockCacheSet).not.toHaveBeenCalled() // Set should not be called on error
-		expect(consoleErrorSpy).toHaveBeenCalledWith(
-			expect.stringContaining("[GeminiHandler] caches.create error"),
-			createError,
-		)
-		consoleErrorSpy.mockRestore()
-	})
-
-	it("should handle cache delete error gracefully", async () => {
-		const consoleErrorSpy = jest.spyOn(console, "error").mockImplementation(() => {})
-		const deleteError = new Error("Failed to delete cache")
-		mockDeleteCache.mockRejectedValue(deleteError)
-
-		// Setup for cache hit + write scenario to trigger delete
-		const previousCacheName = "generated/caches/old-cache-name"
-		mockCacheGet.mockReturnValue({ key: previousCacheName, count: 1, tokens: 3000 })
-
-		const newMessagesCount = 10
-
-		const messagesForCacheWrite = [
-			mockMessagesLong[0],
-			...Array(newMessagesCount).fill({ role: "user", content: "new message" }),
-		] as Anthropic.Messages.MessageParam[]
-
-		const stream = handlerWithCache.createMessage(systemPrompt, messagesForCacheWrite, cacheKey)
-
-		for await (const _ of stream) {
-		}
-
-		// Wait for async cache write and delete attempt
-		await new Promise(process.nextTick)
-		await new Promise(process.nextTick)
-
-		expect(mockCreateCache).toHaveBeenCalledTimes(1) // Create still happens
-		expect(mockCacheSet).toHaveBeenCalledTimes(1) // Set still happens
-		expect(mockDeleteCache).toHaveBeenCalledTimes(1) // Delete was attempted
-
-		// Expect a single string argument containing both parts
-		expect(consoleErrorSpy).toHaveBeenCalledWith(
-			expect.stringContaining(
-				`[GeminiHandler] failed to delete stale cache entry ${previousCacheName} -> ${deleteError.message}`,
-			),
-		)
-
-		consoleErrorSpy.mockRestore()
-	})
-})

+ 0 - 1
src/api/providers/__tests__/openrouter.test.ts

@@ -78,7 +78,6 @@ describe("OpenRouterHandler", () => {
 				topP: undefined,
 				promptCache: {
 					supported: true,
-					optional: false,
 				},
 			})
 		})

+ 1 - 5
src/api/providers/__tests__/vertex.test.ts

@@ -56,11 +56,7 @@ describe("VertexHandler", () => {
 				yield { type: "usage", inputTokens: 0, outputTokens: 5 }
 			})
 
-			const mockCacheKey = "cacheKey"
-			// Since we're directly mocking createMessage, we don't need to spy on it
-			// We just need to call it and verify the results
-
-			const stream = handler.createMessage(systemPrompt, mockMessages, mockCacheKey)
+			const stream = handler.createMessage(systemPrompt, mockMessages)
 
 			const chunks: ApiStreamChunk[] = []
 

Diff is too large to display
+ 0 - 0
src/api/providers/fetchers/__tests__/fixtures/openrouter-models.json


+ 7 - 6
src/api/providers/fetchers/__tests__/openrouter.test.ts

@@ -13,6 +13,7 @@ nockBack.setMode("lockdown")
 
 describe("OpenRouter API", () => {
 	describe("getOpenRouterModels", () => {
+		// This flakes in CI (probably related to Nock). Need to figure out why.
 		it.skip("fetches models and validates schema", async () => {
 			const { nockDone } = await nockBack("openrouter-models.json")
 
@@ -66,12 +67,12 @@ describe("OpenRouter API", () => {
 				supportsComputerUse: true,
 			})
 
-			expect(
-				Object.entries(models)
-					.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
-					.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
-					.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
-			).toEqual([
+			const anthropicModels = Object.entries(models)
+				.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
+				.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
+				.sort(({ id: a }, { id: b }) => a.localeCompare(b))
+
+			expect(anthropicModels).toEqual([
 				{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
 				{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
 				{ id: "anthropic/claude-3-opus", maxTokens: 4096 },

+ 2 - 13
src/api/providers/fetchers/openrouter.ts

@@ -1,13 +1,7 @@
 import axios from "axios"
 import { z } from "zod"
 
-import {
-	ApiHandlerOptions,
-	ModelInfo,
-	anthropicModels,
-	COMPUTER_USE_MODELS,
-	OPTIONAL_PROMPT_CACHING_MODELS,
-} from "../../../shared/api"
+import { ApiHandlerOptions, ModelInfo, anthropicModels, COMPUTER_USE_MODELS } from "../../../shared/api"
 import { parseApiPrice } from "../../../utils/cost"
 
 // https://openrouter.ai/api/v1/models
@@ -72,7 +66,7 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions): Promise<
 				typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"
 
 			const modelInfo: ModelInfo = {
-				maxTokens: 0,
+				maxTokens: rawModel.id.startsWith("anthropic/") ? rawModel.top_provider?.max_completion_tokens : 0,
 				contextWindow: rawModel.context_length,
 				supportsImages: rawModel.architecture?.modality?.includes("image"),
 				supportsPromptCache,
@@ -90,11 +84,6 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions): Promise<
 				modelInfo.supportsComputerUse = true
 			}
 
-			// We want to treat prompt caching as "experimental" for these models.
-			if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
-				modelInfo.isPromptCacheOptional = true
-			}
-
 			// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
 			// values can be configured. For the non-thinking variant we want to
 			// use 8k. The `thinking` variant can be run in 64k and 128k modes,
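
	A quick illustration of the new maxTokens mapping, using made-up raw model records shaped like the OpenRouter payload:

	// Hypothetical raw records; only Anthropic ids pick up top_provider.max_completion_tokens.
	const claude = { id: "anthropic/claude-3-haiku", top_provider: { max_completion_tokens: 4096 } }
	const other = { id: "google/gemini-2.0-flash-001", top_provider: { max_completion_tokens: 8192 } }

	const maxTokensFor = (rawModel: typeof claude) =>
		rawModel.id.startsWith("anthropic/") ? rawModel.top_provider?.max_completion_tokens : 0

	maxTokensFor(claude) // 4096, matching the updated openrouter.test.ts expectations
	maxTokensFor(other) // 0; non-Anthropic models keep the previous default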

+ 13 - 174
src/api/providers/gemini.ts

@@ -3,33 +3,18 @@ import {
 	GoogleGenAI,
 	type GenerateContentResponseUsageMetadata,
 	type GenerateContentParameters,
-	type Content,
+	type GenerateContentConfig,
 } from "@google/genai"
 import type { JWTInput } from "google-auth-library"
-import NodeCache from "node-cache"
 
 import { ApiHandlerOptions, ModelInfo, GeminiModelId, geminiDefaultModelId, geminiModels } from "../../shared/api"
 import { safeJsonParse } from "../../shared/safeJsonParse"
 
 import { SingleCompletionHandler } from "../index"
-import {
-	convertAnthropicContentToGemini,
-	convertAnthropicMessageToGemini,
-	getMessagesLength,
-} from "../transform/gemini-format"
+import { convertAnthropicContentToGemini, convertAnthropicMessageToGemini } from "../transform/gemini-format"
 import type { ApiStream } from "../transform/stream"
 import { BaseProvider } from "./base-provider"
 
-const CACHE_TTL = 5
-const CACHE_WRITE_FREQUENCY = 10
-const CONTEXT_CACHE_TOKEN_MINIMUM = 4096
-
-type CacheEntry = {
-	key: string
-	count: number
-	tokens?: number
-}
-
 type GeminiHandlerOptions = ApiHandlerOptions & {
 	isVertex?: boolean
 }
@@ -38,8 +23,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 	protected options: ApiHandlerOptions
 
 	private client: GoogleGenAI
-	private contentCaches: NodeCache
-	private isCacheBusy = false
 
 	constructor({ isVertex, ...options }: GeminiHandlerOptions) {
 		super()
@@ -69,78 +52,25 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 				: isVertex
 					? new GoogleGenAI({ vertexai: true, project, location })
 					: new GoogleGenAI({ apiKey })
-
-		this.contentCaches = new NodeCache({ stdTTL: 5 * 60, checkperiod: 5 * 60 })
 	}
 
-	async *createMessage(
-		systemInstruction: string,
-		messages: Anthropic.Messages.MessageParam[],
-		cacheKey?: string,
-	): ApiStream {
+	async *createMessage(systemInstruction: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel()
 
 		const contents = messages.map(convertAnthropicMessageToGemini)
-		const contentsLength = systemInstruction.length + getMessagesLength(contents)
-
-		let uncachedContent: Content[] | undefined = undefined
-		let cachedContent: string | undefined = undefined
-
-		// The minimum input token count for context caching is 4,096.
-		// For a basic approximation we assume 4 characters per token.
-		// We can use tiktoken eventually to get a more accurat token count.
-		// https://ai.google.dev/gemini-api/docs/caching?lang=node
-		// https://ai.google.dev/gemini-api/docs/tokens?lang=node
-		const isCacheAvailable =
-			info.supportsPromptCache &&
-			!this.options.promptCachingDisabled &&
-			cacheKey &&
-			contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM
-
-		let isCacheWriteQueued = false
-
-		if (isCacheAvailable) {
-			const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)
-
-			if (cacheEntry) {
-				uncachedContent = contents.slice(cacheEntry.count, contents.length)
-				cachedContent = cacheEntry.key
-				// console.log(
-				// 	`[GeminiHandler] using cache entry ${cacheEntry.key} -> ${cacheEntry.count} messages, ${cacheEntry.tokens} tokens (+${uncachedContent.length} uncached messages)`,
-				// )
-			}
 
-			// If `CACHE_WRITE_FREQUENCY` messages have been appended since the
-			// last cache write then write a new cache entry.
-			// TODO: Use a token count instead.
-			if (!cacheEntry || (uncachedContent && uncachedContent.length >= CACHE_WRITE_FREQUENCY)) {
-				isCacheWriteQueued = true
-			}
+		const config: GenerateContentConfig = {
+			systemInstruction,
+			httpOptions: this.options.googleGeminiBaseUrl ? { baseUrl: this.options.googleGeminiBaseUrl } : undefined,
+			thinkingConfig,
+			maxOutputTokens,
+			temperature: this.options.modelTemperature ?? 0,
 		}
 
-		const isCacheUsed = !!cachedContent
-
-		const params: GenerateContentParameters = {
-			model,
-			contents: uncachedContent ?? contents,
-			config: {
-				cachedContent,
-				systemInstruction: isCacheUsed ? undefined : systemInstruction,
-				httpOptions: this.options.googleGeminiBaseUrl
-					? { baseUrl: this.options.googleGeminiBaseUrl }
-					: undefined,
-				thinkingConfig,
-				maxOutputTokens,
-				temperature: this.options.modelTemperature ?? 0,
-			},
-		}
+		const params: GenerateContentParameters = { model, contents, config }
 
 		const result = await this.client.models.generateContentStream(params)
 
-		if (cacheKey && isCacheWriteQueued) {
-			this.writeCache({ cacheKey, model, systemInstruction, contents })
-		}
-
 		let lastUsageMetadata: GenerateContentResponseUsageMetadata | undefined
 
 		for await (const chunk of result) {
@@ -156,7 +86,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		if (lastUsageMetadata) {
 			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
 			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
-			const cacheWriteTokens = isCacheWriteQueued ? inputTokens : undefined
 			const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
 			const reasoningTokens = lastUsageMetadata.thoughtsTokenCount
 
@@ -164,16 +93,9 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 				type: "usage",
 				inputTokens,
 				outputTokens,
-				cacheWriteTokens,
 				cacheReadTokens,
 				reasoningTokens,
-				totalCost: this.calculateCost({
-					info,
-					inputTokens,
-					outputTokens,
-					cacheWriteTokens,
-					cacheReadTokens,
-				}),
+				totalCost: this.calculateCost({ info, inputTokens, outputTokens, cacheReadTokens }),
 			}
 		}
 	}
@@ -257,22 +179,19 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		info,
 		inputTokens,
 		outputTokens,
-		cacheWriteTokens = 0,
 		cacheReadTokens = 0,
 	}: {
 		info: ModelInfo
 		inputTokens: number
 		outputTokens: number
-		cacheWriteTokens?: number
 		cacheReadTokens?: number
 	}) {
-		if (!info.inputPrice || !info.outputPrice || !info.cacheWritesPrice || !info.cacheReadsPrice) {
+		if (!info.inputPrice || !info.outputPrice || !info.cacheReadsPrice) {
 			return undefined
 		}
 
 		let inputPrice = info.inputPrice
 		let outputPrice = info.outputPrice
-		let cacheWritesPrice = info.cacheWritesPrice
 		let cacheReadsPrice = info.cacheReadsPrice
 
 		// If there's tiered pricing then adjust the input and output token prices
@@ -283,7 +202,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			if (tier) {
 				inputPrice = tier.inputPrice ?? inputPrice
 				outputPrice = tier.outputPrice ?? outputPrice
-				cacheWritesPrice = tier.cacheWritesPrice ?? cacheWritesPrice
 				cacheReadsPrice = tier.cacheReadsPrice ?? cacheReadsPrice
 			}
 		}
@@ -291,23 +209,17 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		// Subtract the cached input tokens from the total input tokens.
 		const uncachedInputTokens = inputTokens - cacheReadTokens
 
-		let cacheWriteCost =
-			cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
 		let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0
 
 		const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
 		const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
-		const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+		const totalCost = inputTokensCost + outputTokensCost + cacheReadCost
 
 		const trace: Record<string, { price: number; tokens: number; cost: number }> = {
 			input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
 			output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
 		}
 
-		if (cacheWriteTokens > 0) {
-			trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
-		}
-
 		if (cacheReadTokens > 0) {
 			trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
 		}
@@ -316,77 +228,4 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 
 		return totalCost
 	}
-
-	private writeCache({
-		cacheKey,
-		model,
-		systemInstruction,
-		contents,
-	}: {
-		cacheKey: string
-		model: string
-		systemInstruction: string
-		contents: Content[]
-	}) {
-		// TODO: https://www.npmjs.com/package/p-queue
-		if (this.isCacheBusy) {
-			return
-		}
-
-		this.isCacheBusy = true
-		// const timestamp = Date.now()
-
-		const previousCacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)
-
-		this.client.caches
-			.create({
-				model,
-				config: {
-					contents,
-					systemInstruction,
-					ttl: `${CACHE_TTL * 60}s`,
-					httpOptions: { timeout: 120_000 },
-				},
-			})
-			.then((result) => {
-				const { name, usageMetadata } = result
-
-				if (name) {
-					const newCacheEntry: CacheEntry = {
-						key: name,
-						count: contents.length,
-						tokens: usageMetadata?.totalTokenCount,
-					}
-
-					this.contentCaches.set<CacheEntry>(cacheKey, newCacheEntry)
-
-					// console.log(
-					// 	`[GeminiHandler] created cache entry ${newCacheEntry.key} -> ${newCacheEntry.count} messages, ${newCacheEntry.tokens} tokens (${Date.now() - timestamp}ms)`,
-					// )
-
-					if (previousCacheEntry) {
-						// const timestamp = Date.now()
-
-						this.client.caches
-							.delete({ name: previousCacheEntry.key })
-							.then(() => {
-								// console.log(
-								// 	`[GeminiHandler] deleted cache entry ${previousCacheEntry.key} -> ${previousCacheEntry.count} messages, ${previousCacheEntry.tokens} tokens (${Date.now() - timestamp}ms)`,
-								// )
-							})
-							.catch((error) => {
-								console.error(
-									`[GeminiHandler] failed to delete stale cache entry ${previousCacheEntry.key} -> ${error instanceof Error ? error.message : String(error)}`,
-								)
-							})
-					}
-				}
-			})
-			.catch((error) => {
-				console.error(`[GeminiHandler] caches.create error`, error)
-			})
-			.finally(() => {
-				this.isCacheBusy = false
-			})
-	}
 }
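
	A worked example of the simplified cost path, using the gemini-2.0-flash-001 prices from this diff (inputPrice 0.1, outputPrice 0.4, cacheReadsPrice 0.025 per 1M tokens); the token counts are invented for illustration:

	const inputTokens = 100_000 // promptTokenCount, which includes the cached portion
	const cacheReadTokens = 60_000 // cachedContentTokenCount reported by implicit caching
	const outputTokens = 2_000

	const uncachedInputTokens = inputTokens - cacheReadTokens // 40_000
	const totalCost =
		0.1 * (uncachedInputTokens / 1_000_000) + // 0.0040
		0.4 * (outputTokens / 1_000_000) + // 0.0008
		0.025 * (cacheReadTokens / 1_000_000) // 0.0015
	// totalCost === 0.0063; there is no longer a cache-write line item or TTL factor.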

+ 1 - 3
src/api/providers/openrouter.ts

@@ -8,7 +8,6 @@ import {
 	openRouterDefaultModelId,
 	openRouterDefaultModelInfo,
 	PROMPT_CACHING_MODELS,
-	OPTIONAL_PROMPT_CACHING_MODELS,
 	REASONING_MODELS,
 } from "../../shared/api"
 
@@ -94,7 +93,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
 		}
 
-		const isCacheAvailable = promptCache.supported && (!promptCache.optional || !this.options.promptCachingDisabled)
+		const isCacheAvailable = promptCache.supported
 
 		// https://openrouter.ai/docs/features/prompt-caching
 		if (isCacheAvailable) {
@@ -191,7 +190,6 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			topP: isDeepSeekR1 ? 0.95 : undefined,
 			promptCache: {
 				supported: PROMPT_CACHING_MODELS.has(id),
-				optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
 			},
 		}
 	}

+ 0 - 6
src/api/transform/gemini-format.ts

@@ -76,9 +76,3 @@ export function convertAnthropicMessageToGemini(message: Anthropic.Messages.Mess
 		parts: convertAnthropicContentToGemini(message.content),
 	}
 }
-
-const getContentLength = ({ parts }: Content): number =>
-	parts?.reduce((length, { text }) => length + (text?.length ?? 0), 0) ?? 0
-
-export const getMessagesLength = (contents: Content[]): number =>
-	contents.reduce((length, content) => length + getContentLength(content), 0)

+ 1 - 5
src/core/task/Task.ts

@@ -132,7 +132,6 @@ export class Task extends EventEmitter<ClineEvents> {
 	// API
 	readonly apiConfiguration: ProviderSettings
 	api: ApiHandler
-	private promptCacheKey: string
 	private lastApiRequestTime?: number
 
 	toolRepetitionDetector: ToolRepetitionDetector
@@ -225,7 +224,6 @@ export class Task extends EventEmitter<ClineEvents> {
 
 		this.apiConfiguration = apiConfiguration
 		this.api = buildApiHandler(apiConfiguration)
-		this.promptCacheKey = crypto.randomUUID()
 
 		this.urlContentFetcher = new UrlContentFetcher(provider.context)
 		this.browserSession = new BrowserSession(provider.context)
@@ -324,8 +322,6 @@ export class Task extends EventEmitter<ClineEvents> {
 	}
 
 	public async overwriteClineMessages(newMessages: ClineMessage[]) {
-		// Reset the the prompt cache key since we've altered the conversation history.
-		this.promptCacheKey = crypto.randomUUID()
 		this.clineMessages = newMessages
 		await this.saveClineMessages()
 	}
@@ -1493,7 +1489,7 @@ export class Task extends EventEmitter<ClineEvents> {
 			return { role, content }
 		})
 
-		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory, this.promptCacheKey)
+		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory)
 		const iterator = stream[Symbol.asyncIterator]()
 
 		try {

+ 0 - 2
src/exports/roo-code.d.ts

@@ -198,7 +198,6 @@ type ProviderSettings = {
 		| undefined
 	includeMaxTokens?: boolean | undefined
 	reasoningEffort?: ("low" | "medium" | "high") | undefined
-	promptCachingDisabled?: boolean | undefined
 	diffEnabled?: boolean | undefined
 	fuzzyMatchThreshold?: number | undefined
 	modelTemperature?: (number | null) | undefined
@@ -242,7 +241,6 @@ type ProviderSettings = {
 				supportsImages?: boolean | undefined
 				supportsComputerUse?: boolean | undefined
 				supportsPromptCache: boolean
-				isPromptCacheOptional?: boolean | undefined
 				inputPrice?: number | undefined
 				outputPrice?: number | undefined
 				cacheWritesPrice?: number | undefined

+ 0 - 2
src/exports/types.ts

@@ -201,7 +201,6 @@ type ProviderSettings = {
 		| undefined
 	includeMaxTokens?: boolean | undefined
 	reasoningEffort?: ("low" | "medium" | "high") | undefined
-	promptCachingDisabled?: boolean | undefined
 	diffEnabled?: boolean | undefined
 	fuzzyMatchThreshold?: number | undefined
 	modelTemperature?: (number | null) | undefined
@@ -245,7 +244,6 @@ type ProviderSettings = {
 				supportsImages?: boolean | undefined
 				supportsComputerUse?: boolean | undefined
 				supportsPromptCache: boolean
-				isPromptCacheOptional?: boolean | undefined
 				inputPrice?: number | undefined
 				outputPrice?: number | undefined
 				cacheWritesPrice?: number | undefined

+ 0 - 3
src/schemas/index.ts

@@ -109,7 +109,6 @@ export const modelInfoSchema = z.object({
 	supportsImages: z.boolean().optional(),
 	supportsComputerUse: z.boolean().optional(),
 	supportsPromptCache: z.boolean(),
-	isPromptCacheOptional: z.boolean().optional(),
 	inputPrice: z.number().optional(),
 	outputPrice: z.number().optional(),
 	cacheWritesPrice: z.number().optional(),
@@ -348,7 +347,6 @@ export type ProviderSettingsEntry = z.infer<typeof providerSettingsEntrySchema>
 const baseProviderSettingsSchema = z.object({
 	includeMaxTokens: z.boolean().optional(),
 	reasoningEffort: reasoningEffortsSchema.optional(),
-	promptCachingDisabled: z.boolean().optional(),
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
 	modelTemperature: z.number().nullish(),
@@ -629,7 +627,6 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	// Generic
 	includeMaxTokens: undefined,
 	reasoningEffort: undefined,
-	promptCachingDisabled: undefined,
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
 	modelTemperature: undefined,

+ 3 - 20
src/shared/api.ts

@@ -496,7 +496,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5,
 		outputPrice: 15,
 	},
@@ -505,7 +504,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5,
 		outputPrice: 15,
 	},
@@ -530,7 +528,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.15,
 		outputPrice: 0.6,
 	},
@@ -555,7 +552,6 @@ export const vertexModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.075,
 		outputPrice: 0.3,
 	},
@@ -690,7 +686,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5, // This is the pricing for prompts above 200k tokens.
 		outputPrice: 15,
 		cacheReadsPrice: 0.625,
@@ -715,7 +710,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 2.5, // This is the pricing for prompts above 200k tokens.
 		outputPrice: 15,
 		cacheReadsPrice: 0.625,
@@ -740,7 +734,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.1,
 		outputPrice: 0.4,
 		cacheReadsPrice: 0.025,
@@ -791,7 +784,6 @@ export const geminiModels = {
 		contextWindow: 1_048_576,
 		supportsImages: true,
 		supportsPromptCache: true,
-		isPromptCacheOptional: true,
 		inputPrice: 0.15, // This is the pricing for prompts above 128k tokens.
 		outputPrice: 0.6,
 		cacheReadsPrice: 0.0375,
@@ -1717,18 +1709,9 @@ export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3.7-sonnet",
 	"anthropic/claude-3.7-sonnet:beta",
 	"anthropic/claude-3.7-sonnet:thinking",
-	"google/gemini-2.5-pro-preview-03-25",
-	"google/gemini-2.5-pro-preview-05-06",
-	"google/gemini-2.0-flash-001",
-	"google/gemini-flash-1.5",
-	"google/gemini-flash-1.5-8b",
-])
-
-// These models don't have prompt caching enabled by default (you can turn it on
-// in settings).
-export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
-	"google/gemini-2.5-pro-preview-03-25",
-	"google/gemini-2.5-pro-preview-05-06",
+	"google/gemini-2.5-pro-preview",
+	"google/gemini-2.5-flash-preview",
+	"google/gemini-2.5-flash-preview:thinking",
 	"google/gemini-2.0-flash-001",
 	"google/gemini-flash-1.5",
 	"google/gemini-flash-1.5-8b",

+ 0 - 8
webview-ui/src/components/settings/ApiOptions.tsx

@@ -48,7 +48,6 @@ import { ModelInfoView } from "./ModelInfoView"
 import { ApiErrorMessage } from "./ApiErrorMessage"
 import { ThinkingBudget } from "./ThinkingBudget"
 import { ReasoningEffort } from "./ReasoningEffort"
-import { PromptCachingControl } from "./PromptCachingControl"
 import { DiffSettingsControl } from "./DiffSettingsControl"
 import { TemperatureControl } from "./TemperatureControl"
 import { RateLimitSecondsControl } from "./RateLimitSecondsControl"
@@ -469,13 +468,6 @@ const ApiOptions = ({
 				/>
 			)}
 
-			{selectedModelInfo && selectedModelInfo.supportsPromptCache && selectedModelInfo.isPromptCacheOptional && (
-				<PromptCachingControl
-					apiConfiguration={apiConfiguration}
-					setApiConfigurationField={setApiConfigurationField}
-				/>
-			)}
-
 			{!fromWelcomeView && (
 				<>
 					<DiffSettingsControl

+ 0 - 29
webview-ui/src/components/settings/PromptCachingControl.tsx

@@ -1,29 +0,0 @@
-import { VSCodeCheckbox } from "@vscode/webview-ui-toolkit/react"
-
-import { ProviderSettings } from "@roo/shared/api"
-
-import { useAppTranslation } from "@src/i18n/TranslationContext"
-
-interface PromptCachingControlProps {
-	apiConfiguration: ProviderSettings
-	setApiConfigurationField: <K extends keyof ProviderSettings>(field: K, value: ProviderSettings[K]) => void
-}
-
-export const PromptCachingControl = ({ apiConfiguration, setApiConfigurationField }: PromptCachingControlProps) => {
-	const { t } = useAppTranslation()
-
-	return (
-		<>
-			<div>
-				<VSCodeCheckbox
-					checked={apiConfiguration.promptCachingDisabled}
-					onChange={(e: any) => setApiConfigurationField("promptCachingDisabled", e.target.checked)}>
-					<label className="block font-medium mb-1">{t("settings:promptCaching.label")}</label>
-				</VSCodeCheckbox>
-				<div className="text-sm text-vscode-descriptionForeground mt-1">
-					{t("settings:promptCaching.description")}
-				</div>
-			</div>
-		</>
-	)
-}

Some files were not shown because too many files were changed