
refactor: unify cache control with centralized breakpoints and universal provider options (#11426)

Hannes Rudolph 2 days ago
parent
commit
897c372d2d

+ 7 - 0
src/api/index.ts

@@ -88,6 +88,13 @@ export interface ApiHandlerCreateMessageMetadata {
 	 * Only applies to providers that support function calling restrictions (e.g., Gemini).
 	 */
 	allowedFunctionNames?: string[]
+	/** Provider-specific options for tool definitions (e.g. cache control). */
+	toolProviderOptions?: Record<string, Record<string, unknown>>
+	/** Provider-specific options for the system prompt (e.g. cache control).
+	 * Cache-aware providers use this to inject the system prompt as a cached
+	 * system message, since AI SDK v6 does not support providerOptions on the
+	 * `system` string parameter. */
+	systemProviderOptions?: Record<string, Record<string, unknown>>
 }
 
 export interface ApiHandler {
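
The two new metadata fields above are how the centralized cache logic reaches the providers. A minimal sketch of how a caller might populate them, assuming the universal options exported by src/api/transform/cache-breakpoints.ts later in this diff (the import paths and the taskId value are illustrative, not taken from this commit):

// Illustrative call site; paths depend on where the metadata is built.
import type { ApiHandlerCreateMessageMetadata } from "../../api"
import { UNIVERSAL_CACHE_OPTIONS } from "../../api/transform/cache-breakpoints"

// Stamp the same namespaced cache options onto the system prompt and the tool
// definitions; each provider reads only its own namespace (anthropic, bedrock)
// and ignores the rest.
const metadata: ApiHandlerCreateMessageMetadata = {
	taskId: "task-123",
	systemProviderOptions: UNIVERSAL_CACHE_OPTIONS,
	toolProviderOptions: UNIVERSAL_CACHE_OPTIONS,
}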

+ 26 - 5
src/api/providers/__tests__/anthropic.spec.ts

@@ -399,7 +399,7 @@ describe("AnthropicHandler", () => {
 			expect(endChunk).toBeDefined()
 		})
 
-		it("should pass system prompt via system param with systemProviderOptions for cache control", async () => {
+		it("should pass system prompt via system param when no systemProviderOptions", async () => {
 			setupStreamTextMock([{ type: "text-delta", text: "test" }])
 
 			const stream = handler.createMessage(systemPrompt, [
@@ -410,16 +410,37 @@ describe("AnthropicHandler", () => {
 				// Consume
 			}
 
-			// Verify streamText was called with system + systemProviderOptions (not as a message)
+			// Without systemProviderOptions, system prompt is passed via the system parameter
 			const callArgs = mockStreamText.mock.calls[0]![0]
 			expect(callArgs.system).toBe(systemPrompt)
-			expect(callArgs.systemProviderOptions).toEqual({
-				anthropic: { cacheControl: { type: "ephemeral" } },
-			})
 			// System prompt should NOT be in the messages array
 			const systemMessages = callArgs.messages.filter((m: any) => m.role === "system")
 			expect(systemMessages).toHaveLength(0)
 		})
+
+		it("should inject system prompt as cached system message when systemProviderOptions provided", async () => {
+			setupStreamTextMock([{ type: "text-delta", text: "test" }])
+
+			const cacheOpts = { anthropic: { cacheControl: { type: "ephemeral" } } }
+			const stream = handler.createMessage(
+				systemPrompt,
+				[{ role: "user", content: [{ type: "text" as const, text: "test" }] }],
+				{ taskId: "test-task", systemProviderOptions: cacheOpts },
+			)
+
+			for await (const _chunk of stream) {
+				// Consume
+			}
+
+			// With systemProviderOptions, system prompt is injected as messages[0]
+			const callArgs = mockStreamText.mock.calls[0]![0]
+			expect(callArgs.system).toBeUndefined()
+			// System prompt should be the first message with providerOptions
+			const systemMessages = callArgs.messages.filter((m: any) => m.role === "system")
+			expect(systemMessages).toHaveLength(1)
+			expect(systemMessages[0].content).toBe(systemPrompt)
+			expect(systemMessages[0].providerOptions).toEqual(cacheOpts)
+		})
 	})
 
 	describe("completePrompt", () => {

+ 1 - 5
src/api/providers/__tests__/minimax.spec.ts

@@ -338,16 +338,12 @@ describe("MiniMaxHandler", () => {
 
 			expect(mockMergeEnvironmentDetailsForMiniMax).toHaveBeenCalledWith(messages)
 			const callArgs = mockStreamText.mock.calls[0]?.[0]
+			// Cache control is now applied centrally in Task.ts, not per-provider
 			expect(callArgs.messages).toEqual(
 				expect.arrayContaining([
 					expect.objectContaining({
 						role: "user",
 						content: [{ type: "text", text: "Merged message" }],
-						providerOptions: {
-							anthropic: {
-								cacheControl: { type: "ephemeral" },
-							},
-						},
 					}),
 				]),
 			)
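
Per the comment above, per-message cache control moved out of the MiniMax handler. A minimal sketch of the central application, consistent with the applyCacheBreakpoints tests later in this diff (the call site is illustrative; the actual wiring in Task.ts is not part of this view):

import { applyCacheBreakpoints } from "../api/transform/cache-breakpoints"

// Illustrative conversation in AI SDK shape (role plus optional providerOptions).
const aiSdkMessages = [
	{ role: "user", content: "q1" },
	{ role: "assistant", content: "a1" },
	{ role: "user", content: "q2" },
	{ role: "assistant", content: "a2" },
	{ role: "user", content: "q3" },
]

// Stamps UNIVERSAL_CACHE_OPTIONS (anthropic.cacheControl + bedrock.cachePoint)
// on the last two non-assistant messages; assistant and system messages are
// never targeted.
applyCacheBreakpoints(aiSdkMessages)
// aiSdkMessages[2].providerOptions and aiSdkMessages[4].providerOptions now
// carry the universal options; aiSdkMessages[0] is left untouched.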

+ 18 - 59
src/api/providers/anthropic-vertex.ts

@@ -26,6 +26,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions, applySystemPromptCaching } from "../transform/cache-breakpoints"
 import { calculateApiCostAnthropic } from "../../shared/cost"
 
 import { DEFAULT_HEADERS } from "./constants"
@@ -96,6 +97,7 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleComple
 		// Convert tools to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build Anthropic provider options
 		const anthropicProviderOptions: Record<string, unknown> = {}
@@ -119,45 +121,18 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleComple
 			anthropicProviderOptions.disableParallelToolUse = true
 		}
 
-		/**
-		 * Vertex API has specific limitations for prompt caching:
-		 * 1. Maximum of 4 blocks can have cache_control
-		 * 2. Only text blocks can be cached (images and other content types cannot)
-		 * 3. Cache control can only be applied to user messages, not assistant messages
-		 *
-		 * Our caching strategy:
-		 * - Cache the system prompt (1 block)
-		 * - Cache the last text block of the second-to-last user message (1 block)
-		 * - Cache the last text block of the last user message (1 block)
-		 * This ensures we stay under the 4-block limit while maintaining effective caching
-		 * for the most relevant context.
-		 */
-		const cacheProviderOption = { anthropic: { cacheControl: { type: "ephemeral" as const } } }
-
-		const userMsgIndices = messages.reduce(
-			(acc, msg, index) => ("role" in msg && msg.role === "user" ? [...acc, index] : acc),
-			[] as number[],
+		// Breakpoint 1: System prompt caching — inject as cached system message
+		const effectiveSystemPrompt = applySystemPromptCaching(
+			systemPrompt,
+			aiSdkMessages,
+			metadata?.systemProviderOptions,
 		)
 
-		const targetIndices = new Set<number>()
-		const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
-		const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1
-
-		if (lastUserMsgIndex >= 0) targetIndices.add(lastUserMsgIndex)
-		if (secondLastUserMsgIndex >= 0) targetIndices.add(secondLastUserMsgIndex)
-
-		if (targetIndices.size > 0) {
-			this.applyCacheControlToAiSdkMessages(messages as ModelMessage[], targetIndices, cacheProviderOption)
-		}
-
 		// Build streamText request
 		// Cast providerOptions to any to bypass strict JSONObject typing — the AI SDK accepts the correct runtime values
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: this.provider(modelConfig.id),
-			system: systemPrompt,
-			...({
-				systemProviderOptions: { anthropic: { cacheControl: { type: "ephemeral" } } },
-			} as Record<string, unknown>),
+			system: effectiveSystemPrompt,
 			messages: aiSdkMessages,
 			temperature: modelConfig.temperature,
 			maxOutputTokens: modelConfig.maxTokens ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
@@ -216,12 +191,19 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleComple
 		const inputTokens = usage.inputTokens ?? 0
 		const outputTokens = usage.outputTokens ?? 0
 
-		// Extract cache metrics from Anthropic's providerMetadata
+		// Extract cache metrics from Anthropic's providerMetadata.
+		// In @ai-sdk/anthropic v3.0.38+, cacheReadInputTokens may only exist at
+		// usage.cache_read_input_tokens rather than the top-level property.
 		const anthropicMeta = providerMetadata?.anthropic as
-			| { cacheCreationInputTokens?: number; cacheReadInputTokens?: number }
+			| {
+					cacheCreationInputTokens?: number
+					cacheReadInputTokens?: number
+					usage?: { cache_read_input_tokens?: number }
+			  }
 			| undefined
 		const cacheWriteTokens = anthropicMeta?.cacheCreationInputTokens ?? 0
-		const cacheReadTokens = anthropicMeta?.cacheReadInputTokens ?? 0
+		const cacheReadTokens =
+			anthropicMeta?.cacheReadInputTokens ?? anthropicMeta?.usage?.cache_read_input_tokens ?? 0
 
 		const { totalCost } = calculateApiCostAnthropic(
 			info,
@@ -241,29 +223,6 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleComple
 		}
 	}
 
-	/**
-	 * Apply cacheControl providerOptions to the correct AI SDK messages by walking
-	 * the original Anthropic messages and converted AI SDK messages in parallel.
-	 *
-	 * convertToAiSdkMessages() can split a single Anthropic user message (containing
-	 * tool_results + text) into 2 AI SDK messages (tool role + user role). This method
-	 * accounts for that split so cache control lands on the right message.
-	 */
-	private applyCacheControlToAiSdkMessages(
-		aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
-		targetIndices: Set<number>,
-		cacheProviderOption: Record<string, Record<string, unknown>>,
-	): void {
-		for (const idx of targetIndices) {
-			if (idx >= 0 && idx < aiSdkMessages.length) {
-				aiSdkMessages[idx].providerOptions = {
-					...aiSdkMessages[idx].providerOptions,
-					...cacheProviderOption,
-				}
-			}
-		}
-	}
-
 	getModel() {
 		const modelId = this.options.apiModelId
 		let id = modelId && modelId in vertexModels ? (modelId as VertexModelId) : vertexDefaultModelId

+ 20 - 48
src/api/providers/anthropic.ts

@@ -24,6 +24,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions, applySystemPromptCaching } from "../transform/cache-breakpoints"
 import { calculateApiCostAnthropic } from "../../shared/cost"
 
 import { DEFAULT_HEADERS } from "./constants"
@@ -82,6 +83,7 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 		// Convert tools to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build Anthropic provider options
 		const anthropicProviderOptions: Record<string, unknown> = {}
@@ -105,34 +107,20 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 			anthropicProviderOptions.disableParallelToolUse = true
 		}
 
-		// Apply cache control to user messages
-		// Strategy: cache the last 2 user messages (write-to-cache + read-from-cache)
-		const cacheProviderOption = { anthropic: { cacheControl: { type: "ephemeral" as const } } }
-
-		const userMsgIndices = messages.reduce(
-			(acc, msg, index) => ("role" in msg && msg.role === "user" ? [...acc, index] : acc),
-			[] as number[],
+		// Breakpoint 1: System prompt caching — inject as cached system message
+		// AI SDK v6 does not support providerOptions on the system string parameter,
+		// so cache-aware providers convert it to a system message with providerOptions.
+		const effectiveSystemPrompt = applySystemPromptCaching(
+			systemPrompt,
+			aiSdkMessages,
+			metadata?.systemProviderOptions,
 		)
 
-		const targetIndices = new Set<number>()
-		const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
-		const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1
-
-		if (lastUserMsgIndex >= 0) targetIndices.add(lastUserMsgIndex)
-		if (secondLastUserMsgIndex >= 0) targetIndices.add(secondLastUserMsgIndex)
-
-		if (targetIndices.size > 0) {
-			this.applyCacheControlToAiSdkMessages(messages as ModelMessage[], targetIndices, cacheProviderOption)
-		}
-
 		// Build streamText request
 		// Cast providerOptions to any to bypass strict JSONObject typing — the AI SDK accepts the correct runtime values
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: this.provider(modelConfig.id),
-			system: systemPrompt,
-			...({
-				systemProviderOptions: { anthropic: { cacheControl: { type: "ephemeral" } } },
-			} as Record<string, unknown>),
+			system: effectiveSystemPrompt,
 			messages: aiSdkMessages,
 			temperature: modelConfig.temperature,
 			maxOutputTokens: modelConfig.maxTokens ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
@@ -191,12 +179,19 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 		const inputTokens = usage.inputTokens ?? 0
 		const outputTokens = usage.outputTokens ?? 0
 
-		// Extract cache metrics from Anthropic's providerMetadata
+		// Extract cache metrics from Anthropic's providerMetadata.
+		// In @ai-sdk/anthropic v3.0.38+, cacheReadInputTokens may only exist at
+		// usage.cache_read_input_tokens rather than the top-level property.
 		const anthropicMeta = providerMetadata?.anthropic as
-			| { cacheCreationInputTokens?: number; cacheReadInputTokens?: number }
+			| {
+					cacheCreationInputTokens?: number
+					cacheReadInputTokens?: number
+					usage?: { cache_read_input_tokens?: number }
+			  }
 			| undefined
 		const cacheWriteTokens = anthropicMeta?.cacheCreationInputTokens ?? 0
-		const cacheReadTokens = anthropicMeta?.cacheReadInputTokens ?? 0
+		const cacheReadTokens =
+			anthropicMeta?.cacheReadInputTokens ?? anthropicMeta?.usage?.cache_read_input_tokens ?? 0
 
 		const { totalCost } = calculateApiCostAnthropic(
 			info,
@@ -216,29 +211,6 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 		}
 	}
 
-	/**
-	 * Apply cacheControl providerOptions to the correct AI SDK messages by walking
-	 * the original Anthropic messages and converted AI SDK messages in parallel.
-	 *
-	 * convertToAiSdkMessages() can split a single Anthropic user message (containing
-	 * tool_results + text) into 2 AI SDK messages (tool role + user role). This method
-	 * accounts for that split so cache control lands on the right message.
-	 */
-	private applyCacheControlToAiSdkMessages(
-		aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
-		targetIndices: Set<number>,
-		cacheProviderOption: Record<string, Record<string, unknown>>,
-	): void {
-		for (const idx of targetIndices) {
-			if (idx >= 0 && idx < aiSdkMessages.length) {
-				aiSdkMessages[idx].providerOptions = {
-					...aiSdkMessages[idx].providerOptions,
-					...cacheProviderOption,
-				}
-			}
-		}
-	}
-
 	getModel() {
 		const modelId = this.options.apiModelId
 		let id = modelId && modelId in anthropicModels ? (modelId as AnthropicModelId) : anthropicDefaultModelId

+ 3 - 1
src/api/providers/azure.ts

@@ -13,6 +13,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -144,11 +145,12 @@ export class AzureHandler extends BaseProvider implements SingleCompletionHandle
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? AZURE_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/baseten.ts

@@ -13,6 +13,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -105,10 +106,11 @@ export class BasetenHandler extends BaseProvider implements SingleCompletionHand
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? BASETEN_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 27 - 82
src/api/providers/bedrock.ts

@@ -32,6 +32,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions, applySystemPromptCaching } from "../transform/cache-breakpoints"
 import { getModelParams } from "../transform/model-params"
 import { shouldUseReasoningBudget } from "../../shared/api"
 import { BaseProvider } from "./base-provider"
@@ -210,6 +211,7 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH
 		// Convert tools to AI SDK format
 		let openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 		const toolChoice = mapToolChoice(metadata?.tool_choice)
 
 		// Build provider options for reasoning, betas, etc.
@@ -251,65 +253,34 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH
 			}
 		}
 
-		// Prompt caching: use AI SDK's cachePoint mechanism
-		// The AI SDK's @ai-sdk/amazon-bedrock supports cachePoint in providerOptions per message.
-		//
-		// Strategy: Bedrock allows up to 4 cache checkpoints. We use them as:
-		//   1. System prompt (via systemProviderOptions below)
-		//   2-4. Up to 3 user messages in the conversation history
-		//
-		// For the message cache points, we target the last 2 user messages (matching
-		// Anthropic's strategy: write-to-cache + read-from-cache) PLUS an earlier "anchor"
-		// user message near the middle of the conversation. This anchor ensures the 20-block
-		// lookback window has a stable cache entry to hit, covering all assistant/tool messages
-		// between the anchor and the recent messages.
-		//
-		// We identify targets in the ORIGINAL Anthropic messages (before AI SDK conversion)
-		// because convertToAiSdkMessages() splits user messages containing tool_results into
-		// separate "tool" + "user" role messages, which would skew naive counting.
+		// Prompt caching: Bedrock cache annotations are only honored when prompt caching
+		// is enabled. Task.ts stamps universal cache options on messages and tool
+		// definitions, so the Bedrock-specific annotations are stripped below when disabled.
 		const usePromptCache = Boolean(this.options.awsUsePromptCache && this.supportsAwsPromptCache(modelConfig))
 
-		if (usePromptCache) {
-			const cachePointOption = { bedrock: { cachePoint: { type: "default" as const } } }
-
-			// Find all user message indices in the original (pre-conversion) message array.
-			const originalUserIndices = filteredMessages.reduce<number[]>(
-				(acc, msg, idx) => ("role" in msg && msg.role === "user" ? [...acc, idx] : acc),
-				[],
-			)
-
-			// Select up to 3 user messages for cache points (system prompt uses the 4th):
-			// - Last user message: write to cache for next request
-			// - Second-to-last user message: read from cache for current request
-			// - An "anchor" message earlier in the conversation for 20-block window coverage
-			const targetOriginalIndices = new Set<number>()
-			const numUserMsgs = originalUserIndices.length
-
-			if (numUserMsgs >= 1) {
-				// Always cache the last user message
-				targetOriginalIndices.add(originalUserIndices[numUserMsgs - 1])
-			}
-			if (numUserMsgs >= 2) {
-				// Cache the second-to-last user message
-				targetOriginalIndices.add(originalUserIndices[numUserMsgs - 2])
-			}
-			if (numUserMsgs >= 5) {
-				// Add an anchor cache point roughly in the first third of user messages.
-				// This ensures that the 20-block lookback from the second-to-last breakpoint
-				// can find a stable cache entry, covering all the assistant and tool messages
-				// in the middle of the conversation. We pick the user message at ~1/3 position.
-				const anchorIdx = Math.floor(numUserMsgs / 3)
-				// Only add if it's not already one of the last-2 targets
-				if (!targetOriginalIndices.has(originalUserIndices[anchorIdx])) {
-					targetOriginalIndices.add(originalUserIndices[anchorIdx])
+		// Breakpoint 1: System prompt caching — only when Bedrock prompt cache is enabled
+		const effectiveSystemPrompt = usePromptCache
+			? applySystemPromptCaching(systemPrompt, aiSdkMessages, metadata?.systemProviderOptions)
+			: systemPrompt || undefined
+
+		// When prompt caching is disabled, strip the Bedrock-specific cache annotations
+		// from the messages (tool definitions are handled the same way just below).
+		if (!usePromptCache) {
+			for (const msg of aiSdkMessages) {
+				if (msg.providerOptions?.bedrock) {
+					const { bedrock: _, ...rest } = msg.providerOptions
+					msg.providerOptions = Object.keys(rest).length > 0 ? rest : undefined
 				}
 			}
-
-			// Apply cachePoint to the correct AI SDK messages by walking both arrays in parallel.
-			// A single original user message with tool_results becomes [tool-role msg, user-role msg]
-			// in the AI SDK array, while a plain user message becomes [user-role msg].
-			if (targetOriginalIndices.size > 0) {
-				this.applyCachePointsToAiSdkMessages(aiSdkMessages, targetOriginalIndices, cachePointOption)
+			// Also strip cache annotations from tool definitions
+			if (aiSdkTools) {
+				for (const key of Object.keys(aiSdkTools)) {
+					const tool = aiSdkTools[key] as { providerOptions?: Record<string, Record<string, unknown>> }
+					if (tool.providerOptions?.bedrock) {
+						const { bedrock: _, ...rest } = tool.providerOptions
+						tool.providerOptions = Object.keys(rest).length > 0 ? rest : undefined
+					}
+				}
 			}
 		}
 
@@ -317,10 +288,7 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH
 		// Cast providerOptions to any to bypass strict JSONObject typing — the AI SDK accepts the correct runtime values
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: this.provider(modelConfig.id),
-			system: systemPrompt,
-			...(usePromptCache && {
-				systemProviderOptions: { bedrock: { cachePoint: { type: "default" } } } as Record<string, unknown>,
-			}),
+			system: effectiveSystemPrompt,
 			messages: aiSdkMessages,
 			temperature: modelConfig.temperature ?? (this.options.modelTemperature as number),
 			maxOutputTokens: modelConfig.maxTokens || (modelConfig.info.maxTokens as number),
@@ -706,29 +674,6 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH
 		)
 	}
 
-	/**
-	 * Apply cachePoint providerOptions to the correct AI SDK messages by walking
-	 * the original Anthropic messages and converted AI SDK messages in parallel.
-	 *
-	 * convertToAiSdkMessages() can split a single Anthropic user message (containing
-	 * tool_results + text) into 2 AI SDK messages (tool role + user role). This method
-	 * accounts for that split so cache points land on the right message.
-	 */
-	private applyCachePointsToAiSdkMessages(
-		aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
-		targetIndices: Set<number>,
-		cachePointOption: Record<string, Record<string, unknown>>,
-	): void {
-		for (const idx of targetIndices) {
-			if (idx >= 0 && idx < aiSdkMessages.length) {
-				aiSdkMessages[idx].providerOptions = {
-					...aiSdkMessages[idx].providerOptions,
-					...cachePointOption,
-				}
-			}
-		}
-	}
-
 	/************************************************************************************
 	 *
 	 *     AMAZON REGIONS

+ 3 - 1
src/api/providers/deepseek.ts

@@ -13,6 +13,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -122,11 +123,12 @@ export class DeepSeekHandler extends BaseProvider implements SingleCompletionHan
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? DEEP_SEEK_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/fireworks.ts

@@ -13,6 +13,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -122,11 +123,12 @@ export class FireworksHandler extends BaseProvider implements SingleCompletionHa
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? FIREWORKS_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/gemini.ts

@@ -21,6 +21,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { t } from "i18next"
 import type { ApiStream, ApiStreamUsageChunk, GroundingSource } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
@@ -103,6 +104,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		}
 
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build tool choice - use 'required' when allowedFunctionNames restricts available tools
 		const toolChoice =
@@ -113,7 +115,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: this.provider(modelId),
-			system: systemInstruction,
+			system: systemInstruction || undefined,
 			messages: aiSdkMessages,
 			temperature: temperatureConfig,
 			maxOutputTokens,

+ 3 - 1
src/api/providers/lm-studio.ts

@@ -20,6 +20,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream } from "../transform/stream"
 
 import { OpenAICompatibleHandler, OpenAICompatibleConfig } from "./openai-compatible"
@@ -68,10 +69,11 @@ export class LmStudioHandler extends OpenAICompatibleHandler implements SingleCo
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: model.temperature ?? this.config.temperature ?? LMSTUDIO_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 36
src/api/providers/minimax.ts

@@ -16,6 +16,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { calculateApiCostAnthropic } from "../../shared/cost"
 
 import { DEFAULT_HEADERS } from "./constants"
@@ -75,6 +76,7 @@ export class MiniMaxHandler extends BaseProvider implements SingleCompletionHand
 		const aiSdkMessages = mergedMessages as ModelMessage[]
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const anthropicProviderOptions: Record<string, unknown> = {}
 
@@ -89,29 +91,9 @@ export class MiniMaxHandler extends BaseProvider implements SingleCompletionHand
 			anthropicProviderOptions.disableParallelToolUse = true
 		}
 
-		const cacheProviderOption = { anthropic: { cacheControl: { type: "ephemeral" as const } } }
-		const userMsgIndices = mergedMessages.reduce(
-			(acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
-			[] as number[],
-		)
-
-		const targetIndices = new Set<number>()
-		const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
-		const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1
-
-		if (lastUserMsgIndex >= 0) targetIndices.add(lastUserMsgIndex)
-		if (secondLastUserMsgIndex >= 0) targetIndices.add(secondLastUserMsgIndex)
-
-		if (targetIndices.size > 0) {
-			this.applyCacheControlToAiSdkMessages(aiSdkMessages, targetIndices, cacheProviderOption)
-		}
-
 		const requestOptions = {
 			model: this.client(modelConfig.id),
-			system: systemPrompt,
-			...({
-				systemProviderOptions: { anthropic: { cacheControl: { type: "ephemeral" } } },
-			} as Record<string, unknown>),
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: modelParams.temperature,
 			maxOutputTokens: modelParams.maxTokens ?? modelConfig.info.maxTokens,
@@ -187,21 +169,6 @@ export class MiniMaxHandler extends BaseProvider implements SingleCompletionHand
 		}
 	}
 
-	private applyCacheControlToAiSdkMessages(
-		aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
-		targetIndices: Set<number>,
-		cacheProviderOption: Record<string, Record<string, unknown>>,
-	): void {
-		for (const idx of targetIndices) {
-			if (idx >= 0 && idx < aiSdkMessages.length) {
-				aiSdkMessages[idx].providerOptions = {
-					...aiSdkMessages[idx].providerOptions,
-					...cacheProviderOption,
-				}
-			}
-		}
-	}
-
 	getModel() {
 		const modelId = this.options.apiModelId
 

+ 3 - 1
src/api/providers/mistral.ts

@@ -13,6 +13,7 @@ import {
 import type { ApiHandlerOptions } from "../../shared/api"
 
 import { convertToAiSdkMessages, convertToolsForAiSdk, consumeAiSdkStream, handleAiSdkError } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -149,12 +150,13 @@ export class MistralHandler extends BaseProvider implements SingleCompletionHand
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		// Use MISTRAL_DEFAULT_TEMPERATURE (1) as fallback to match original behavior
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? MISTRAL_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/native-ollama.ts

@@ -14,6 +14,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream } from "../transform/stream"
 
 import { BaseProvider } from "./base-provider"
@@ -99,12 +100,13 @@ export class NativeOllamaHandler extends BaseProvider implements SingleCompletio
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const providerOptions = this.buildProviderOptions(useR1Format)
 
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature,
 			tools: aiSdkTools,

+ 16 - 3
src/api/providers/openai-compatible.ts

@@ -20,6 +20,7 @@ import {
 	handleAiSdkError,
 } from "../transform/ai-sdk"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 
 import { DEFAULT_HEADERS } from "./constants"
 import { BaseProvider } from "./base-provider"
@@ -95,6 +96,14 @@ export abstract class OpenAICompatibleHandler extends BaseProvider implements Si
 	protected processUsageMetrics(usage: {
 		inputTokens?: number
 		outputTokens?: number
+		inputTokenDetails?: {
+			cacheReadTokens?: number
+			cacheWriteTokens?: number
+			noCacheTokens?: number
+		}
+		outputTokenDetails?: {
+			reasoningTokens?: number
+		}
 		details?: {
 			cachedInputTokens?: number
 			reasoningTokens?: number
@@ -105,8 +114,11 @@ export abstract class OpenAICompatibleHandler extends BaseProvider implements Si
 			type: "usage",
 			inputTokens: usage.inputTokens || 0,
 			outputTokens: usage.outputTokens || 0,
-			cacheReadTokens: usage.details?.cachedInputTokens,
-			reasoningTokens: usage.details?.reasoningTokens,
+			// P1: AI SDK v6 standard (LanguageModelInputTokenDetails)
+			// P2: Legacy AI SDK standard (usage.details)
+			cacheReadTokens: usage.inputTokenDetails?.cacheReadTokens ?? usage.details?.cachedInputTokens,
+			cacheWriteTokens: usage.inputTokenDetails?.cacheWriteTokens,
+			reasoningTokens: usage.outputTokenDetails?.reasoningTokens ?? usage.details?.reasoningTokens,
 		}
 	}
 
@@ -137,11 +149,12 @@ export abstract class OpenAICompatibleHandler extends BaseProvider implements Si
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: model.temperature ?? this.config.temperature ?? 0,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 2 - 0
src/api/providers/openai-native.ts

@@ -26,6 +26,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -434,6 +435,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const taskId = metadata?.taskId
 		const userAgent = `roo-code/${Package.version} (${os.platform()} ${os.release()}; ${os.arch()}) node/${process.version.slice(1)}`

+ 7 - 3
src/api/providers/openai.ts

@@ -24,6 +24,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -110,6 +111,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		let effectiveSystemPrompt: string | undefined = systemPrompt
 		let effectiveTemperature: number | undefined =
@@ -141,7 +143,9 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 
 		if (deepseekReasoner) {
 			effectiveSystemPrompt = undefined
-			aiSdkMessages.unshift({ role: "user", content: systemPrompt })
+			if (systemPrompt) {
+				aiSdkMessages.unshift({ role: "user", content: systemPrompt })
+			}
 		}
 
 		if (this.options.openAiStreamingEnabled ?? true) {
@@ -181,7 +185,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 	): ApiStream {
 		const result = streamText({
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages,
 			temperature,
 			maxOutputTokens: this.getMaxOutputTokens(),
@@ -253,7 +257,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 		try {
 			const { text, toolCalls, usage, providerMetadata } = await generateText({
 				model: languageModel,
-				system: systemPrompt,
+				system: systemPrompt || undefined,
 				messages,
 				temperature,
 				maxOutputTokens: this.getMaxOutputTokens(),

+ 11 - 1
src/api/providers/openrouter.ts

@@ -23,6 +23,7 @@ import {
 	processAiSdkStreamPart,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions, applySystemPromptCaching } from "../transform/cache-breakpoints"
 
 import { BaseProvider } from "./base-provider"
 import { getModels, getModelsFromCache } from "./fetchers/modelCache"
@@ -153,6 +154,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		const openrouter = this.createOpenRouterProvider({ reasoning, headers })
 
 		const tools = convertToolsForAiSdk(metadata?.tools)
+		applyToolCacheOptions(tools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const providerOptions:
 			| {
@@ -174,10 +176,18 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 					}
 				: undefined
 
+		// Breakpoint 1: System prompt caching — inject as cached system message
+		// OpenRouter routes to Anthropic models that benefit from cache annotations
+		const effectiveSystemPrompt = applySystemPromptCaching(
+			systemPrompt,
+			aiSdkMessages,
+			metadata?.systemProviderOptions,
+		)
+
 		try {
 			const result = streamText({
 				model: openrouter.chat(modelId),
-				system: systemPrompt,
+				system: effectiveSystemPrompt,
 				messages: aiSdkMessages,
 				maxOutputTokens: maxTokens && maxTokens > 0 ? maxTokens : undefined,
 				temperature,

+ 11 - 1
src/api/providers/requesty.ts

@@ -14,6 +14,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions, applySystemPromptCaching } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -183,12 +184,21 @@ export class RequestyHandler extends BaseProvider implements SingleCompletionHan
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const requestyOptions = this.getRequestyProviderOptions(metadata)
 
+		// Breakpoint 1: System prompt caching — inject as cached system message
+		// Requesty routes to Anthropic models that benefit from cache annotations
+		const effectiveSystemPrompt = applySystemPromptCaching(
+			systemPrompt,
+			aiSdkMessages,
+			metadata?.systemProviderOptions,
+		)
+
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: effectiveSystemPrompt,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? 0,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/roo.ts

@@ -17,6 +17,7 @@ import {
 	mapToolChoice,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import type { RooReasoningParams } from "../transform/reasoning"
 import { getRooReasoning } from "../transform/reasoning"
 
@@ -122,13 +123,14 @@ export class RooHandler extends BaseProvider implements SingleCompletionHandler
 		// RooMessage[] is already AI SDK-compatible, cast directly
 		const aiSdkMessages = messages as ModelMessage[]
 		const tools = convertToolsForAiSdk(this.convertToolsForOpenAI(metadata?.tools))
+		applyToolCacheOptions(tools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		let lastStreamError: string | undefined
 
 		try {
 			const result = streamText({
 				model: provider(modelId),
-				system: systemPrompt,
+				system: systemPrompt || undefined,
 				messages: aiSdkMessages,
 				maxOutputTokens: maxTokens && maxTokens > 0 ? maxTokens : undefined,
 				temperature,

+ 3 - 1
src/api/providers/sambanova.ts

@@ -14,6 +14,7 @@ import {
 	handleAiSdkError,
 	flattenAiSdkMessagesToStringContent,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -125,11 +126,12 @@ export class SambaNovaHandler extends BaseProvider implements SingleCompletionHa
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? SAMBANOVA_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/vercel-ai-gateway.ts

@@ -19,6 +19,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 
 import { DEFAULT_HEADERS } from "./constants"
@@ -120,6 +121,7 @@ export class VercelAiGatewayHandler extends BaseProvider implements SingleComple
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const temperature = this.supportsTemperature(modelId)
 			? (this.options.modelTemperature ?? VERCEL_AI_GATEWAY_DEFAULT_TEMPERATURE)
@@ -127,7 +129,7 @@ export class VercelAiGatewayHandler extends BaseProvider implements SingleComple
 
 		const result = streamText({
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature,
 			maxOutputTokens: info.maxTokens ?? undefined,

+ 3 - 1
src/api/providers/vertex.ts

@@ -21,6 +21,7 @@ import {
 	handleAiSdkError,
 	yieldResponseMessage,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { t } from "i18next"
 import type { ApiStream, ApiStreamUsageChunk, GroundingSource } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
@@ -117,6 +118,7 @@ export class VertexHandler extends BaseProvider implements SingleCompletionHandl
 		}
 
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build tool choice - use 'required' when allowedFunctionNames restricts available tools
 		const toolChoice =
@@ -127,7 +129,7 @@ export class VertexHandler extends BaseProvider implements SingleCompletionHandl
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: this.provider(modelId),
-			system: systemInstruction,
+			system: systemInstruction || undefined,
 			messages: aiSdkMessages,
 			temperature: temperatureConfig,
 			maxOutputTokens,

+ 3 - 1
src/api/providers/xai.ts

@@ -13,6 +13,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -131,11 +132,12 @@ export class XAIHandler extends BaseProvider implements SingleCompletionHandler
 		// Convert tools to OpenAI format first, then to AI SDK format
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		// Build the request options
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? XAI_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 3 - 1
src/api/providers/zai.ts

@@ -21,6 +21,7 @@ import {
 	mapToolChoice,
 	handleAiSdkError,
 } from "../transform/ai-sdk"
+import { applyToolCacheOptions } from "../transform/cache-breakpoints"
 import { ApiStream } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
@@ -102,10 +103,11 @@ export class ZAiHandler extends BaseProvider implements SingleCompletionHandler
 
 		const openAiTools = this.convertToolsForOpenAI(metadata?.tools)
 		const aiSdkTools = convertToolsForAiSdk(openAiTools) as ToolSet | undefined
+		applyToolCacheOptions(aiSdkTools as Parameters<typeof applyToolCacheOptions>[0], metadata?.toolProviderOptions)
 
 		const requestOptions: Parameters<typeof streamText>[0] = {
 			model: languageModel,
-			system: systemPrompt,
+			system: systemPrompt || undefined,
 			messages: aiSdkMessages,
 			temperature: this.options.modelTemperature ?? temperature ?? ZAI_DEFAULT_TEMPERATURE,
 			maxOutputTokens: this.getMaxOutputTokens(),

+ 274 - 0
src/api/transform/__tests__/cache-breakpoints.spec.ts

@@ -0,0 +1,274 @@
+import {
+	applyCacheBreakpoints,
+	applyToolCacheOptions,
+	applySystemPromptCaching,
+	UNIVERSAL_CACHE_OPTIONS,
+} from "../cache-breakpoints"
+
+type TestMessage = { role: string; providerOptions?: Record<string, Record<string, unknown>> }
+
+describe("UNIVERSAL_CACHE_OPTIONS", () => {
+	it("includes anthropic namespace with ephemeral cacheControl", () => {
+		expect(UNIVERSAL_CACHE_OPTIONS.anthropic).toEqual({ cacheControl: { type: "ephemeral" } })
+	})
+
+	it("includes bedrock namespace with default cachePoint", () => {
+		expect(UNIVERSAL_CACHE_OPTIONS.bedrock).toEqual({ cachePoint: { type: "default" } })
+	})
+})
+
+describe("applyCacheBreakpoints", () => {
+	it("is a no-op for empty message array", () => {
+		const messages: TestMessage[] = []
+		applyCacheBreakpoints(messages)
+		expect(messages).toEqual([])
+	})
+
+	it("is a no-op when all messages are assistant or system", () => {
+		const messages: TestMessage[] = [{ role: "system" }, { role: "assistant" }, { role: "assistant" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toBeUndefined()
+		expect(messages[1].providerOptions).toBeUndefined()
+		expect(messages[2].providerOptions).toBeUndefined()
+	})
+
+	it("places 1 breakpoint on a single user message", () => {
+		const messages: TestMessage[] = [{ role: "user" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("places 1 breakpoint on a single tool message", () => {
+		const messages: TestMessage[] = [{ role: "tool" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("places 2 breakpoints on 2 user messages", () => {
+		const messages: TestMessage[] = [{ role: "user" }, { role: "user" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[1].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("places 2 breakpoints on 2 tool messages", () => {
+		const messages: TestMessage[] = [{ role: "tool" }, { role: "tool" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[1].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("targets last 2 non-assistant messages in a mixed conversation", () => {
+		const messages: TestMessage[] = [
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "tool" },
+		]
+		applyCacheBreakpoints(messages)
+		// Last 2 non-assistant: index 2 (user) and index 4 (tool)
+		expect(messages[0].providerOptions).toBeUndefined()
+		expect(messages[1].providerOptions).toBeUndefined()
+		expect(messages[2].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[3].providerOptions).toBeUndefined()
+		expect(messages[4].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("targets indices 3 and 5 in [user, assistant, tool, user, assistant, tool]", () => {
+		const messages: TestMessage[] = [
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "tool" },
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "tool" },
+		]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toBeUndefined()
+		expect(messages[1].providerOptions).toBeUndefined()
+		expect(messages[2].providerOptions).toBeUndefined()
+		expect(messages[3].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[4].providerOptions).toBeUndefined()
+		expect(messages[5].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("never targets system messages", () => {
+		const messages: TestMessage[] = [{ role: "system" }, { role: "user" }, { role: "assistant" }, { role: "user" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toBeUndefined()
+		expect(messages[1].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[3].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("never targets assistant messages", () => {
+		const messages: TestMessage[] = [
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "assistant" },
+			{ role: "user" },
+		]
+		applyCacheBreakpoints(messages)
+		expect(messages[1].providerOptions).toBeUndefined()
+		expect(messages[2].providerOptions).toBeUndefined()
+	})
+
+	it("preserves existing providerOptions via spread", () => {
+		const messages: TestMessage[] = [
+			{
+				role: "user",
+				providerOptions: {
+					openai: { customField: "keep-me" },
+				},
+			},
+		]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toEqual({
+			openai: { customField: "keep-me" },
+			...UNIVERSAL_CACHE_OPTIONS,
+		})
+	})
+
+	it("adds anchor breakpoint at ~1/3 with useAnchor and enough messages", () => {
+		// 6 non-assistant messages (indices 0-5 in nonAssistantIndices)
+		// Anchor at floor(6/3) = index 2 in nonAssistantIndices -> messages index 4
+		// Last 2: indices 10 and 8
+		const messages: TestMessage[] = [
+			{ role: "user" }, // 0 - nonAssistant[0]
+			{ role: "assistant" }, // 1
+			{ role: "user" }, // 2 - nonAssistant[1]
+			{ role: "assistant" }, // 3
+			{ role: "user" }, // 4 - nonAssistant[2] <- anchor (floor(6/3)=2)
+			{ role: "assistant" }, // 5
+			{ role: "user" }, // 6 - nonAssistant[3]
+			{ role: "assistant" }, // 7
+			{ role: "user" }, // 8 - nonAssistant[4] <- last 2
+			{ role: "assistant" }, // 9
+			{ role: "user" }, // 10 - nonAssistant[5] <- last 2
+		]
+		applyCacheBreakpoints(messages, { useAnchor: true })
+
+		// Should have 3 breakpoints: indices 4, 8, 10
+		expect(messages[4].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[8].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[10].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+
+		// Others should NOT have breakpoints
+		expect(messages[0].providerOptions).toBeUndefined()
+		expect(messages[2].providerOptions).toBeUndefined()
+		expect(messages[6].providerOptions).toBeUndefined()
+	})
+
+	it("does not add anchor when below anchorThreshold", () => {
+		const messages: TestMessage[] = [
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "user" },
+			{ role: "assistant" },
+			{ role: "user" },
+		]
+		// 3 non-assistant messages, below default threshold of 5
+		applyCacheBreakpoints(messages, { useAnchor: true })
+
+		// Last 2 only: indices 2 and 4
+		expect(messages[0].providerOptions).toBeUndefined()
+		expect(messages[2].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+		expect(messages[4].providerOptions).toEqual(UNIVERSAL_CACHE_OPTIONS)
+	})
+
+	it("universal options include both anthropic and bedrock namespaces", () => {
+		const messages: TestMessage[] = [{ role: "user" }]
+		applyCacheBreakpoints(messages)
+		expect(messages[0].providerOptions).toHaveProperty("anthropic")
+		expect(messages[0].providerOptions).toHaveProperty("bedrock")
+	})
+})
+
+describe("applyToolCacheOptions", () => {
+	it("should apply cache options only to the last tool to conserve breakpoints", () => {
+		const tools: Record<
+			string,
+			{ providerOptions?: Record<string, Record<string, unknown>>; [key: string]: unknown }
+		> = {
+			tool1: { description: "test", parameters: {} },
+			tool2: { description: "test2", parameters: {}, providerOptions: { existing: { key: "value" } } },
+		}
+		const cacheOptions = { anthropic: { cacheControl: { type: "ephemeral" } } }
+		applyToolCacheOptions(tools, cacheOptions)
+		// Only the last tool (tool2) should receive cache options
+		expect(tools.tool1.providerOptions).toBeUndefined()
+		expect(tools.tool2.providerOptions).toEqual({
+			existing: { key: "value" },
+			anthropic: { cacheControl: { type: "ephemeral" } },
+		})
+	})
+
+	it("should handle undefined tools", () => {
+		expect(() =>
+			applyToolCacheOptions(undefined, { anthropic: { cacheControl: { type: "ephemeral" } } }),
+		).not.toThrow()
+	})
+
+	it("should handle undefined cacheOptions", () => {
+		const tools: Record<
+			string,
+			{ providerOptions?: Record<string, Record<string, unknown>>; [key: string]: unknown }
+		> = {
+			tool1: { description: "test", parameters: {} },
+		}
+		applyToolCacheOptions(tools, undefined)
+		expect(tools.tool1.providerOptions).toBeUndefined()
+	})
+
+	it("should handle empty tools object", () => {
+		const tools: Record<
+			string,
+			{ providerOptions?: Record<string, Record<string, unknown>>; [key: string]: unknown }
+		> = {}
+		applyToolCacheOptions(tools, { anthropic: { cacheControl: { type: "ephemeral" } } })
+		expect(tools).toEqual({})
+	})
+})
+
+describe("applySystemPromptCaching", () => {
+	it("injects system prompt as cached system message and returns undefined", () => {
+		const messages: TestMessage[] = [{ role: "user" }]
+		const result = applySystemPromptCaching("You are helpful", messages, UNIVERSAL_CACHE_OPTIONS)
+		expect(result).toBeUndefined()
+		expect(messages).toHaveLength(2)
+		expect(messages[0]).toEqual({
+			role: "system",
+			content: "You are helpful",
+			providerOptions: UNIVERSAL_CACHE_OPTIONS,
+		})
+	})
+
+	it("returns undefined (no system prompt) when systemPrompt is empty string", () => {
+		const messages: TestMessage[] = [{ role: "user" }]
+		const result = applySystemPromptCaching("", messages, UNIVERSAL_CACHE_OPTIONS)
+		expect(result).toBeUndefined()
+		expect(messages).toHaveLength(1) // no message injected
+	})
+
+	it("returns undefined when systemPrompt is undefined", () => {
+		const messages: TestMessage[] = [{ role: "user" }]
+		const result = applySystemPromptCaching(undefined, messages, UNIVERSAL_CACHE_OPTIONS)
+		expect(result).toBeUndefined()
+		expect(messages).toHaveLength(1) // no message injected
+	})
+
+	it("returns systemPrompt unchanged when cacheOptions is undefined", () => {
+		const messages: TestMessage[] = [{ role: "user" }]
+		const result = applySystemPromptCaching("You are helpful", messages, undefined)
+		expect(result).toBe("You are helpful")
+		expect(messages).toHaveLength(1) // no message injected
+	})
+
+	it("prepends system message before existing messages", () => {
+		const messages: TestMessage[] = [{ role: "user" }, { role: "assistant" }, { role: "user" }]
+		applySystemPromptCaching("System prompt", messages, UNIVERSAL_CACHE_OPTIONS)
+		expect(messages).toHaveLength(4)
+		expect(messages[0].role).toBe("system")
+		expect(messages[1].role).toBe("user")
+	})
+})
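Taken together, the tests above pin down the shape of a stamped message. A minimal standalone sketch of the default two-breakpoint placement and the namespacing guarantee (the import path and `Msg` type are illustrative, not code from this PR):

import { applyCacheBreakpoints } from "./cache-breakpoints" // path is illustrative

type Msg = { role: string; providerOptions?: Record<string, Record<string, unknown>> }

const messages: Msg[] = [
	{ role: "user" },      // stamped (one of the last 2 non-assistant messages)
	{ role: "assistant" }, // assistant messages are never stamped
	{ role: "user" },      // stamped
]
applyCacheBreakpoints(messages)

// Both namespaces land on each stamped message; the Anthropic provider reads only
// `anthropic.cacheControl`, Bedrock reads only `bedrock.cachePoint`, and any other
// provider ignores the object entirely.
console.log(messages[2].providerOptions)
// => { anthropic: { cacheControl: { type: "ephemeral" } }, bedrock: { cachePoint: { type: "default" } } }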

+ 126 - 0
src/api/transform/cache-breakpoints.ts

@@ -0,0 +1,126 @@
+/**
+ * Universal cache breakpoint options — a single object containing ALL provider namespaces.
+ * AI SDK's `providerOptions` are namespaced: each provider ignores keys
+ * that don't match its namespace, so it's safe to include all of them.
+ */
+export const UNIVERSAL_CACHE_OPTIONS: Record<string, Record<string, unknown>> = {
+	anthropic: { cacheControl: { type: "ephemeral" } },
+	bedrock: { cachePoint: { type: "default" } },
+}
+
+/**
+ * Optional targeting configuration for cache breakpoint placement.
+ */
+export interface CacheBreakpointTargeting {
+	/** Maximum number of message breakpoints to place. Default: 2 */
+	maxBreakpoints?: number
+	/** Whether to add an anchor breakpoint at ~1/3 through the conversation. Default: false */
+	useAnchor?: boolean
+	/** Minimum number of non-assistant messages before placing an anchor. Default: 5 */
+	anchorThreshold?: number
+}
+
+/**
+ * Apply cache breakpoints to AI SDK messages with ALL provider namespaces.
+ *
+ * 4-breakpoint strategy:
+ *   1. System prompt — injected as the first message in messages[] via `applySystemPromptCaching()`
+ *   2. Tool definitions — stamped by providers via `applyToolCacheOptions()` using the `toolProviderOptions` metadata
+ *   3-4. Last 2 non-assistant messages — this function handles these
+ *
+ * @param messages - The AI SDK message array (mutated in place)
+ * @param targeting - Optional targeting options (defaults: 2 breakpoints, no anchor)
+ */
+export function applyCacheBreakpoints(
+	messages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
+	targeting: CacheBreakpointTargeting = {},
+): void {
+	const { maxBreakpoints = 2, useAnchor = false, anchorThreshold = 5 } = targeting
+
+	// 1. Collect non-assistant message indices (user | tool roles)
+	const nonAssistantIndices: number[] = []
+	for (let i = 0; i < messages.length; i++) {
+		if (messages[i].role !== "assistant" && messages[i].role !== "system") {
+			nonAssistantIndices.push(i)
+		}
+	}
+
+	if (nonAssistantIndices.length === 0) return
+
+	// 2. Target last N non-assistant messages
+	const targetIndices = new Set<number>()
+	for (let j = 0; j < maxBreakpoints && j < nonAssistantIndices.length; j++) {
+		targetIndices.add(nonAssistantIndices[nonAssistantIndices.length - 1 - j])
+	}
+
+	// 3. Optional anchor at ~1/3 point
+	if (useAnchor && nonAssistantIndices.length >= anchorThreshold) {
+		const anchorIdx = Math.floor(nonAssistantIndices.length / 3)
+		targetIndices.add(nonAssistantIndices[anchorIdx])
+	}
+
+	// 4. Apply UNIVERSAL cache options to targeted messages
+	for (const idx of targetIndices) {
+		if (idx >= 0 && idx < messages.length) {
+			messages[idx].providerOptions = {
+				...messages[idx].providerOptions,
+				...UNIVERSAL_CACHE_OPTIONS,
+			}
+		}
+	}
+}
+
+/**
+ * Apply system prompt caching by injecting the system prompt as a cached
+ * system message at the front of the messages array.
+ *
+ * AI SDK v6 does not support `providerOptions` on the `system` string
+ * parameter. Cache-aware providers call this helper to convert the system
+ * prompt into a system message with `providerOptions` for cache control.
+ *
+ * Returns the effective system prompt to pass to `streamText()`:
+ *   - `undefined` when caching was applied (prompt is now in messages[0]) or the prompt is empty
+ *   - the original `systemPrompt` when no caching options were provided
+ *
+ * @param systemPrompt - The system prompt string
+ * @param messages - The AI SDK message array (mutated in place)
+ * @param cacheOptions - Provider-specific cache options (e.g. UNIVERSAL_CACHE_OPTIONS)
+ */
+export function applySystemPromptCaching(
+	systemPrompt: string | undefined,
+	messages: { role: string; content?: unknown; providerOptions?: Record<string, Record<string, unknown>> }[],
+	cacheOptions: Record<string, Record<string, unknown>> | undefined,
+): string | undefined {
+	if (!systemPrompt || !cacheOptions) {
+		return systemPrompt || undefined
+	}
+
+	messages.unshift({
+		role: "system",
+		content: systemPrompt,
+		providerOptions: cacheOptions,
+	})
+
+	// Tell the caller not to also pass the system prompt via the `system:` parameter
+	return undefined
+}
+
+/**
+ * Apply provider-specific cache options to AI SDK tool definitions.
+ * Breakpoint 2 of 4: tool definitions.
+ */
+export function applyToolCacheOptions(
+	tools:
+		| Record<string, { providerOptions?: Record<string, Record<string, unknown>>; [key: string]: unknown }>
+		| undefined,
+	cacheOptions: Record<string, Record<string, unknown>> | undefined,
+): void {
+	if (!tools || !cacheOptions) return
+	const keys = Object.keys(tools)
+	if (keys.length === 0) return
+	// Only stamp the LAST tool to conserve cache breakpoints (max 4 shared across the
+	// system prompt, tools, and messages). Stamping every tool wastes breakpoints — the provider
+	// silently drops all but the first few.
+	const lastKey = keys[keys.length - 1]
+	tools[lastKey].providerOptions = { ...tools[lastKey].providerOptions, ...cacheOptions }
+}
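The Task.ts hunk below only supplies the universal options through the new `systemProviderOptions` and `toolProviderOptions` metadata fields; the provider side of the hand-off is not shown in this excerpt. A rough sketch of how a cache-aware handler might wire the two helpers together before calling `streamText()` (the tool name, prompt text, and import path are illustrative, and a real handler would read the options from the metadata rather than using the constant directly):

import {
	applySystemPromptCaching,
	applyToolCacheOptions,
	UNIVERSAL_CACHE_OPTIONS,
} from "./cache-breakpoints" // path is illustrative

type Msg = { role: string; content?: unknown; providerOptions?: Record<string, Record<string, unknown>> }
type Tools = Record<string, { providerOptions?: Record<string, Record<string, unknown>>; [key: string]: unknown }>

const messages: Msg[] = [{ role: "user", content: "hi" }]
const tools: Tools = { read_file: { description: "Read a file", parameters: {} } } // hypothetical tool

// Breakpoint 1: the system prompt becomes a cached system message at messages[0];
// the return value is what the handler would pass as streamText's `system` option.
const system = applySystemPromptCaching("You are helpful", messages, UNIVERSAL_CACHE_OPTIONS)
console.log(system)           // undefined: the prompt must not also go through `system:`
console.log(messages[0].role) // "system", with providerOptions carrying the cache options

// Breakpoint 2: only the last tool definition is stamped.
applyToolCacheOptions(tools, UNIVERSAL_CACHE_OPTIONS)
console.log(tools.read_file.providerOptions)
// => { anthropic: { cacheControl: { type: "ephemeral" } }, bedrock: { cachePoint: { type: "default" } } }

// Breakpoints 3-4 are applied centrally before the handler is called (see the Task.ts hunk below).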

+ 10 - 0
src/core/task/Task.ts

@@ -63,6 +63,7 @@ import { ApiHandler, ApiHandlerCreateMessageMetadata, buildApiHandler } from "..
 import type { AssistantModelMessage } from "ai"
 import { ApiStream, GroundingSource } from "../../api/transform/stream"
 import { maybeRemoveImageBlocks } from "../../api/transform/image-cleaning"
+import { applyCacheBreakpoints, UNIVERSAL_CACHE_OPTIONS } from "../../api/transform/cache-breakpoints"
 
 // shared
 import { findLastIndex } from "../../shared/array"
@@ -4347,6 +4348,9 @@ export class Task extends EventEmitter<TaskEvents> implements TaskLike {
 		const messagesWithoutImages = maybeRemoveImageBlocks(mergedForApi, this.api)
 		const cleanConversationHistory = this.buildCleanConversationHistory(messagesWithoutImages)
 
+		// Breakpoints 3-4: Apply cache breakpoints to the last 2 non-assistant messages
+		applyCacheBreakpoints(cleanConversationHistory.filter(isRooRoleMessage))
+
 		// Check auto-approval limits
 		const approvalResult = await this.autoApprovalHandler.checkAutoApprovalLimits(
 			state,
@@ -4403,6 +4407,12 @@ export class Task extends EventEmitter<TaskEvents> implements TaskLike {
 			mode: mode,
 			taskId: this.taskId,
 			suppressPreviousResponseId: this.skipPrevResponseIdOnce,
+			toolProviderOptions: UNIVERSAL_CACHE_OPTIONS,
+			// Breakpoint 1: System prompt caching — cache-aware providers use this
+			// to inject the system prompt as a cached system message via
+			// applySystemPromptCaching(), since AI SDK v6 does not support
+			// providerOptions on the `system` string parameter.
+			systemProviderOptions: UNIVERSAL_CACHE_OPTIONS,
 			// Include tools whenever they are present.
 			...(shouldIncludeTools
 				? {