
Merge pull request #1312 from RooVetGit/count_tokens

Infrastructure to support calling token count APIs, starting with Anthropic
Matt Rubens 10 months ago
parent
commit
773e5560db

+ 10 - 0
src/api/index.ts

@@ -27,6 +27,16 @@ export interface SingleCompletionHandler {
 export interface ApiHandler {
 	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream
 	getModel(): { id: string; info: ModelInfo }
+
+	/**
+	 * Counts tokens for content blocks
+	 * All providers extend BaseProvider which provides a default tiktoken implementation,
+	 * but they can override this to use their native token counting endpoints
+	 *
+	 * @param content The content to count tokens for
+	 * @returns A promise resolving to the token count
+	 */
+	countTokens(content: Array<Anthropic.Messages.ContentBlockParam>): Promise<number>
 }
 
 export function buildApiHandler(configuration: ApiConfiguration): ApiHandler {
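
To illustrate the new interface method, here is a minimal usage sketch (the helper function and import paths are illustrative, not part of this PR): any handler returned by buildApiHandler now exposes countTokens, with Anthropic using its native endpoint and every other provider falling back to the tiktoken default inherited from BaseProvider.

import { Anthropic } from "@anthropic-ai/sdk"
import type { ApiConfiguration } from "../src/shared/api" // assumed export location
import { buildApiHandler } from "../src/api" // illustrative path

// Illustrative helper: count tokens for a prompt with whichever provider
// the configuration selects.
async function countPromptTokens(configuration: ApiConfiguration): Promise<number> {
	const handler = buildApiHandler(configuration)
	const content: Array<Anthropic.Messages.ContentBlockParam> = [
		{ type: "text", text: "How many tokens is this?" },
	]
	// AnthropicHandler overrides countTokens with its native endpoint; other
	// providers use the tiktoken-based default from BaseProvider.
	return handler.countTokens(content)
}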

+ 35 - 3
src/api/providers/anthropic.ts

@@ -9,16 +9,17 @@ import {
 	ModelInfo,
 } from "../../shared/api"
 import { ApiStream } from "../transform/stream"
+import { BaseProvider } from "./base-provider"
 import { ANTHROPIC_DEFAULT_MAX_TOKENS } from "./constants"
-import { ApiHandler, SingleCompletionHandler, getModelParams } from "../index"
+import { SingleCompletionHandler, getModelParams } from "../index"
 
-export class AnthropicHandler implements ApiHandler, SingleCompletionHandler {
+export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 	private options: ApiHandlerOptions
 	private client: Anthropic
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
-
 		this.client = new Anthropic({
 			apiKey: this.options.apiKey,
 			baseURL: this.options.anthropicBaseUrl || undefined,
@@ -212,4 +213,35 @@ export class AnthropicHandler implements ApiHandler, SingleCompletionHandler {
 		const content = message.content.find(({ type }) => type === "text")
 		return content?.type === "text" ? content.text : ""
 	}
+
+	/**
+	 * Counts tokens for the given content using Anthropic's API
+	 *
+	 * @param content The content blocks to count tokens for
+	 * @returns A promise resolving to the token count
+	 */
+	override async countTokens(content: Array<Anthropic.Messages.ContentBlockParam>): Promise<number> {
+		try {
+			// Use the current model
+			const actualModelId = this.getModel().id
+
+			const response = await this.client.messages.countTokens({
+				model: actualModelId,
+				messages: [
+					{
+						role: "user",
+						content: content,
+					},
+				],
+			})
+
+			return response.input_tokens
+		} catch (error) {
+			// Log error but fallback to tiktoken estimation
+			console.warn("Anthropic token counting failed, using fallback", error)
+
+			// Use the base provider's implementation as fallback
+			return super.countTokens(content)
+		}
+	}
 }

+ 64 - 0
src/api/providers/base-provider.ts

@@ -0,0 +1,64 @@
+import { Anthropic } from "@anthropic-ai/sdk"
+import { ApiHandler } from ".."
+import { ModelInfo } from "../../shared/api"
+import { ApiStream } from "../transform/stream"
+import { Tiktoken } from "js-tiktoken/lite"
+import o200kBase from "js-tiktoken/ranks/o200k_base"
+
+// Reuse the fudge factor used in the original code
+const TOKEN_FUDGE_FACTOR = 1.5
+
+/**
+ * Base class for API providers that implements common functionality
+ */
+export abstract class BaseProvider implements ApiHandler {
+	// Cache the Tiktoken encoder instance since it's stateless
+	private encoder: Tiktoken | null = null
+	abstract createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream
+	abstract getModel(): { id: string; info: ModelInfo }
+
+	/**
+	 * Default token counting implementation using tiktoken
+	 * Providers can override this to use their native token counting endpoints
+	 *
+	 * Uses a cached Tiktoken encoder instance for performance since it's stateless.
+	 * The encoder is created lazily on first use and reused for subsequent calls.
+	 *
+	 * @param content The content to count tokens for
+	 * @returns A promise resolving to the token count
+	 */
+	async countTokens(content: Array<Anthropic.Messages.ContentBlockParam>): Promise<number> {
+		if (!content || content.length === 0) return 0
+
+		let totalTokens = 0
+
+		// Lazily create and cache the encoder if it doesn't exist
+		if (!this.encoder) {
+			this.encoder = new Tiktoken(o200kBase)
+		}
+
+		// Process each content block using the cached encoder
+		for (const block of content) {
+			if (block.type === "text") {
+				// Use tiktoken for text token counting
+				const text = block.text || ""
+				if (text.length > 0) {
+					const tokens = this.encoder.encode(text)
+					totalTokens += tokens.length
+				}
+			} else if (block.type === "image") {
+				// For images, calculate based on data size
+				const imageSource = block.source
+				if (imageSource && typeof imageSource === "object" && "data" in imageSource) {
+					const base64Data = imageSource.data as string
+					totalTokens += Math.ceil(Math.sqrt(base64Data.length))
+				} else {
+					totalTokens += 300 // Conservative estimate for unknown images
+				}
+			}
+		}
+
+		// Add a fudge factor to account for the fact that tiktoken is not always accurate
+		return Math.ceil(totalTokens * TOKEN_FUDGE_FACTOR)
+	}
+}
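
As a concrete check of the fallback estimate above, the image arithmetic works out as follows (the numbers mirror the sliding-window test added later in this PR): a 1000-character base64 payload yields ceil(sqrt(1000)) = 32 raw tokens, and the 1.5 fudge factor brings that to ceil(32 * 1.5) = 48.

// Worked example of the image estimate, assuming a single image block
// whose base64 data is "X".repeat(1000), as in the test below.
const base64Length = 1000
const rawTokens = Math.ceil(Math.sqrt(base64Length)) // 32
const estimated = Math.ceil(rawTokens * 1.5) // 48 — matches the test expectation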

+ 7 - 5
src/api/providers/bedrock.ts

@@ -6,10 +6,11 @@ import {
 } from "@aws-sdk/client-bedrock-runtime"
 import { fromIni } from "@aws-sdk/credential-providers"
 import { Anthropic } from "@anthropic-ai/sdk"
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import { ApiHandlerOptions, BedrockModelId, ModelInfo, bedrockDefaultModelId, bedrockModels } from "../../shared/api"
 import { ApiStream } from "../transform/stream"
 import { convertToBedrockConverseMessages } from "../transform/bedrock-converse-format"
+import { BaseProvider } from "./base-provider"
 
 const BEDROCK_DEFAULT_TEMPERATURE = 0.3
 
@@ -46,11 +47,12 @@ export interface StreamEvent {
 	}
 }
 
-export class AwsBedrockHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class AwsBedrockHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: BedrockRuntimeClient
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 
 		const clientConfig: BedrockRuntimeClientConfig = {
@@ -74,7 +76,7 @@ export class AwsBedrockHandler implements ApiHandler, SingleCompletionHandler {
 		this.client = new BedrockRuntimeClient(clientConfig)
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const modelConfig = this.getModel()
 
 		// Handle cross-region inference
@@ -205,7 +207,7 @@ export class AwsBedrockHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel(): { id: BedrockModelId | string; info: ModelInfo } {
+	override getModel(): { id: BedrockModelId | string; info: ModelInfo } {
 		const modelId = this.options.apiModelId
 		if (modelId) {
 			// For tests, allow any model ID

+ 7 - 5
src/api/providers/gemini.ts

@@ -1,22 +1,24 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import { GoogleGenerativeAI } from "@google/generative-ai"
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import { ApiHandlerOptions, geminiDefaultModelId, GeminiModelId, geminiModels, ModelInfo } from "../../shared/api"
 import { convertAnthropicMessageToGemini } from "../transform/gemini-format"
 import { ApiStream } from "../transform/stream"
+import { BaseProvider } from "./base-provider"
 
 const GEMINI_DEFAULT_TEMPERATURE = 0
 
-export class GeminiHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: GoogleGenerativeAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		this.client = new GoogleGenerativeAI(options.geminiApiKey ?? "not-provided")
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const model = this.client.getGenerativeModel({
 			model: this.getModel().id,
 			systemInstruction: systemPrompt,
@@ -44,7 +46,7 @@ export class GeminiHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel(): { id: GeminiModelId; info: ModelInfo } {
+	override getModel(): { id: GeminiModelId; info: ModelInfo } {
 		const modelId = this.options.apiModelId
 		if (modelId && modelId in geminiModels) {
 			const id = modelId as GeminiModelId

+ 21 - 19
src/api/providers/glama.ts

@@ -6,22 +6,39 @@ import { ApiHandlerOptions, ModelInfo, glamaDefaultModelId, glamaDefaultModelInf
 import { parseApiPrice } from "../../utils/cost"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStream } from "../transform/stream"
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
+import { BaseProvider } from "./base-provider"
 
 const GLAMA_DEFAULT_TEMPERATURE = 0
 
-export class GlamaHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class GlamaHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		const baseURL = "https://glama.ai/api/gateway/openai/v1"
 		const apiKey = this.options.glamaApiKey ?? "not-provided"
 		this.client = new OpenAI({ baseURL, apiKey })
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	private supportsTemperature(): boolean {
+		return !this.getModel().id.startsWith("openai/o3-mini")
+	}
+
+	override getModel(): { id: string; info: ModelInfo } {
+		const modelId = this.options.glamaModelId
+		const modelInfo = this.options.glamaModelInfo
+
+		if (modelId && modelInfo) {
+			return { id: modelId, info: modelInfo }
+		}
+
+		return { id: glamaDefaultModelId, info: glamaDefaultModelInfo }
+	}
+
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		// Convert Anthropic messages to OpenAI format
 		const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
 			{ role: "system", content: systemPrompt },
@@ -152,21 +169,6 @@ export class GlamaHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	private supportsTemperature(): boolean {
-		return !this.getModel().id.startsWith("openai/o3-mini")
-	}
-
-	getModel(): { id: string; info: ModelInfo } {
-		const modelId = this.options.glamaModelId
-		const modelInfo = this.options.glamaModelInfo
-
-		if (modelId && modelInfo) {
-			return { id: modelId, info: modelInfo }
-		}
-
-		return { id: glamaDefaultModelId, info: glamaDefaultModelInfo }
-	}
-
 	async completePrompt(prompt: string): Promise<string> {
 		try {
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {

+ 7 - 5
src/api/providers/lmstudio.ts

@@ -2,18 +2,20 @@ import { Anthropic } from "@anthropic-ai/sdk"
 import OpenAI from "openai"
 import axios from "axios"
 
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import { ApiHandlerOptions, ModelInfo, openAiModelInfoSaneDefaults } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStream } from "../transform/stream"
+import { BaseProvider } from "./base-provider"
 
 const LMSTUDIO_DEFAULT_TEMPERATURE = 0
 
-export class LmStudioHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class LmStudioHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		this.client = new OpenAI({
 			baseURL: (this.options.lmStudioBaseUrl || "http://localhost:1234") + "/v1",
@@ -21,7 +23,7 @@ export class LmStudioHandler implements ApiHandler, SingleCompletionHandler {
 		})
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
 			{ role: "system", content: systemPrompt },
 			...convertToOpenAiMessages(messages),
@@ -51,7 +53,7 @@ export class LmStudioHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel(): { id: string; info: ModelInfo } {
+	override getModel(): { id: string; info: ModelInfo } {
 		return {
 			id: this.options.lmStudioModelId || "",
 			info: openAiModelInfoSaneDefaults,

+ 7 - 5
src/api/providers/mistral.ts

@@ -1,6 +1,6 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import { Mistral } from "@mistralai/mistralai"
-import { ApiHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import {
 	ApiHandlerOptions,
 	mistralDefaultModelId,
@@ -13,14 +13,16 @@ import {
 } from "../../shared/api"
 import { convertToMistralMessages } from "../transform/mistral-format"
 import { ApiStream } from "../transform/stream"
+import { BaseProvider } from "./base-provider"
 
 const MISTRAL_DEFAULT_TEMPERATURE = 0
 
-export class MistralHandler implements ApiHandler {
-	private options: ApiHandlerOptions
+export class MistralHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: Mistral
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		if (!options.mistralApiKey) {
 			throw new Error("Mistral API key is required")
 		}
@@ -48,7 +50,7 @@ export class MistralHandler implements ApiHandler {
 		return "https://api.mistral.ai"
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const response = await this.client.chat.stream({
 			model: this.options.apiModelId || mistralDefaultModelId,
 			messages: [{ role: "system", content: systemPrompt }, ...convertToMistralMessages(messages)],
@@ -81,7 +83,7 @@ export class MistralHandler implements ApiHandler {
 		}
 	}
 
-	getModel(): { id: MistralModelId; info: ModelInfo } {
+	override getModel(): { id: MistralModelId; info: ModelInfo } {
 		const modelId = this.options.apiModelId
 		if (modelId && modelId in mistralModels) {
 			const id = modelId as MistralModelId

+ 7 - 5
src/api/providers/ollama.ts

@@ -2,19 +2,21 @@ import { Anthropic } from "@anthropic-ai/sdk"
 import OpenAI from "openai"
 import axios from "axios"
 
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import { ApiHandlerOptions, ModelInfo, openAiModelInfoSaneDefaults } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { convertToR1Format } from "../transform/r1-format"
 import { ApiStream } from "../transform/stream"
 import { DEEP_SEEK_DEFAULT_TEMPERATURE } from "./constants"
 import { XmlMatcher } from "../../utils/xml-matcher"
+import { BaseProvider } from "./base-provider"
 
-export class OllamaHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class OllamaHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		this.client = new OpenAI({
 			baseURL: (this.options.ollamaBaseUrl || "http://localhost:11434") + "/v1",
@@ -22,7 +24,7 @@ export class OllamaHandler implements ApiHandler, SingleCompletionHandler {
 		})
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const modelId = this.getModel().id
 		const useR1Format = modelId.toLowerCase().includes("deepseek-r1")
 		const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -58,7 +60,7 @@ export class OllamaHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel(): { id: string; info: ModelInfo } {
+	override getModel(): { id: string; info: ModelInfo } {
 		return {
 			id: this.options.ollamaModelId || "",
 			info: openAiModelInfoSaneDefaults,

+ 7 - 5
src/api/providers/openai-native.ts

@@ -1,6 +1,6 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import OpenAI from "openai"
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import {
 	ApiHandlerOptions,
 	ModelInfo,
@@ -10,20 +10,22 @@ import {
 } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStream } from "../transform/stream"
+import { BaseProvider } from "./base-provider"
 
 const OPENAI_NATIVE_DEFAULT_TEMPERATURE = 0
 
-export class OpenAiNativeHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class OpenAiNativeHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		const apiKey = this.options.openAiNativeApiKey ?? "not-provided"
 		this.client = new OpenAI({ apiKey })
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const modelId = this.getModel().id
 
 		if (modelId.startsWith("o1")) {
@@ -133,7 +135,7 @@ export class OpenAiNativeHandler implements ApiHandler, SingleCompletionHandler
 		}
 	}
 
-	getModel(): { id: OpenAiNativeModelId; info: ModelInfo } {
+	override getModel(): { id: OpenAiNativeModelId; info: ModelInfo } {
 		const modelId = this.options.apiModelId
 		if (modelId && modelId in openAiNativeModels) {
 			const id = modelId as OpenAiNativeModelId

+ 7 - 5
src/api/providers/openai.ts

@@ -8,22 +8,24 @@ import {
 	ModelInfo,
 	openAiModelInfoSaneDefaults,
 } from "../../shared/api"
-import { ApiHandler, SingleCompletionHandler } from "../index"
+import { SingleCompletionHandler } from "../index"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { convertToR1Format } from "../transform/r1-format"
 import { convertToSimpleMessages } from "../transform/simple-format"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
-import { DEEP_SEEK_DEFAULT_TEMPERATURE } from "./constants"
+import { BaseProvider } from "./base-provider"
 
+const DEEP_SEEK_DEFAULT_TEMPERATURE = 0.6
 export interface OpenAiHandlerOptions extends ApiHandlerOptions {
 	defaultHeaders?: Record<string, string>
 }
 
-export class OpenAiHandler implements ApiHandler, SingleCompletionHandler {
+export class OpenAiHandler extends BaseProvider implements SingleCompletionHandler {
 	protected options: OpenAiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: OpenAiHandlerOptions) {
+		super()
 		this.options = options
 
 		const baseURL = this.options.openAiBaseUrl ?? "https://api.openai.com/v1"
@@ -51,7 +53,7 @@ export class OpenAiHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const modelInfo = this.getModel().info
 		const modelUrl = this.options.openAiBaseUrl ?? ""
 		const modelId = this.options.openAiModelId ?? ""
@@ -139,7 +141,7 @@ export class OpenAiHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel(): { id: string; info: ModelInfo } {
+	override getModel(): { id: string; info: ModelInfo } {
 		return {
 			id: this.options.openAiModelId ?? "",
 			info: this.options.openAiCustomModelInfo ?? openAiModelInfoSaneDefaults,

+ 8 - 5
src/api/providers/openrouter.ts

@@ -9,8 +9,10 @@ import { parseApiPrice } from "../../utils/cost"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStreamChunk, ApiStreamUsageChunk } from "../transform/stream"
 import { convertToR1Format } from "../transform/r1-format"
+
 import { DEEP_SEEK_DEFAULT_TEMPERATURE } from "./constants"
-import { ApiHandler, getModelParams, SingleCompletionHandler } from ".."
+import { getModelParams, SingleCompletionHandler } from ".."
+import { BaseProvider } from "./base-provider"
 
 // Add custom interface for OpenRouter params.
 type OpenRouterChatCompletionParams = OpenAI.Chat.ChatCompletionCreateParams & {
@@ -24,11 +26,12 @@ interface OpenRouterApiStreamUsageChunk extends ApiStreamUsageChunk {
 	fullResponseText: string
 }
 
-export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 
 		const baseURL = this.options.openRouterBaseUrl || "https://openrouter.ai/api/v1"
@@ -42,7 +45,7 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
 		this.client = new OpenAI({ baseURL, apiKey, defaultHeaders })
 	}
 
-	async *createMessage(
+	override async *createMessage(
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
 	): AsyncGenerator<ApiStreamChunk> {
@@ -191,7 +194,7 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel() {
+	override getModel() {
 		const modelId = this.options.openRouterModelId
 		const modelInfo = this.options.openRouterModelInfo
 

+ 7 - 5
src/api/providers/unbound.ts

@@ -5,25 +5,27 @@ import OpenAI from "openai"
 import { ApiHandlerOptions, ModelInfo, unboundDefaultModelId, unboundDefaultModelInfo } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
+import { BaseProvider } from "./base-provider"
 
 interface UnboundUsage extends OpenAI.CompletionUsage {
 	cache_creation_input_tokens?: number
 	cache_read_input_tokens?: number
 }
 
-export class UnboundHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class UnboundHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: OpenAI
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		const baseURL = "https://api.getunbound.ai/v1"
 		const apiKey = this.options.unboundApiKey ?? "not-provided"
 		this.client = new OpenAI({ baseURL, apiKey })
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		// Convert Anthropic messages to OpenAI format
 		const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
 			{ role: "system", content: systemPrompt },
@@ -131,7 +133,7 @@ export class UnboundHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	getModel(): { id: string; info: ModelInfo } {
+	override getModel(): { id: string; info: ModelInfo } {
 		const modelId = this.options.unboundModelId
 		const modelInfo = this.options.unboundModelInfo
 		if (modelId && modelInfo) {

+ 9 - 4
src/api/providers/vertex.ts

@@ -1,13 +1,16 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import { AnthropicVertex } from "@anthropic-ai/vertex-sdk"
 import { Stream as AnthropicStream } from "@anthropic-ai/sdk/streaming"
+
 import { VertexAI } from "@google-cloud/vertexai"
 
 import { ApiHandlerOptions, ModelInfo, vertexDefaultModelId, VertexModelId, vertexModels } from "../../shared/api"
 import { ApiStream } from "../transform/stream"
 import { convertAnthropicMessageToVertexGemini } from "../transform/vertex-gemini-format"
+import { BaseProvider } from "./base-provider"
+
 import { ANTHROPIC_DEFAULT_MAX_TOKENS } from "./constants"
-import { ApiHandler, getModelParams, SingleCompletionHandler } from "../"
+import { getModelParams, SingleCompletionHandler } from "../"
 
 // Types for Vertex SDK
 
@@ -94,17 +97,19 @@ interface VertexMessageStreamEvent {
 				thinking: string
 		  }
 }
+
 // https://docs.anthropic.com/en/api/claude-on-vertex-ai
-export class VertexHandler implements ApiHandler, SingleCompletionHandler {
+export class VertexHandler extends BaseProvider implements SingleCompletionHandler {
 	MODEL_CLAUDE = "claude"
 	MODEL_GEMINI = "gemini"
 
-	private options: ApiHandlerOptions
+	protected options: ApiHandlerOptions
 	private anthropicClient: AnthropicVertex
 	private geminiClient: VertexAI
 	private modelType: string
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 
 		if (this.options.apiModelId?.startsWith(this.MODEL_CLAUDE)) {
@@ -329,7 +334,7 @@ export class VertexHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		switch (this.modelType) {
 			case this.MODEL_CLAUDE: {
 				yield* this.createClaudeMessage(systemPrompt, messages)

+ 39 - 11
src/api/providers/vscode-lm.ts

@@ -1,18 +1,19 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import * as vscode from "vscode"
 
-import { ApiHandler, SingleCompletionHandler } from "../"
+import { SingleCompletionHandler } from "../"
 import { calculateApiCost } from "../../utils/cost"
 import { ApiStream } from "../transform/stream"
 import { convertToVsCodeLmMessages } from "../transform/vscode-lm-format"
 import { SELECTOR_SEPARATOR, stringifyVsCodeLmModelSelector } from "../../shared/vsCodeSelectorUtils"
 import { ApiHandlerOptions, ModelInfo, openAiModelInfoSaneDefaults } from "../../shared/api"
+import { BaseProvider } from "./base-provider"
 
 /**
  * Handles interaction with VS Code's Language Model API for chat-based operations.
- * This handler implements the ApiHandler interface to provide VS Code LM specific functionality.
+ * This handler extends BaseProvider to provide VS Code LM specific functionality.
  *
- * @implements {ApiHandler}
+ * @extends {BaseProvider}
  *
  * @remarks
  * The handler manages a VS Code language model chat client and provides methods to:
@@ -35,13 +36,14 @@ import { ApiHandlerOptions, ModelInfo, openAiModelInfoSaneDefaults } from "../..
  * }
  * ```
  */
-export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
-	private options: ApiHandlerOptions
+export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHandler {
+	protected options: ApiHandlerOptions
 	private client: vscode.LanguageModelChat | null
 	private disposable: vscode.Disposable | null
 	private currentRequestCancellation: vscode.CancellationTokenSource | null
 
 	constructor(options: ApiHandlerOptions) {
+		super()
 		this.options = options
 		this.client = null
 		this.disposable = null
@@ -145,7 +147,33 @@ export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
 		}
 	}
 
-	private async countTokens(text: string | vscode.LanguageModelChatMessage): Promise<number> {
+	/**
+	 * Implements the ApiHandler countTokens interface method
+	 * Provides token counting for Anthropic content blocks
+	 *
+	 * @param content The content blocks to count tokens for
+	 * @returns A promise resolving to the token count
+	 */
+	override async countTokens(content: Array<Anthropic.Messages.ContentBlockParam>): Promise<number> {
+		// Convert Anthropic content blocks to a string for VSCode LM token counting
+		let textContent = ""
+
+		for (const block of content) {
+			if (block.type === "text") {
+				textContent += block.text || ""
+			} else if (block.type === "image") {
+				// VSCode LM doesn't support images directly, so we'll just use a placeholder
+				textContent += "[IMAGE]"
+			}
+		}
+
+		return this.internalCountTokens(textContent)
+	}
+
+	/**
+	 * Private implementation of token counting used internally by VsCodeLmHandler
+	 */
+	private async internalCountTokens(text: string | vscode.LanguageModelChatMessage): Promise<number> {
 		// Check for required dependencies
 		if (!this.client) {
 			console.warn("Roo Code <Language Model API>: No client available for token counting")
@@ -216,9 +244,9 @@ export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
 		systemPrompt: string,
 		vsCodeLmMessages: vscode.LanguageModelChatMessage[],
 	): Promise<number> {
-		const systemTokens: number = await this.countTokens(systemPrompt)
+		const systemTokens: number = await this.internalCountTokens(systemPrompt)
 
-		const messageTokens: number[] = await Promise.all(vsCodeLmMessages.map((msg) => this.countTokens(msg)))
+		const messageTokens: number[] = await Promise.all(vsCodeLmMessages.map((msg) => this.internalCountTokens(msg)))
 
 		return systemTokens + messageTokens.reduce((sum: number, tokens: number): number => sum + tokens, 0)
 	}
@@ -319,7 +347,7 @@ export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
 		return content
 	}
 
-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
+	override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		// Ensure clean state before starting a new request
 		this.ensureCleanState()
 		const client: vscode.LanguageModelChat = await this.getClient()
@@ -427,7 +455,7 @@ export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
 			}
 
 			// Count tokens in the accumulated text after stream completion
-			const totalOutputTokens: number = await this.countTokens(accumulatedText)
+			const totalOutputTokens: number = await this.internalCountTokens(accumulatedText)
 
 			// Report final usage after stream completion
 			yield {
@@ -467,7 +495,7 @@ export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
 	}
 
 	// Return model information based on the current client state
-	getModel(): { id: string; info: ModelInfo } {
+	override getModel(): { id: string; info: ModelInfo } {
 		if (this.client) {
 			// Validate client properties
 			const requiredProps = {

+ 2 - 2
src/core/Cline.ts

@@ -990,12 +990,12 @@ export class Cline {
 				? this.apiConfiguration.modelMaxTokens || modelInfo.maxTokens
 				: modelInfo.maxTokens
 			const contextWindow = modelInfo.contextWindow
-
-			const trimmedMessages = truncateConversationIfNeeded({
+			const trimmedMessages = await truncateConversationIfNeeded({
 				messages: this.apiConversationHistory,
 				totalTokens,
 				maxTokens,
 				contextWindow,
+				apiHandler: this.api,
 			})
 
 			if (trimmedMessages !== this.apiConversationHistory) {
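
The call site above implies the following shape for the updated sliding-window helpers. This is a sketch inferred from the call sites in this PR (the actual src/core/sliding-window/index.ts is not shown here): both functions are now async and take the active ApiHandler so token counting can be delegated to the provider.

import { Anthropic } from "@anthropic-ai/sdk"
import type { ApiHandler } from "../api"

// Inferred signatures; names and option fields match the call sites in this PR.
interface TruncateOptions {
	messages: Anthropic.Messages.MessageParam[]
	totalTokens: number
	contextWindow: number
	maxTokens?: number
	apiHandler: ApiHandler
}

declare function truncateConversationIfNeeded(options: TruncateOptions): Promise<Anthropic.Messages.MessageParam[]>

declare function estimateTokenCount(
	content: Array<Anthropic.Messages.ContentBlockParam>,
	apiHandler: ApiHandler,
): Promise<number>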

+ 276 - 235
src/core/sliding-window/__tests__/sliding-window.test.ts

@@ -3,12 +3,35 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 
 import { ModelInfo } from "../../../shared/api"
-import {
-	TOKEN_BUFFER_PERCENTAGE,
-	estimateTokenCount,
-	truncateConversation,
-	truncateConversationIfNeeded,
-} from "../index"
+import { ApiHandler } from "../../../api"
+import { BaseProvider } from "../../../api/providers/base-provider"
+import { TOKEN_BUFFER_PERCENTAGE } from "../index"
+import { estimateTokenCount, truncateConversation, truncateConversationIfNeeded } from "../index"
+
+// Create a mock ApiHandler for testing
+class MockApiHandler extends BaseProvider {
+	createMessage(): any {
+		throw new Error("Method not implemented.")
+	}
+
+	getModel(): { id: string; info: ModelInfo } {
+		return {
+			id: "test-model",
+			info: {
+				contextWindow: 100000,
+				maxTokens: 50000,
+				supportsPromptCache: true,
+				supportsImages: false,
+				inputPrice: 0,
+				outputPrice: 0,
+				description: "Test model",
+			},
+		}
+	}
+}
+
+// Create a singleton instance for tests
+const mockApiHandler = new MockApiHandler()
 
 /**
  * Tests for the truncateConversation function
@@ -100,134 +123,91 @@ describe("truncateConversation", () => {
 })
 
 /**
- * Tests for the getMaxTokens function (private but tested through truncateConversationIfNeeded)
+ * Tests for the estimateTokenCount function
  */
-describe("getMaxTokens", () => {
-	// We'll test this indirectly through truncateConversationIfNeeded
-	const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({
-		contextWindow,
-		supportsPromptCache: true, // Not relevant for getMaxTokens
-		maxTokens,
+describe("estimateTokenCount", () => {
+	it("should return 0 for empty or undefined content", async () => {
+		expect(await estimateTokenCount([], mockApiHandler)).toBe(0)
+		// @ts-ignore - Testing with undefined
+		expect(await estimateTokenCount(undefined, mockApiHandler)).toBe(0)
 	})
 
-	// Reuse across tests for consistency
-	const messages: Anthropic.Messages.MessageParam[] = [
-		{ role: "user", content: "First message" },
-		{ role: "assistant", content: "Second message" },
-		{ role: "user", content: "Third message" },
-		{ role: "assistant", content: "Fourth message" },
-		{ role: "user", content: "Fifth message" },
-	]
-
-	it("should use maxTokens as buffer when specified", () => {
-		const modelInfo = createModelInfo(100000, 50000)
-		// Max tokens = 100000 - 50000 = 50000
-
-		// Create messages with very small content in the last one to avoid token overflow
-		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
+	it("should estimate tokens for text blocks", async () => {
+		const content: Array<Anthropic.Messages.ContentBlockParam> = [
+			{ type: "text", text: "This is a text block with 36 characters" },
+		]
 
-		// Account for the dynamic buffer which is 10% of context window (10,000 tokens)
-		// Below max tokens and buffer - no truncation
-		const result1 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 39999, // Well below threshold + dynamic buffer
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result1).toEqual(messagesWithSmallContent)
+		// With tiktoken, the exact token count may differ from character-based estimation
+		// Instead of expecting an exact number, we verify it's a reasonable positive number
+		const result = await estimateTokenCount(content, mockApiHandler)
+		expect(result).toBeGreaterThan(0)
 
-		// Above max tokens - truncate
-		const result2 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 50001, // Above threshold
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result2).not.toEqual(messagesWithSmallContent)
-		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+		// We can also verify that longer text results in more tokens
+		const longerContent: Array<Anthropic.Messages.ContentBlockParam> = [
+			{
+				type: "text",
+				text: "This is a longer text block with significantly more characters to encode into tokens",
+			},
+		]
+		const longerResult = await estimateTokenCount(longerContent, mockApiHandler)
+		expect(longerResult).toBeGreaterThan(result)
 	})
 
-	it("should use 20% of context window as buffer when maxTokens is undefined", () => {
-		const modelInfo = createModelInfo(100000, undefined)
-		// Max tokens = 100000 - (100000 * 0.2) = 80000
+	it("should estimate tokens for image blocks based on data size", async () => {
+		// Small image
+		const smallImage: Array<Anthropic.Messages.ContentBlockParam> = [
+			{ type: "image", source: { type: "base64", media_type: "image/jpeg", data: "small_dummy_data" } },
+		]
+		// Larger image with more data
+		const largerImage: Array<Anthropic.Messages.ContentBlockParam> = [
+			{ type: "image", source: { type: "base64", media_type: "image/png", data: "X".repeat(1000) } },
+		]
 
-		// Create messages with very small content in the last one to avoid token overflow
-		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
+		// Verify the token count scales with the size of the image data
+		const smallImageTokens = await estimateTokenCount(smallImage, mockApiHandler)
+		const largerImageTokens = await estimateTokenCount(largerImage, mockApiHandler)
 
-		// Account for the dynamic buffer which is 10% of context window (10,000 tokens)
-		// Below max tokens and buffer - no truncation
-		const result1 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 69999, // Well below threshold + dynamic buffer
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result1).toEqual(messagesWithSmallContent)
+		// Small image should have some tokens
+		expect(smallImageTokens).toBeGreaterThan(0)
 
-		// Above max tokens - truncate
-		const result2 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 80001, // Above threshold
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result2).not.toEqual(messagesWithSmallContent)
-		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+		// Larger image should have proportionally more tokens
+		expect(largerImageTokens).toBeGreaterThan(smallImageTokens)
+
+		// Verify the larger image calculation matches our formula including the 50% fudge factor
+		expect(largerImageTokens).toBe(48)
 	})
 
-	it("should handle small context windows appropriately", () => {
-		const modelInfo = createModelInfo(50000, 10000)
-		// Max tokens = 50000 - 10000 = 40000
+	it("should estimate tokens for mixed content blocks", async () => {
+		const content: Array<Anthropic.Messages.ContentBlockParam> = [
+			{ type: "text", text: "A text block with 30 characters" },
+			{ type: "image", source: { type: "base64", media_type: "image/jpeg", data: "dummy_data" } },
+			{ type: "text", text: "Another text with 24 chars" },
+		]
 
-		// Create messages with very small content in the last one to avoid token overflow
-		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
+		// We know image tokens calculation should be consistent
+		const imageTokens = Math.ceil(Math.sqrt("dummy_data".length)) * 1.5
 
-		// Below max tokens and buffer - no truncation
-		const result1 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 34999, // Well below threshold + buffer
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result1).toEqual(messagesWithSmallContent)
+		// With tiktoken, we can't predict exact text token counts,
+		// but we can verify the total is greater than just the image tokens
+		const result = await estimateTokenCount(content, mockApiHandler)
+		expect(result).toBeGreaterThan(imageTokens)
 
-		// Above max tokens - truncate
-		const result2 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 40001, // Above threshold
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result2).not.toEqual(messagesWithSmallContent)
-		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+		// Also test against a version with only the image to verify text adds tokens
+		const imageOnlyContent: Array<Anthropic.Messages.ContentBlockParam> = [
+			{ type: "image", source: { type: "base64", media_type: "image/jpeg", data: "dummy_data" } },
+		]
+		const imageOnlyResult = await estimateTokenCount(imageOnlyContent, mockApiHandler)
+		expect(result).toBeGreaterThan(imageOnlyResult)
 	})
 
-	it("should handle large context windows appropriately", () => {
-		const modelInfo = createModelInfo(200000, 30000)
-		// Max tokens = 200000 - 30000 = 170000
-
-		// Create messages with very small content in the last one to avoid token overflow
-		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
-
-		// Account for the dynamic buffer which is 10% of context window (20,000 tokens for this test)
-		// Below max tokens and buffer - no truncation
-		const result1 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 149999, // Well below threshold + dynamic buffer
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result1).toEqual(messagesWithSmallContent)
+	it("should handle empty text blocks", async () => {
+		const content: Array<Anthropic.Messages.ContentBlockParam> = [{ type: "text", text: "" }]
+		expect(await estimateTokenCount(content, mockApiHandler)).toBe(0)
+	})
 
-		// Above max tokens - truncate
-		const result2 = truncateConversationIfNeeded({
-			messages: messagesWithSmallContent,
-			totalTokens: 170001, // Above threshold
-			contextWindow: modelInfo.contextWindow,
-			maxTokens: modelInfo.maxTokens,
-		})
-		expect(result2).not.toEqual(messagesWithSmallContent)
-		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+	it("should handle plain string messages", async () => {
+		const content = "This is a plain text message"
+		expect(await estimateTokenCount([{ type: "text", text: content }], mockApiHandler)).toBeGreaterThan(0)
 	})
 })
 
@@ -235,9 +215,9 @@ describe("getMaxTokens", () => {
  * Tests for the truncateConversationIfNeeded function
  */
 describe("truncateConversationIfNeeded", () => {
-	const createModelInfo = (contextWindow: number, supportsPromptCache: boolean, maxTokens?: number): ModelInfo => ({
+	const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({
 		contextWindow,
-		supportsPromptCache,
+		supportsPromptCache: true,
 		maxTokens,
 	})
 
@@ -249,8 +229,8 @@ describe("truncateConversationIfNeeded", () => {
 		{ role: "user", content: "Fifth message" },
 	]
 
-	it("should not truncate if tokens are below max tokens threshold", () => {
-		const modelInfo = createModelInfo(100000, true, 30000)
+	it("should not truncate if tokens are below max tokens threshold", async () => {
+		const modelInfo = createModelInfo(100000, 30000)
 		const maxTokens = 100000 - 30000 // 70000
 		const dynamicBuffer = modelInfo.contextWindow * TOKEN_BUFFER_PERCENTAGE // 10000
 		const totalTokens = 70000 - dynamicBuffer - 1 // Just below threshold - buffer
@@ -258,17 +238,18 @@ describe("truncateConversationIfNeeded", () => {
 		// Create messages with very small content in the last one to avoid token overflow
 		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
 
-		const result = truncateConversationIfNeeded({
+		const result = await truncateConversationIfNeeded({
 			messages: messagesWithSmallContent,
 			totalTokens,
 			contextWindow: modelInfo.contextWindow,
 			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
 		})
 		expect(result).toEqual(messagesWithSmallContent) // No truncation occurs
 	})
 
-	it("should truncate if tokens are above max tokens threshold", () => {
-		const modelInfo = createModelInfo(100000, true, 30000)
+	it("should truncate if tokens are above max tokens threshold", async () => {
+		const modelInfo = createModelInfo(100000, 30000)
 		const maxTokens = 100000 - 30000 // 70000
 		const totalTokens = 70001 // Above threshold
 
@@ -279,68 +260,73 @@ describe("truncateConversationIfNeeded", () => {
 		// With 4 messages after the first, 0.5 fraction means remove 2 messages
 		const expectedResult = [messagesWithSmallContent[0], messagesWithSmallContent[3], messagesWithSmallContent[4]]
 
-		const result = truncateConversationIfNeeded({
+		const result = await truncateConversationIfNeeded({
 			messages: messagesWithSmallContent,
 			totalTokens,
 			contextWindow: modelInfo.contextWindow,
 			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
 		})
 		expect(result).toEqual(expectedResult)
 	})
 
-	it("should work with non-prompt caching models the same as prompt caching models", () => {
+	it("should work with non-prompt caching models the same as prompt caching models", async () => {
 		// The implementation no longer differentiates between prompt caching and non-prompt caching models
-		const modelInfo1 = createModelInfo(100000, true, 30000)
-		const modelInfo2 = createModelInfo(100000, false, 30000)
+		const modelInfo1 = createModelInfo(100000, 30000)
+		const modelInfo2 = createModelInfo(100000, 30000)
 
 		// Create messages with very small content in the last one to avoid token overflow
 		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
 
 		// Test below threshold
 		const belowThreshold = 69999
-		expect(
-			truncateConversationIfNeeded({
-				messages: messagesWithSmallContent,
-				totalTokens: belowThreshold,
-				contextWindow: modelInfo1.contextWindow,
-				maxTokens: modelInfo1.maxTokens,
-			}),
-		).toEqual(
-			truncateConversationIfNeeded({
-				messages: messagesWithSmallContent,
-				totalTokens: belowThreshold,
-				contextWindow: modelInfo2.contextWindow,
-				maxTokens: modelInfo2.maxTokens,
-			}),
-		)
+		const result1 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: belowThreshold,
+			contextWindow: modelInfo1.contextWindow,
+			maxTokens: modelInfo1.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+
+		const result2 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: belowThreshold,
+			contextWindow: modelInfo2.contextWindow,
+			maxTokens: modelInfo2.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+
+		expect(result1).toEqual(result2)
 
 		// Test above threshold
 		const aboveThreshold = 70001
-		expect(
-			truncateConversationIfNeeded({
-				messages: messagesWithSmallContent,
-				totalTokens: aboveThreshold,
-				contextWindow: modelInfo1.contextWindow,
-				maxTokens: modelInfo1.maxTokens,
-			}),
-		).toEqual(
-			truncateConversationIfNeeded({
-				messages: messagesWithSmallContent,
-				totalTokens: aboveThreshold,
-				contextWindow: modelInfo2.contextWindow,
-				maxTokens: modelInfo2.maxTokens,
-			}),
-		)
+		const result3 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: aboveThreshold,
+			contextWindow: modelInfo1.contextWindow,
+			maxTokens: modelInfo1.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+
+		const result4 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: aboveThreshold,
+			contextWindow: modelInfo2.contextWindow,
+			maxTokens: modelInfo2.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+
+		expect(result3).toEqual(result4)
 	})
 
-	it("should consider incoming content when deciding to truncate", () => {
-		const modelInfo = createModelInfo(100000, true, 30000)
+	it("should consider incoming content when deciding to truncate", async () => {
+		const modelInfo = createModelInfo(100000, 30000)
 		const maxTokens = 30000
 		const availableTokens = modelInfo.contextWindow - maxTokens
 
 		// Test case 1: Small content that won't push us over the threshold
 		const smallContent = [{ type: "text" as const, text: "Small content" }]
-		const smallContentTokens = estimateTokenCount(smallContent)
+		const smallContentTokens = await estimateTokenCount(smallContent, mockApiHandler)
 		const messagesWithSmallContent: Anthropic.Messages.MessageParam[] = [
 			...messages.slice(0, -1),
 			{ role: messages[messages.length - 1].role, content: smallContent },
@@ -349,11 +335,12 @@ describe("truncateConversationIfNeeded", () => {
 		// Set base tokens so total is well below threshold + buffer even with small content added
 		const dynamicBuffer = modelInfo.contextWindow * TOKEN_BUFFER_PERCENTAGE
 		const baseTokensForSmall = availableTokens - smallContentTokens - dynamicBuffer - 10
-		const resultWithSmall = truncateConversationIfNeeded({
+		const resultWithSmall = await truncateConversationIfNeeded({
 			messages: messagesWithSmallContent,
 			totalTokens: baseTokensForSmall,
 			contextWindow: modelInfo.contextWindow,
 			maxTokens,
+			apiHandler: mockApiHandler,
 		})
 		expect(resultWithSmall).toEqual(messagesWithSmallContent) // No truncation
 
@@ -364,7 +351,7 @@ describe("truncateConversationIfNeeded", () => {
 				text: "A very large incoming message that would consume a significant number of tokens and push us over the threshold",
 			},
 		]
-		const largeContentTokens = estimateTokenCount(largeContent)
+		const largeContentTokens = await estimateTokenCount(largeContent, mockApiHandler)
 		const messagesWithLargeContent: Anthropic.Messages.MessageParam[] = [
 			...messages.slice(0, -1),
 			{ role: messages[messages.length - 1].role, content: largeContent },
@@ -372,17 +359,18 @@ describe("truncateConversationIfNeeded", () => {
 
 		// Set base tokens so we're just below threshold without content, but over with content
 		const baseTokensForLarge = availableTokens - Math.floor(largeContentTokens / 2)
-		const resultWithLarge = truncateConversationIfNeeded({
+		const resultWithLarge = await truncateConversationIfNeeded({
 			messages: messagesWithLargeContent,
 			totalTokens: baseTokensForLarge,
 			contextWindow: modelInfo.contextWindow,
 			maxTokens,
+			apiHandler: mockApiHandler,
 		})
 		expect(resultWithLarge).not.toEqual(messagesWithLargeContent) // Should truncate
 
 		// Test case 3: Very large content that will definitely exceed threshold
 		const veryLargeContent = [{ type: "text" as const, text: "X".repeat(1000) }]
-		const veryLargeContentTokens = estimateTokenCount(veryLargeContent)
+		const veryLargeContentTokens = await estimateTokenCount(veryLargeContent, mockApiHandler)
 		const messagesWithVeryLargeContent: Anthropic.Messages.MessageParam[] = [
 			...messages.slice(0, -1),
 			{ role: messages[messages.length - 1].role, content: veryLargeContent },
@@ -390,17 +378,18 @@ describe("truncateConversationIfNeeded", () => {
 
 		// Set base tokens so we're just below threshold without content
 		const baseTokensForVeryLarge = availableTokens - Math.floor(veryLargeContentTokens / 2)
-		const resultWithVeryLarge = truncateConversationIfNeeded({
+		const resultWithVeryLarge = await truncateConversationIfNeeded({
 			messages: messagesWithVeryLargeContent,
 			totalTokens: baseTokensForVeryLarge,
 			contextWindow: modelInfo.contextWindow,
 			maxTokens,
+			apiHandler: mockApiHandler,
 		})
 		expect(resultWithVeryLarge).not.toEqual(messagesWithVeryLargeContent) // Should truncate
 	})
 
-	it("should truncate if tokens are within TOKEN_BUFFER_PERCENTAGE of the threshold", () => {
-		const modelInfo = createModelInfo(100000, true, 30000)
+	it("should truncate if tokens are within TOKEN_BUFFER_PERCENTAGE of the threshold", async () => {
+		const modelInfo = createModelInfo(100000, 30000)
 		const maxTokens = 100000 - 30000 // 70000
 		const dynamicBuffer = modelInfo.contextWindow * TOKEN_BUFFER_PERCENTAGE // 10% of 100000 = 10000
 		const totalTokens = 70000 - dynamicBuffer + 1 // Just within the dynamic buffer of threshold (70000)
@@ -412,101 +401,153 @@ describe("truncateConversationIfNeeded", () => {
 		// With 4 messages after the first, 0.5 fraction means remove 2 messages
 		const expectedResult = [messagesWithSmallContent[0], messagesWithSmallContent[3], messagesWithSmallContent[4]]
 
-		const result = truncateConversationIfNeeded({
+		const result = await truncateConversationIfNeeded({
 			messages: messagesWithSmallContent,
 			totalTokens,
 			contextWindow: modelInfo.contextWindow,
 			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
 		})
 		expect(result).toEqual(expectedResult)
 	})
 })
 
 /**
- * Tests for the estimateTokenCount function
+ * Tests for the getMaxTokens function (private but tested through truncateConversationIfNeeded)
  */
-describe("estimateTokenCount", () => {
-	it("should return 0 for empty or undefined content", () => {
-		expect(estimateTokenCount([])).toBe(0)
-		// @ts-ignore - Testing with undefined
-		expect(estimateTokenCount(undefined)).toBe(0)
+describe("getMaxTokens", () => {
+	// We'll test this indirectly through truncateConversationIfNeeded
+	const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({
+		contextWindow,
+		supportsPromptCache: true, // Not relevant for getMaxTokens
+		maxTokens,
 	})
 
-	it("should estimate tokens for text blocks", () => {
-		const content: Array<Anthropic.Messages.ContentBlockParam> = [
-			{ type: "text", text: "This is a text block with 36 characters" },
-		]
+	// Reuse across tests for consistency
+	const messages: Anthropic.Messages.MessageParam[] = [
+		{ role: "user", content: "First message" },
+		{ role: "assistant", content: "Second message" },
+		{ role: "user", content: "Third message" },
+		{ role: "assistant", content: "Fourth message" },
+		{ role: "user", content: "Fifth message" },
+	]
 
-		// With tiktoken, the exact token count may differ from character-based estimation
-		// Instead of expecting an exact number, we verify it's a reasonable positive number
-		const result = estimateTokenCount(content)
-		expect(result).toBeGreaterThan(0)
+	it("should use maxTokens as buffer when specified", async () => {
+		const modelInfo = createModelInfo(100000, 50000)
+		// Max tokens = 100000 - 50000 = 50000
 
-		// We can also verify that longer text results in more tokens
-		const longerContent: Array<Anthropic.Messages.ContentBlockParam> = [
-			{
-				type: "text",
-				text: "This is a longer text block with significantly more characters to encode into tokens",
-			},
-		]
-		const longerResult = estimateTokenCount(longerContent)
-		expect(longerResult).toBeGreaterThan(result)
-	})
+		// Create messages with very small content in the last one to avoid token overflow
+		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
 
-	it("should estimate tokens for image blocks based on data size", () => {
-		// Small image
-		const smallImage: Array<Anthropic.Messages.ContentBlockParam> = [
-			{ type: "image", source: { type: "base64", media_type: "image/jpeg", data: "small_dummy_data" } },
-		]
-		// Larger image with more data
-		const largerImage: Array<Anthropic.Messages.ContentBlockParam> = [
-			{ type: "image", source: { type: "base64", media_type: "image/png", data: "X".repeat(1000) } },
-		]
+		// Account for the dynamic buffer which is 10% of context window (10,000 tokens)
+		// Below max tokens and buffer - no truncation
+		const result1 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 39999, // Well below threshold + dynamic buffer
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result1).toEqual(messagesWithSmallContent)
 
-		// Verify the token count scales with the size of the image data
-		const smallImageTokens = estimateTokenCount(smallImage)
-		const largerImageTokens = estimateTokenCount(largerImage)
+		// Above max tokens - truncate
+		const result2 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 50001, // Above threshold
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result2).not.toEqual(messagesWithSmallContent)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+	})
 
-		// Small image should have some tokens
-		expect(smallImageTokens).toBeGreaterThan(0)
+	it("should use 20% of context window as buffer when maxTokens is undefined", async () => {
+		const modelInfo = createModelInfo(100000, undefined)
+		// Max tokens = 100000 - (100000 * 0.2) = 80000
 
-		// Larger image should have proportionally more tokens
-		expect(largerImageTokens).toBeGreaterThan(smallImageTokens)
+		// Create messages with very small content in the last one to avoid token overflow
+		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
 
-		// Verify the larger image calculation matches our formula including the 50% fudge factor
-		expect(largerImageTokens).toBe(48)
+		// Account for the dynamic buffer which is 10% of context window (10,000 tokens)
+		// Below max tokens and buffer - no truncation
+		const result1 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 69999, // Well below threshold + dynamic buffer
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result1).toEqual(messagesWithSmallContent)
+
+		// Above max tokens - truncate
+		const result2 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 80001, // Above threshold
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result2).not.toEqual(messagesWithSmallContent)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
 	})
 
-	it("should estimate tokens for mixed content blocks", () => {
-		const content: Array<Anthropic.Messages.ContentBlockParam> = [
-			{ type: "text", text: "A text block with 30 characters" },
-			{ type: "image", source: { type: "base64", media_type: "image/jpeg", data: "dummy_data" } },
-			{ type: "text", text: "Another text with 24 chars" },
-		]
+	it("should handle small context windows appropriately", async () => {
+		const modelInfo = createModelInfo(50000, 10000)
+		// Max tokens = 50000 - 10000 = 40000
 
-		// We know image tokens calculation should be consistent
-		const imageTokens = Math.ceil(Math.sqrt("dummy_data".length)) * 1.5
+		// Create messages with very small content in the last one to avoid token overflow
+		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
 
-		// With tiktoken, we can't predict exact text token counts,
-		// but we can verify the total is greater than just the image tokens
-		const result = estimateTokenCount(content)
-		expect(result).toBeGreaterThan(imageTokens)
+		// Below max tokens and buffer - no truncation
+		const result1 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 34999, // Well below threshold + buffer
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result1).toEqual(messagesWithSmallContent)
 
-		// Also test against a version with only the image to verify text adds tokens
-		const imageOnlyContent: Array<Anthropic.Messages.ContentBlockParam> = [
-			{ type: "image", source: { type: "base64", media_type: "image/jpeg", data: "dummy_data" } },
-		]
-		const imageOnlyResult = estimateTokenCount(imageOnlyContent)
-		expect(result).toBeGreaterThan(imageOnlyResult)
+		// Above max tokens - truncate
+		const result2 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 40001, // Above threshold
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result2).not.toEqual(messagesWithSmallContent)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
 	})
 
-	it("should handle empty text blocks", () => {
-		const content: Array<Anthropic.Messages.ContentBlockParam> = [{ type: "text", text: "" }]
-		expect(estimateTokenCount(content)).toBe(0)
-	})
+	it("should handle large context windows appropriately", async () => {
+		const modelInfo = createModelInfo(200000, 30000)
+		// Max tokens = 200000 - 30000 = 170000
 
-	it("should handle plain string messages", () => {
-		const content = "This is a plain text message"
-		expect(estimateTokenCount([{ type: "text", text: content }])).toBeGreaterThan(0)
+		// Create messages with very small content in the last one to avoid token overflow
+		const messagesWithSmallContent = [...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }]
+
+		// Account for the dynamic buffer which is 10% of context window (20,000 tokens for this test)
+		// Below max tokens and buffer - no truncation
+		const result1 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 149999, // Well below threshold + dynamic buffer
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result1).toEqual(messagesWithSmallContent)
+
+		// Above max tokens - truncate
+		const result2 = await truncateConversationIfNeeded({
+			messages: messagesWithSmallContent,
+			totalTokens: 170001, // Above threshold
+			contextWindow: modelInfo.contextWindow,
+			maxTokens: modelInfo.maxTokens,
+			apiHandler: mockApiHandler,
+		})
+		expect(result2).not.toEqual(messagesWithSmallContent)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
 	})
 })
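
Note: mockApiHandler is referenced throughout these tests but is defined earlier in the test file, outside this hunk. A minimal sketch of the kind of stub it could be, assuming only getModel and a deterministic countTokens are needed — the cast, the concrete numbers, and the import paths are illustrative, not the repository's actual fixture:

import { Anthropic } from "@anthropic-ai/sdk"
import { ApiHandler } from "../../api" // path depends on where the test file lives
import { ModelInfo } from "../../shared/api"

// Hypothetical stand-in: a deterministic countTokens keeps the truncation
// decision driven purely by totalTokens, which is also why every test
// empties the last message's content.
const mockApiHandler = {
	getModel: () => ({
		id: "test-model",
		info: { contextWindow: 100_000, supportsPromptCache: true } as ModelInfo,
	}),
	createMessage: () => {
		throw new Error("not exercised by these tests")
	},
	countTokens: async (content: Array<Anthropic.Messages.ContentBlockParam>) =>
		content.reduce((sum, block) => sum + (block.type === "text" ? block.text.length : 0), 0),
} as unknown as ApiHandler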

+ 23 - 42
src/core/sliding-window/index.ts

@@ -1,53 +1,24 @@
 import { Anthropic } from "@anthropic-ai/sdk"
+import { ApiHandler } from "../../api"
 
-import { Tiktoken } from "js-tiktoken/lite"
-import o200kBase from "js-tiktoken/ranks/o200k_base"
-
-export const TOKEN_FUDGE_FACTOR = 1.5
 /**
  * Default percentage of the context window to use as a buffer when deciding when to truncate
  */
 export const TOKEN_BUFFER_PERCENTAGE = 0.1
 
 /**
- * Counts tokens for user content using tiktoken for text
- * and a size-based calculation for images.
+ * Counts tokens for user content using the provider's token counting implementation.
  *
  * @param {Array<Anthropic.Messages.ContentBlockParam>} content - The content to count tokens for
- * @returns {number} The token count
+ * @param {ApiHandler} apiHandler - The API handler to use for token counting
+ * @returns {Promise<number>} A promise resolving to the token count
  */
-export function estimateTokenCount(content: Array<Anthropic.Messages.ContentBlockParam>): number {
+export async function estimateTokenCount(
+	content: Array<Anthropic.Messages.ContentBlockParam>,
+	apiHandler: ApiHandler,
+): Promise<number> {
 	if (!content || content.length === 0) return 0
-
-	let totalTokens = 0
-	let encoder = null
-
-	// Create encoder
-	encoder = new Tiktoken(o200kBase)
-
-	// Process each content block
-	for (const block of content) {
-		if (block.type === "text") {
-			// Use tiktoken for text token counting
-			const text = block.text || ""
-			if (text.length > 0) {
-				const tokens = encoder.encode(text)
-				totalTokens += tokens.length
-			}
-		} else if (block.type === "image") {
-			// For images, calculate based on data size
-			const imageSource = block.source
-			if (imageSource && typeof imageSource === "object" && "data" in imageSource) {
-				const base64Data = imageSource.data as string
-				totalTokens += Math.ceil(Math.sqrt(base64Data.length))
-			} else {
-				totalTokens += 300 // Conservative estimate for unknown images
-			}
-		}
-	}
-
-	// Add a fudge factor to account for the fact that tiktoken is not always accurate
-	return Math.ceil(totalTokens * TOKEN_FUDGE_FACTOR)
+	return apiHandler.countTokens(content)
 }
 
 /**
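
With token counting delegated to the provider, estimateTokenCount becomes a thin async wrapper. A minimal usage sketch, assuming the caller already holds some ApiHandler instance (the helper name and import path are illustrative):

import { Anthropic } from "@anthropic-ai/sdk"
import { ApiHandler } from "../../api"
import { estimateTokenCount } from "../sliding-window"

// Counts tokens for a single content block via whichever provider the
// caller is configured with; providers with a native counting endpoint
// answer directly, the rest fall back to the shared tiktoken estimate.
async function countPromptTokens(handler: ApiHandler): Promise<number> {
	const blocks: Anthropic.Messages.ContentBlockParam[] = [
		{ type: "text", text: "How many tokens is this?" },
	]
	return estimateTokenCount(blocks, handler)
}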
@@ -81,6 +52,7 @@ export function truncateConversation(
  * @param {number} totalTokens - The total number of tokens in the conversation (excluding the last user message).
  * @param {number} contextWindow - The context window size.
  * @param {number} maxTokens - The maximum number of tokens allowed.
+ * @param {ApiHandler} apiHandler - The API handler to use for token counting.
  * @returns {Anthropic.Messages.MessageParam[]} The original or truncated conversation messages.
  */
 
@@ -89,14 +61,23 @@ type TruncateOptions = {
 	totalTokens: number
 	contextWindow: number
 	maxTokens?: number
+	apiHandler: ApiHandler
 }
 
-export function truncateConversationIfNeeded({
+/**
+ * Conditionally truncates the conversation messages if the total token count
+ * exceeds the model's limit, considering the size of incoming content.
+ *
+ * @param {TruncateOptions} options - The options for truncation
+ * @returns {Promise<Anthropic.Messages.MessageParam[]>} The original or truncated conversation messages.
+ */
+export async function truncateConversationIfNeeded({
 	messages,
 	totalTokens,
 	contextWindow,
 	maxTokens,
-}: TruncateOptions): Anthropic.Messages.MessageParam[] {
+	apiHandler,
+}: TruncateOptions): Promise<Anthropic.Messages.MessageParam[]> {
 	// Calculate the maximum tokens reserved for response
 	const reservedTokens = maxTokens || contextWindow * 0.2
 
@@ -104,8 +85,8 @@ export function truncateConversationIfNeeded({
 	const lastMessage = messages[messages.length - 1]
 	const lastMessageContent = lastMessage.content
 	const lastMessageTokens = Array.isArray(lastMessageContent)
-		? estimateTokenCount(lastMessageContent)
-		: estimateTokenCount([{ type: "text", text: lastMessageContent as string }])
+		? await estimateTokenCount(lastMessageContent, apiHandler)
+		: await estimateTokenCount([{ type: "text", text: lastMessageContent as string }], apiHandler)
 
 	// Calculate total effective tokens (totalTokens never includes the last message)
 	const effectiveTokens = totalTokens + lastMessageTokens