
feat: embed image input token (#293)

* feat: embedding support image input token

* fix: minimax tts stream error handler

* feat: doubao vision embed support

* chore: optimize patch embed vision input to openai converthandler callback

* fix: ci lint

* rename: chat usage rename to usage
zijiren, 6 months ago
commit 4306fc24e0
37 changed files with 720 additions and 346 deletions
  1. core/docs/docs.go (+83 -55)
  2. core/docs/swagger.json (+83 -55)
  3. core/docs/swagger.yaml (+55 -37)
  4. core/relay/adaptor/ali/embeddings.go (+2 -2)
  5. core/relay/adaptor/ali/model.go (+3 -3)
  6. core/relay/adaptor/ali/stt-realtime.go (+1 -1)
  7. core/relay/adaptor/anthropic/main.go (+5 -5)
  8. core/relay/adaptor/anthropic/model.go (+2 -2)
  9. core/relay/adaptor/anthropic/openai.go (+9 -9)
  10. core/relay/adaptor/aws/claude/main.go (+5 -5)
  11. core/relay/adaptor/aws/llama3/main.go (+4 -4)
  12. core/relay/adaptor/baidu/embeddings.go (+1 -1)
  13. core/relay/adaptor/baidu/main.go (+3 -3)
  14. core/relay/adaptor/baidu/model.go (+1 -1)
  15. core/relay/adaptor/baidu/rerank.go (+2 -2)
  16. core/relay/adaptor/cohere/main.go (+5 -5)
  17. core/relay/adaptor/doubao/chat.go (+107 -0)
  18. core/relay/adaptor/doubao/embed.go (+109 -0)
  19. core/relay/adaptor/doubao/main.go (+23 -85)
  20. core/relay/adaptor/gemini/embeddings.go (+3 -3)
  21. core/relay/adaptor/gemini/main.go (+5 -5)
  22. core/relay/adaptor/jina/rerank.go (+1 -1)
  23. core/relay/adaptor/minimax/adaptor.go (+5 -2)
  24. core/relay/adaptor/minimax/error.go (+1 -1)
  25. core/relay/adaptor/minimax/tts.go (+5 -11)
  26. core/relay/adaptor/ollama/main.go (+8 -8)
  27. core/relay/adaptor/openai/adaptor.go (+3 -3)
  28. core/relay/adaptor/openai/chat.go (+7 -6)
  29. core/relay/adaptor/openai/embeddings.go (+137 -10)
  30. core/relay/adaptor/openai/helper.go (+2 -2)
  31. core/relay/adaptor/openai/stt.go (+1 -1)
  32. core/relay/adaptor/zhipu/main.go (+3 -7)
  33. core/relay/adaptor/zhipu/model.go (+4 -4)
  34. core/relay/model/chat.go (+3 -3)
  35. core/relay/model/completions.go (+3 -3)
  36. core/relay/model/embed.go (+25 -1)
  37. core/relay/model/tts.go (+1 -0)
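
Taken together, this commit splits the relay usage type into ChatUsage (chat completions) and a new EmbeddingUsage (embeddings), and routes vision-capable Doubao embedding models to a multimodal endpoint so image input tokens can be counted. A minimal sketch, with assumed model names and token counts, of the multimodal embeddings payload and the usage shape the relay now returns:

package main

import (
	"fmt"

	"github.com/bytedance/sonic"
)

func main() {
	// Request body after the vision input patching shown further down:
	// "input" is a list of typed parts rather than a plain string
	// (the model name here is hypothetical).
	req := map[string]any{
		"model": "doubao-embedding-vision",
		"input": []map[string]any{
			{"type": "text", "text": "a red bicycle"},
			{"type": "image_url", "image_url": map[string]string{"url": "https://example.com/bike.png"}},
		},
	}
	body, _ := sonic.Marshal(req)
	fmt.Println(string(body))

	// Response usage follows the new EmbeddingUsage schema added to the
	// swagger docs: prompt_tokens, total_tokens, and an optional
	// prompt_tokens_details with text_tokens/image_tokens.
	raw := []byte(`{"prompt_tokens":1034,"total_tokens":1034,"prompt_tokens_details":{"text_tokens":10,"image_tokens":1024}}`)
	var usage struct {
		PromptTokens        int64 `json:"prompt_tokens"`
		TotalTokens         int64 `json:"total_tokens"`
		PromptTokensDetails struct {
			TextTokens  int64 `json:"text_tokens"`
			ImageTokens int64 `json:"image_tokens"`
		} `json:"prompt_tokens_details"`
	}
	_ = sonic.Unmarshal(raw, &usage)
	fmt.Println(usage.PromptTokensDetails.ImageTokens) // 1024
}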

+ 83 - 55
core/docs/docs.go

@@ -9201,58 +9201,6 @@ const docTemplate = `{
                 }
             }
         },
-        "github_com_labring_aiproxy_core_model.Usage": {
-            "type": "object",
-            "properties": {
-                "cache_creation_tokens": {
-                    "type": "integer"
-                },
-                "cached_tokens": {
-                    "type": "integer"
-                },
-                "image_input_tokens": {
-                    "type": "integer"
-                },
-                "input_tokens": {
-                    "type": "integer"
-                },
-                "output_tokens": {
-                    "type": "integer"
-                },
-                "reasoning_tokens": {
-                    "type": "integer"
-                },
-                "total_tokens": {
-                    "type": "integer"
-                },
-                "web_search_count": {
-                    "type": "integer"
-                }
-            }
-        },
-        "github_com_labring_aiproxy_core_relay_model.Usage": {
-            "type": "object",
-            "properties": {
-                "completion_tokens": {
-                    "type": "integer"
-                },
-                "completion_tokens_details": {
-                    "$ref": "#/definitions/model.CompletionTokensDetails"
-                },
-                "prompt_tokens": {
-                    "type": "integer"
-                },
-                "prompt_tokens_details": {
-                    "$ref": "#/definitions/model.PromptTokensDetails"
-                },
-                "total_tokens": {
-                    "type": "integer"
-                },
-                "web_search_count": {
-                    "type": "integer"
-                }
-            }
-        },
         "mcp.Tool": {
             "type": "object",
             "properties": {
@@ -9642,6 +9590,29 @@ const docTemplate = `{
                 }
             }
         },
+        "model.ChatUsage": {
+            "type": "object",
+            "properties": {
+                "completion_tokens": {
+                    "type": "integer"
+                },
+                "completion_tokens_details": {
+                    "$ref": "#/definitions/model.CompletionTokensDetails"
+                },
+                "prompt_tokens": {
+                    "type": "integer"
+                },
+                "prompt_tokens_details": {
+                    "$ref": "#/definitions/model.PromptTokensDetails"
+                },
+                "total_tokens": {
+                    "type": "integer"
+                },
+                "web_search_count": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.CompletionTokensDetails": {
             "type": "object",
             "properties": {
@@ -9746,6 +9717,17 @@ const docTemplate = `{
                 }
             }
         },
+        "model.EmbeddingPromptTokensDetails": {
+            "type": "object",
+            "properties": {
+                "image_tokens": {
+                    "type": "integer"
+                },
+                "text_tokens": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.EmbeddingRequest": {
             "type": "object",
             "properties": {
@@ -9779,7 +9761,7 @@ const docTemplate = `{
                     "type": "string"
                 },
                 "usage": {
-                    "$ref": "#/definitions/github_com_labring_aiproxy_core_relay_model.Usage"
+                    "$ref": "#/definitions/model.EmbeddingUsage"
                 }
             }
         },
@@ -9800,6 +9782,20 @@ const docTemplate = `{
                 }
             }
         },
+        "model.EmbeddingUsage": {
+            "type": "object",
+            "properties": {
+                "prompt_tokens": {
+                    "type": "integer"
+                },
+                "prompt_tokens_details": {
+                    "$ref": "#/definitions/model.EmbeddingPromptTokensDetails"
+                },
+                "total_tokens": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.FinishReason": {
             "type": "string",
             "enum": [
@@ -10400,7 +10396,7 @@ const docTemplate = `{
                     "type": "integer"
                 },
                 "usage": {
-                    "$ref": "#/definitions/github_com_labring_aiproxy_core_model.Usage"
+                    "$ref": "#/definitions/model.Usage"
                 },
                 "used_amount": {
                     "type": "number"
@@ -11162,7 +11158,7 @@ const docTemplate = `{
                     "type": "string"
                 },
                 "usage": {
-                    "$ref": "#/definitions/github_com_labring_aiproxy_core_relay_model.Usage"
+                    "$ref": "#/definitions/model.ChatUsage"
                 }
             }
         },
@@ -11203,6 +11199,9 @@ const docTemplate = `{
                 "speed": {
                     "type": "number"
                 },
+                "stream_format": {
+                    "type": "string"
+                },
                 "voice": {
                     "type": "string"
                 }
@@ -11236,6 +11235,35 @@ const docTemplate = `{
                 }
             }
         },
+        "model.Usage": {
+            "type": "object",
+            "properties": {
+                "cache_creation_tokens": {
+                    "type": "integer"
+                },
+                "cached_tokens": {
+                    "type": "integer"
+                },
+                "image_input_tokens": {
+                    "type": "integer"
+                },
+                "input_tokens": {
+                    "type": "integer"
+                },
+                "output_tokens": {
+                    "type": "integer"
+                },
+                "reasoning_tokens": {
+                    "type": "integer"
+                },
+                "total_tokens": {
+                    "type": "integer"
+                },
+                "web_search_count": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.VideoGenerationJob": {
             "type": "object",
             "properties": {

+ 83 - 55
core/docs/swagger.json

@@ -9192,58 +9192,6 @@
                 }
             }
         },
-        "github_com_labring_aiproxy_core_model.Usage": {
-            "type": "object",
-            "properties": {
-                "cache_creation_tokens": {
-                    "type": "integer"
-                },
-                "cached_tokens": {
-                    "type": "integer"
-                },
-                "image_input_tokens": {
-                    "type": "integer"
-                },
-                "input_tokens": {
-                    "type": "integer"
-                },
-                "output_tokens": {
-                    "type": "integer"
-                },
-                "reasoning_tokens": {
-                    "type": "integer"
-                },
-                "total_tokens": {
-                    "type": "integer"
-                },
-                "web_search_count": {
-                    "type": "integer"
-                }
-            }
-        },
-        "github_com_labring_aiproxy_core_relay_model.Usage": {
-            "type": "object",
-            "properties": {
-                "completion_tokens": {
-                    "type": "integer"
-                },
-                "completion_tokens_details": {
-                    "$ref": "#/definitions/model.CompletionTokensDetails"
-                },
-                "prompt_tokens": {
-                    "type": "integer"
-                },
-                "prompt_tokens_details": {
-                    "$ref": "#/definitions/model.PromptTokensDetails"
-                },
-                "total_tokens": {
-                    "type": "integer"
-                },
-                "web_search_count": {
-                    "type": "integer"
-                }
-            }
-        },
         "mcp.Tool": {
             "type": "object",
             "properties": {
@@ -9633,6 +9581,29 @@
                 }
             }
         },
+        "model.ChatUsage": {
+            "type": "object",
+            "properties": {
+                "completion_tokens": {
+                    "type": "integer"
+                },
+                "completion_tokens_details": {
+                    "$ref": "#/definitions/model.CompletionTokensDetails"
+                },
+                "prompt_tokens": {
+                    "type": "integer"
+                },
+                "prompt_tokens_details": {
+                    "$ref": "#/definitions/model.PromptTokensDetails"
+                },
+                "total_tokens": {
+                    "type": "integer"
+                },
+                "web_search_count": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.CompletionTokensDetails": {
             "type": "object",
             "properties": {
@@ -9737,6 +9708,17 @@
                 }
             }
         },
+        "model.EmbeddingPromptTokensDetails": {
+            "type": "object",
+            "properties": {
+                "image_tokens": {
+                    "type": "integer"
+                },
+                "text_tokens": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.EmbeddingRequest": {
             "type": "object",
             "properties": {
@@ -9770,7 +9752,7 @@
                     "type": "string"
                 },
                 "usage": {
-                    "$ref": "#/definitions/github_com_labring_aiproxy_core_relay_model.Usage"
+                    "$ref": "#/definitions/model.EmbeddingUsage"
                 }
             }
         },
@@ -9791,6 +9773,20 @@
                 }
             }
         },
+        "model.EmbeddingUsage": {
+            "type": "object",
+            "properties": {
+                "prompt_tokens": {
+                    "type": "integer"
+                },
+                "prompt_tokens_details": {
+                    "$ref": "#/definitions/model.EmbeddingPromptTokensDetails"
+                },
+                "total_tokens": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.FinishReason": {
             "type": "string",
             "enum": [
@@ -10391,7 +10387,7 @@
                     "type": "integer"
                 },
                 "usage": {
-                    "$ref": "#/definitions/github_com_labring_aiproxy_core_model.Usage"
+                    "$ref": "#/definitions/model.Usage"
                 },
                 "used_amount": {
                     "type": "number"
@@ -11153,7 +11149,7 @@
                     "type": "string"
                 },
                 "usage": {
-                    "$ref": "#/definitions/github_com_labring_aiproxy_core_relay_model.Usage"
+                    "$ref": "#/definitions/model.ChatUsage"
                 }
             }
         },
@@ -11194,6 +11190,9 @@
                 "speed": {
                     "type": "number"
                 },
+                "stream_format": {
+                    "type": "string"
+                },
                 "voice": {
                     "type": "string"
                 }
@@ -11227,6 +11226,35 @@
                 }
             }
         },
+        "model.Usage": {
+            "type": "object",
+            "properties": {
+                "cache_creation_tokens": {
+                    "type": "integer"
+                },
+                "cached_tokens": {
+                    "type": "integer"
+                },
+                "image_input_tokens": {
+                    "type": "integer"
+                },
+                "input_tokens": {
+                    "type": "integer"
+                },
+                "output_tokens": {
+                    "type": "integer"
+                },
+                "reasoning_tokens": {
+                    "type": "integer"
+                },
+                "total_tokens": {
+                    "type": "integer"
+                },
+                "web_search_count": {
+                    "type": "integer"
+                }
+            }
+        },
         "model.VideoGenerationJob": {
             "type": "object",
             "properties": {

+ 55 - 37
core/docs/swagger.yaml

@@ -714,40 +714,6 @@ definitions:
       status:
         type: integer
     type: object
-  github_com_labring_aiproxy_core_model.Usage:
-    properties:
-      cache_creation_tokens:
-        type: integer
-      cached_tokens:
-        type: integer
-      image_input_tokens:
-        type: integer
-      input_tokens:
-        type: integer
-      output_tokens:
-        type: integer
-      reasoning_tokens:
-        type: integer
-      total_tokens:
-        type: integer
-      web_search_count:
-        type: integer
-    type: object
-  github_com_labring_aiproxy_core_relay_model.Usage:
-    properties:
-      completion_tokens:
-        type: integer
-      completion_tokens_details:
-        $ref: '#/definitions/model.CompletionTokensDetails'
-      prompt_tokens:
-        type: integer
-      prompt_tokens_details:
-        $ref: '#/definitions/model.PromptTokensDetails'
-      total_tokens:
-        type: integer
-      web_search_count:
-        type: integer
-    type: object
   mcp.Tool:
     properties:
       annotations:
@@ -1039,6 +1005,21 @@ definitions:
       web_search_count:
         type: integer
     type: object
+  model.ChatUsage:
+    properties:
+      completion_tokens:
+        type: integer
+      completion_tokens_details:
+        $ref: '#/definitions/model.CompletionTokensDetails'
+      prompt_tokens:
+        type: integer
+      prompt_tokens_details:
+        $ref: '#/definitions/model.PromptTokensDetails'
+      total_tokens:
+        type: integer
+      web_search_count:
+        type: integer
+    type: object
   model.CompletionTokensDetails:
     properties:
       accepted_prediction_tokens:
@@ -1107,6 +1088,13 @@ definitions:
       text:
         type: string
     type: object
+  model.EmbeddingPromptTokensDetails:
+    properties:
+      image_tokens:
+        type: integer
+      text_tokens:
+        type: integer
+    type: object
   model.EmbeddingRequest:
     properties:
       dimensions:
@@ -1129,7 +1117,7 @@ definitions:
       object:
         type: string
       usage:
-        $ref: '#/definitions/github_com_labring_aiproxy_core_relay_model.Usage'
+        $ref: '#/definitions/model.EmbeddingUsage'
     type: object
   model.EmbeddingResponseItem:
     properties:
@@ -1142,6 +1130,15 @@ definitions:
       object:
         type: string
     type: object
+  model.EmbeddingUsage:
+    properties:
+      prompt_tokens:
+        type: integer
+      prompt_tokens_details:
+        $ref: '#/definitions/model.EmbeddingPromptTokensDetails'
+      total_tokens:
+        type: integer
+    type: object
   model.FinishReason:
     enum:
     - stop
@@ -1549,7 +1546,7 @@ definitions:
       ttfb_milliseconds:
         type: integer
       usage:
-        $ref: '#/definitions/github_com_labring_aiproxy_core_model.Usage'
+        $ref: '#/definitions/model.Usage'
       used_amount:
         type: number
       user:
@@ -2076,7 +2073,7 @@ definitions:
       object:
         type: string
       usage:
-        $ref: '#/definitions/github_com_labring_aiproxy_core_relay_model.Usage'
+        $ref: '#/definitions/model.ChatUsage'
     type: object
   model.TextResponseChoice:
     properties:
@@ -2099,6 +2096,8 @@ definitions:
         type: string
       speed:
         type: number
+      stream_format:
+        type: string
       voice:
         type: string
     required:
@@ -2124,6 +2123,25 @@ definitions:
       type:
         type: string
     type: object
+  model.Usage:
+    properties:
+      cache_creation_tokens:
+        type: integer
+      cached_tokens:
+        type: integer
+      image_input_tokens:
+        type: integer
+      input_tokens:
+        type: integer
+      output_tokens:
+        type: integer
+      reasoning_tokens:
+        type: integer
+      total_tokens:
+        type: integer
+      web_search_count:
+        type: integer
+    type: object
   model.VideoGenerationJob:
     properties:
       created_at:

+ 2 - 2
core/relay/adaptor/ali/embeddings.go

@@ -115,7 +115,7 @@ func EmbeddingsHandler(
 	openaiResponse := embeddingResponse2OpenAI(meta, &respBody)
 	data, err := sonic.Marshal(openaiResponse)
 	if err != nil {
-		return openaiResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return openaiResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			resp.StatusCode,
@@ -128,5 +128,5 @@ func EmbeddingsHandler(
 	if err != nil {
 		log.Warnf("write response body failed: %v", err)
 	}
-	return openaiResponse.ToModelUsage(), nil
+	return openaiResponse.Usage.ToModelUsage(), nil
 }

+ 3 - 3
core/relay/adaptor/ali/model.go

@@ -38,8 +38,8 @@ type TaskResponse struct {
 			Failed    int `json:"FAILED,omitempty"`
 		} `json:"task_metrics,omitempty"`
 	} `json:"output,omitempty"`
-	Usage      model.Usage `json:"usage"`
-	StatusCode int         `json:"status_code,omitempty"`
+	Usage      model.ChatUsage `json:"usage"`
+	StatusCode int             `json:"status_code,omitempty"`
 }
 
 type EmbeddingRequest struct {
@@ -62,7 +62,7 @@ type EmbeddingResponse struct {
 	Output struct {
 		Embeddings []Embedding `json:"embeddings"`
 	} `json:"output"`
-	Usage model.Usage `json:"usage"`
+	Usage model.EmbeddingUsage `json:"usage"`
 }
 
 type Error struct {

+ 1 - 1
core/relay/adaptor/ali/stt-realtime.go

@@ -258,7 +258,7 @@ func STTDoResponse(
 			usage.TotalTokens = model.ZeroNullInt64(msg.Payload.Usage.Characters)
 			c.JSON(http.StatusOK, gin.H{
 				"text": output.String(),
-				"usage": relaymodel.Usage{
+				"usage": relaymodel.ChatUsage{
 					PromptTokens: int64(usage.InputTokens),
 					TotalTokens:  int64(usage.TotalTokens),
 				},

+ 5 - 5
core/relay/adaptor/anthropic/main.go

@@ -177,7 +177,7 @@ func StreamHandler(
 
 	responseText := strings.Builder{}
 
-	var usage *relaymodel.Usage
+	var usage *relaymodel.ChatUsage
 	var writed bool
 
 	for scanner.Scan() {
@@ -193,7 +193,7 @@ func StreamHandler(
 				log.Errorf("response error: %+v", err)
 			} else {
 				if usage == nil {
-					usage = &relaymodel.Usage{}
+					usage = &relaymodel.ChatUsage{}
 				}
 				if response != nil && response.Usage != nil {
 					usage.Add(response.Usage)
@@ -205,7 +205,7 @@ func StreamHandler(
 			switch {
 			case response.Usage != nil:
 				if usage == nil {
-					usage = &relaymodel.Usage{}
+					usage = &relaymodel.ChatUsage{}
 				}
 				usage.Add(response.Usage)
 				if usage.PromptTokens == 0 {
@@ -232,7 +232,7 @@ func StreamHandler(
 	}
 
 	if usage == nil {
-		usage = &relaymodel.Usage{
+		usage = &relaymodel.ChatUsage{
 			PromptTokens:     int64(m.RequestUsage.InputTokens),
 			CompletionTokens: openai.CountTokenText(responseText.String(), m.OriginModel),
 			TotalTokens: int64(
@@ -270,5 +270,5 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(respBody)))
 	_, _ = c.Writer.Write(respBody)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }

+ 2 - 2
core/relay/adaptor/anthropic/model.go

@@ -151,8 +151,8 @@ type ServerToolUse struct {
 	ExecutionTimeSeconds float64 `json:"execution_time_seconds,omitempty"`
 }
 
-func (u *Usage) ToOpenAIUsage() relaymodel.Usage {
-	usage := relaymodel.Usage{
+func (u *Usage) ToOpenAIUsage() relaymodel.ChatUsage {
+	usage := relaymodel.ChatUsage{
 		PromptTokens:     u.InputTokens + u.CacheReadInputTokens + u.CacheCreationInputTokens,
 		CompletionTokens: u.OutputTokens,
 		PromptTokensDetails: &relaymodel.PromptTokensDetails{

+ 9 - 9
core/relay/adaptor/anthropic/openai.go

@@ -282,7 +282,7 @@ func StreamResponse2OpenAI(
 	meta *meta.Meta,
 	respData []byte,
 ) (*relaymodel.ChatCompletionsStreamResponse, adaptor.Error) {
-	var usage *relaymodel.Usage
+	var usage *relaymodel.ChatUsage
 	var content string
 	var thinking string
 	var stopReason string
@@ -440,10 +440,10 @@ func Response2OpenAI(
 		Choices: []*relaymodel.TextResponseChoice{&choice},
 		Usage:   claudeResponse.Usage.ToOpenAIUsage(),
 	}
-	if fullTextResponse.PromptTokens == 0 {
-		fullTextResponse.PromptTokens = int64(meta.RequestUsage.InputTokens)
+	if fullTextResponse.Usage.PromptTokens == 0 {
+		fullTextResponse.Usage.PromptTokens = int64(meta.RequestUsage.InputTokens)
 	}
-	fullTextResponse.TotalTokens = fullTextResponse.PromptTokens + fullTextResponse.CompletionTokens
+	fullTextResponse.Usage.TotalTokens = fullTextResponse.Usage.PromptTokens + fullTextResponse.Usage.CompletionTokens
 	return &fullTextResponse, nil
 }
 
@@ -467,7 +467,7 @@ func OpenAIStreamHandler(
 
 	responseText := strings.Builder{}
 
-	var usage *relaymodel.Usage
+	var usage *relaymodel.ChatUsage
 	var writed bool
 
 	for scanner.Scan() {
@@ -488,7 +488,7 @@ func OpenAIStreamHandler(
 				continue
 			}
 			if usage == nil {
-				usage = &relaymodel.Usage{}
+				usage = &relaymodel.ChatUsage{}
 			}
 			if response != nil && response.Usage != nil {
 				usage.Add(response.Usage)
@@ -502,7 +502,7 @@ func OpenAIStreamHandler(
 		switch {
 		case response.Usage != nil:
 			if usage == nil {
-				usage = &relaymodel.Usage{}
+				usage = &relaymodel.ChatUsage{}
 			}
 			usage.Add(response.Usage)
 			if usage.PromptTokens == 0 {
@@ -528,7 +528,7 @@ func OpenAIStreamHandler(
 	}
 
 	if usage == nil {
-		usage = &relaymodel.Usage{
+		usage = &relaymodel.ChatUsage{
 			PromptTokens:     int64(m.RequestUsage.InputTokens),
 			CompletionTokens: openai.CountTokenText(responseText.String(), m.OriginModel),
 			TotalTokens: int64(
@@ -588,5 +588,5 @@ func OpenAIHandler(
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }

+ 5 - 5
core/relay/adaptor/aws/claude/main.go

@@ -167,7 +167,7 @@ func Handler(meta *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 
 	jsonBody, err := sonic.Marshal(openaiResp)
 	if err != nil {
-		return openaiResp.ToModelUsage(), relaymodel.WrapperOpenAIErrorWithMessage(
+		return openaiResp.Usage.ToModelUsage(), relaymodel.WrapperOpenAIErrorWithMessage(
 			err.Error(),
 			nil,
 			http.StatusInternalServerError,
@@ -177,7 +177,7 @@ func Handler(meta *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonBody)))
 	_, _ = c.Writer.Write(jsonBody)
-	return openaiResp.ToModelUsage(), nil
+	return openaiResp.Usage.ToModelUsage(), nil
 }
 
 func StreamHandler(m *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
@@ -248,7 +248,7 @@ func StreamHandler(m *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 	stream := awsResp.GetStream()
 	defer stream.Close()
 
-	var usage *relaymodel.Usage
+	var usage *relaymodel.ChatUsage
 	responseText := strings.Builder{}
 	var writed bool
 
@@ -267,7 +267,7 @@ func StreamHandler(m *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 				switch {
 				case response.Usage != nil:
 					if usage == nil {
-						usage = &relaymodel.Usage{}
+						usage = &relaymodel.ChatUsage{}
 					}
 					usage.Add(response.Usage)
 					if usage.PromptTokens == 0 {
@@ -297,7 +297,7 @@ func StreamHandler(m *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 	}
 
 	if usage == nil {
-		usage = &relaymodel.Usage{
+		usage = &relaymodel.ChatUsage{
 			PromptTokens:     int64(m.RequestUsage.InputTokens),
 			CompletionTokens: openai.CountTokenText(responseText.String(), m.OriginModel),
 			TotalTokens: int64(

+ 4 - 4
core/relay/adaptor/aws/llama3/main.go

@@ -157,7 +157,7 @@ func Handler(meta *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 
 	jsonData, err := sonic.Marshal(llamaResponse)
 	if err != nil {
-		return openaiResp.ToModelUsage(), relaymodel.WrapperOpenAIErrorWithMessage(
+		return openaiResp.Usage.ToModelUsage(), relaymodel.WrapperOpenAIErrorWithMessage(
 			err.Error(),
 			nil,
 			http.StatusInternalServerError,
@@ -167,7 +167,7 @@ func Handler(meta *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error) {
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonData)))
 	_, _ = c.Writer.Write(jsonData)
-	return openaiResp.ToModelUsage(), nil
+	return openaiResp.Usage.ToModelUsage(), nil
 }
 
 func ResponseLlama2OpenAI(meta *meta.Meta, llamaResponse Response) relaymodel.TextResponse {
@@ -190,7 +190,7 @@ func ResponseLlama2OpenAI(meta *meta.Meta, llamaResponse Response) relaymodel.Te
 		Created: time.Now().Unix(),
 		Choices: []*relaymodel.TextResponseChoice{&choice},
 		Model:   meta.OriginModel,
-		Usage: relaymodel.Usage{
+		Usage: relaymodel.ChatUsage{
 			PromptTokens:     llamaResponse.PromptTokenCount,
 			CompletionTokens: llamaResponse.GenerationTokenCount,
 			TotalTokens:      llamaResponse.PromptTokenCount + llamaResponse.GenerationTokenCount,
@@ -257,7 +257,7 @@ func StreamHandler(meta *meta.Meta, c *gin.Context) (model.Usage, adaptor.Error)
 	defer stream.Close()
 
 	c.Writer.Header().Set("Content-Type", "text/event-stream")
-	var usage relaymodel.Usage
+	var usage relaymodel.ChatUsage
 	c.Stream(func(_ io.Writer) bool {
 		event, ok := <-stream.Events()
 		if !ok {

+ 1 - 1
core/relay/adaptor/baidu/embeddings.go

@@ -16,7 +16,7 @@ import (
 
 type EmbeddingsResponse struct {
 	*Error
-	Usage relaymodel.Usage `json:"usage"`
+	Usage relaymodel.ChatUsage `json:"usage"`
 }
 
 func EmbeddingsHandler(

+ 3 - 3
core/relay/adaptor/baidu/main.go

@@ -141,7 +141,7 @@ func StreamHandler(
 
 	log := common.GetLogger(c)
 
-	var usage relaymodel.Usage
+	var usage relaymodel.ChatUsage
 	scanner := bufio.NewScanner(resp.Body)
 	buf := openai.GetScannerBuffer()
 	defer openai.PutScannerBuffer(buf)
@@ -198,7 +198,7 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	fullTextResponse := response2OpenAI(meta, &baiduResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -207,5 +207,5 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }

+ 1 - 1
core/relay/adaptor/baidu/model.go

@@ -15,7 +15,7 @@ type ErrorResponse struct {
 }
 
 type ChatResponse struct {
-	Usage            *model.Usage `json:"usage"`
+	Usage            *model.ChatUsage `json:"usage"`
 	*Error           `json:"error"`
 	ID               string `json:"id"`
 	Object           string `json:"object"`

+ 2 - 2
core/relay/adaptor/baidu/rerank.go

@@ -15,8 +15,8 @@ import (
 )
 
 type RerankResponse struct {
-	Error *Error           `json:"error"`
-	Usage relaymodel.Usage `json:"usage"`
+	Error *Error               `json:"error"`
+	Usage relaymodel.ChatUsage `json:"usage"`
 }
 
 func RerankHandler(

+ 5 - 5
core/relay/adaptor/cohere/main.go

@@ -118,7 +118,7 @@ func StreamResponse2OpenAI(
 		Choices: []*relaymodel.ChatCompletionsStreamResponseChoice{&choice},
 	}
 	if response != nil {
-		openaiResponse.Usage = &relaymodel.Usage{
+		openaiResponse.Usage = &relaymodel.ChatUsage{
 			PromptTokens:     response.Meta.Tokens.InputTokens,
 			CompletionTokens: response.Meta.Tokens.OutputTokens,
 			TotalTokens:      response.Meta.Tokens.InputTokens + response.Meta.Tokens.OutputTokens,
@@ -143,7 +143,7 @@ func Response2OpenAI(meta *meta.Meta, cohereResponse *Response) *relaymodel.Text
 		Object:  relaymodel.ChatCompletionObject,
 		Created: time.Now().Unix(),
 		Choices: []*relaymodel.TextResponseChoice{&choice},
-		Usage: relaymodel.Usage{
+		Usage: relaymodel.ChatUsage{
 			PromptTokens:     cohereResponse.Meta.Tokens.InputTokens,
 			CompletionTokens: cohereResponse.Meta.Tokens.OutputTokens,
 			TotalTokens:      cohereResponse.Meta.Tokens.InputTokens + cohereResponse.Meta.Tokens.OutputTokens,
@@ -170,7 +170,7 @@ func StreamHandler(
 	defer openai.PutScannerBuffer(buf)
 	scanner.Buffer(*buf, cap(*buf))
 
-	var usage relaymodel.Usage
+	var usage relaymodel.ChatUsage
 
 	for scanner.Scan() {
 		data := scanner.Text()
@@ -226,7 +226,7 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	fullTextResponse := Response2OpenAI(meta, &cohereResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -235,5 +235,5 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }

+ 107 - 0
core/relay/adaptor/doubao/chat.go

@@ -0,0 +1,107 @@
+package doubao
+
+import (
+	"bytes"
+	"errors"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/bytedance/sonic"
+	"github.com/bytedance/sonic/ast"
+	"github.com/labring/aiproxy/core/relay/adaptor"
+	"github.com/labring/aiproxy/core/relay/adaptor/openai"
+	"github.com/labring/aiproxy/core/relay/meta"
+	relaymodel "github.com/labring/aiproxy/core/relay/model"
+)
+
+func ConvertChatCompletionsRequest(
+	meta *meta.Meta,
+	req *http.Request,
+) (adaptor.ConvertResult, error) {
+	result, err := openai.ConvertChatCompletionsRequest(
+		meta,
+		req,
+		nil,
+		false,
+	)
+	if err != nil {
+		return adaptor.ConvertResult{}, err
+	}
+	if strings.HasPrefix(meta.OriginModel, "deepseek-reasoner") {
+		return result, nil
+	}
+
+	m := make(map[string]any)
+	err = sonic.ConfigDefault.NewDecoder(result.Body).Decode(&m)
+	if err != nil {
+		return adaptor.ConvertResult{}, err
+	}
+	messages, _ := m["messages"].([]any)
+	if len(messages) == 0 {
+		return adaptor.ConvertResult{}, errors.New("messages is empty")
+	}
+	sysMessage := relaymodel.Message{
+		Role:    "system",
+		Content: "回答前,都先用 <think></think> 输出你的思考过程。",
+	}
+	messages = append([]any{sysMessage}, messages...)
+	m["messages"] = messages
+	newBody, err := sonic.Marshal(m)
+	if err != nil {
+		return adaptor.ConvertResult{}, err
+	}
+
+	header := result.Header
+	header.Set("Content-Length", strconv.Itoa(len(newBody)))
+
+	return adaptor.ConvertResult{
+		Header: header,
+		Body:   bytes.NewReader(newBody),
+	}, nil
+}
+
+func newHandlerPreHandler(websearchCount *int64) func(_ *meta.Meta, node *ast.Node) error {
+	return func(meta *meta.Meta, node *ast.Node) error {
+		return handlerPreHandler(meta, node, websearchCount)
+	}
+}
+
+// copy bot_usage.model_usage to usage
+func handlerPreHandler(meta *meta.Meta, node *ast.Node, websearchCount *int64) error {
+	if !strings.HasPrefix(meta.ActualModel, "bot-") {
+		return nil
+	}
+
+	botUsageNode := node.Get("bot_usage")
+	if botUsageNode.Check() != nil {
+		return nil
+	}
+
+	modelUsageNode := botUsageNode.Get("model_usage").Index(0)
+	if modelUsageNode.Check() != nil {
+		return nil
+	}
+
+	_, err := node.SetAny("usage", modelUsageNode)
+	if err != nil {
+		return err
+	}
+
+	actionUsageNodes := botUsageNode.Get("action_usage")
+	if actionUsageNodes.Check() != nil {
+		return nil
+	}
+
+	return actionUsageNodes.ForEach(func(_ ast.Sequence, node *ast.Node) bool {
+		if node.Check() != nil {
+			return true
+		}
+		count, err := node.Get("count").Int64()
+		if err != nil {
+			return true
+		}
+		*websearchCount += count
+		return true
+	})
+}

+ 109 - 0
core/relay/adaptor/doubao/embed.go

@@ -0,0 +1,109 @@
+package doubao
+
+import (
+	"github.com/bytedance/sonic/ast"
+	"github.com/labring/aiproxy/core/relay/meta"
+)
+
+func patchEmbeddingsVisionInput(node *ast.Node) error {
+	inputNode := node.Get("input")
+	if !inputNode.Exists() {
+		return nil
+	}
+	switch inputNode.TypeSafe() {
+	case ast.V_ARRAY:
+		return inputNode.ForEach(func(_ ast.Sequence, item *ast.Node) bool {
+			switch item.TypeSafe() {
+			case ast.V_STRING:
+				text, err := item.String()
+				if err != nil {
+					return false
+				}
+				*item = ast.NewObject([]ast.Pair{
+					ast.NewPair("type", ast.NewString("text")),
+					ast.NewPair("text", ast.NewString(text)),
+				})
+				return true
+			case ast.V_OBJECT:
+				textNode := item.Get("text")
+				if textNode.Exists() && textNode.TypeSafe() == ast.V_STRING {
+					_, err := item.Set("type", ast.NewString("text"))
+					return err == nil
+				}
+
+				imageNode := item.Get("image")
+				if imageNode.Exists() && imageNode.TypeSafe() == ast.V_STRING {
+					imageURL, err := imageNode.String()
+					if err != nil {
+						return false
+					}
+					_, err = item.Unset("image")
+					if err != nil {
+						return false
+					}
+					_, err = item.Set("type", ast.NewString("image_url"))
+					if err != nil {
+						return false
+					}
+					_, err = item.SetAny("image_url", map[string]string{
+						"url": imageURL,
+					})
+					if err != nil {
+						return false
+					}
+				}
+				return true
+			default:
+				return false
+			}
+		})
+	case ast.V_STRING:
+		inputText, err := inputNode.String()
+		if err != nil {
+			return err
+		}
+		_, err = node.SetAny("input", []map[string]string{
+			{
+				"type": "text",
+				"text": inputText,
+			},
+		})
+		return err
+	default:
+		return nil
+	}
+}
+
+func embeddingPreHandler(_ *meta.Meta, node *ast.Node) error {
+	return patchEmbeddingsVisionResponse(node)
+}
+
+func patchEmbeddingsVisionResponse(node *ast.Node) error {
+	dataNode := node.Get("data")
+	if !dataNode.Exists() {
+		return nil
+	}
+	switch dataNode.TypeSafe() {
+	case ast.V_ARRAY:
+		return nil
+	case ast.V_OBJECT:
+		embeddingNode := dataNode.Get("embedding")
+		if !embeddingNode.Exists() {
+			return nil
+		}
+		_, err := node.Unset("data")
+		if err != nil {
+			return err
+		}
+		_, err = node.SetAny("data", []map[string]any{
+			{
+				"embedding": embeddingNode,
+				"object":    "embedding",
+				"index":     0,
+			},
+		})
+		return err
+	default:
+		return nil
+	}
+}
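
A minimal in-package test sketch (a hypothetical doubao/embed_test.go, sample body assumed) of the rewrite patchEmbeddingsVisionInput performs on a mixed string/image "input" array:

package doubao

import (
	"testing"

	"github.com/bytedance/sonic"
)

func TestPatchEmbeddingsVisionInput(t *testing.T) {
	// Callers may send plain strings and {"image": "..."} items; the patch
	// rewrites them into typed text / image_url parts.
	body := []byte(`{"model":"m","input":["a red bicycle",{"image":"https://example.com/bike.png"}]}`)
	node, err := sonic.Get(body)
	if err != nil {
		t.Fatal(err)
	}
	if err := patchEmbeddingsVisionInput(&node); err != nil {
		t.Fatal(err)
	}
	out, err := node.MarshalJSON()
	if err != nil {
		t.Fatal(err)
	}
	// Expected shape (key order may vary):
	// {"model":"m","input":[{"type":"text","text":"a red bicycle"},
	//   {"type":"image_url","image_url":{"url":"https://example.com/bike.png"}}]}
	t.Log(string(out))
}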

+ 23 - 85
core/relay/adaptor/doubao/main.go

@@ -1,22 +1,16 @@
 package doubao
 
 import (
-	"bytes"
-	"errors"
 	"fmt"
 	"net/http"
-	"strconv"
 	"strings"
 
-	"github.com/bytedance/sonic"
-	"github.com/bytedance/sonic/ast"
 	"github.com/gin-gonic/gin"
 	"github.com/labring/aiproxy/core/model"
 	"github.com/labring/aiproxy/core/relay/adaptor"
 	"github.com/labring/aiproxy/core/relay/adaptor/openai"
 	"github.com/labring/aiproxy/core/relay/meta"
 	"github.com/labring/aiproxy/core/relay/mode"
-	relaymodel "github.com/labring/aiproxy/core/relay/model"
 	"github.com/labring/aiproxy/core/relay/utils"
 )
 
@@ -35,6 +29,12 @@ func GetRequestURL(meta *meta.Meta) (adaptor.RequestURL, error) {
 			URL:    u + "/api/v3/chat/completions",
 		}, nil
 	case mode.Embeddings:
+		if strings.Contains(meta.ActualModel, "vision") {
+			return adaptor.RequestURL{
+				Method: http.MethodPost,
+				URL:    u + "/api/v3/embeddings/multimodal",
+			}, nil
+		}
 		return adaptor.RequestURL{
 			Method: http.MethodPost,
 			URL:    u + "/api/v3/embeddings",
@@ -73,86 +73,17 @@ func (a *Adaptor) ConvertRequest(
 	store adaptor.Store,
 	req *http.Request,
 ) (adaptor.ConvertResult, error) {
-	result, err := a.Adaptor.ConvertRequest(meta, store, req)
-	if err != nil {
-		return adaptor.ConvertResult{}, err
-	}
-	if meta.Mode != mode.ChatCompletions || meta.OriginModel != "deepseek-reasoner" {
-		return result, nil
-	}
-
-	m := make(map[string]any)
-	err = sonic.ConfigDefault.NewDecoder(result.Body).Decode(&m)
-	if err != nil {
-		return adaptor.ConvertResult{}, err
-	}
-	messages, _ := m["messages"].([]any)
-	if len(messages) == 0 {
-		return adaptor.ConvertResult{}, errors.New("messages is empty")
-	}
-	sysMessage := relaymodel.Message{
-		Role:    "system",
-		Content: "回答前,都先用 <think></think> 输出你的思考过程。",
-	}
-	messages = append([]any{sysMessage}, messages...)
-	m["messages"] = messages
-	newBody, err := sonic.Marshal(m)
-	if err != nil {
-		return adaptor.ConvertResult{}, err
-	}
-
-	header := result.Header
-	header.Set("Content-Length", strconv.Itoa(len(newBody)))
-
-	return adaptor.ConvertResult{
-		Header: header,
-		Body:   bytes.NewReader(newBody),
-	}, nil
-}
-
-func newHandlerPreHandler(websearchCount *int64) func(_ *meta.Meta, node *ast.Node) error {
-	return func(meta *meta.Meta, node *ast.Node) error {
-		return handlerPreHandler(meta, node, websearchCount)
-	}
-}
-
-// copy bot_usage.model_usage to usage
-func handlerPreHandler(meta *meta.Meta, node *ast.Node, websearchCount *int64) error {
-	if !strings.HasPrefix(meta.ActualModel, "bot-") {
-		return nil
-	}
-
-	botUsageNode := node.Get("bot_usage")
-	if botUsageNode.Check() != nil {
-		return nil
-	}
-
-	modelUsageNode := botUsageNode.Get("model_usage").Index(0)
-	if modelUsageNode.Check() != nil {
-		return nil
-	}
-
-	_, err := node.SetAny("usage", modelUsageNode)
-	if err != nil {
-		return err
-	}
-
-	actionUsageNodes := botUsageNode.Get("action_usage")
-	if actionUsageNodes.Check() != nil {
-		return nil
-	}
-
-	return actionUsageNodes.ForEach(func(_ ast.Sequence, node *ast.Node) bool {
-		if node.Check() != nil {
-			return true
-		}
-		count, err := node.Get("count").Int64()
-		if err != nil {
-			return true
+	switch meta.Mode {
+	case mode.Embeddings:
+		if strings.Contains(meta.ActualModel, "vision") {
+			return openai.ConvertEmbeddingsRequest(meta, req, patchEmbeddingsVisionInput, false)
 		}
-		*websearchCount += count
-		return true
-	})
+		return openai.ConvertEmbeddingsRequest(meta, req, nil, true)
+	case mode.ChatCompletions:
+		return ConvertChatCompletionsRequest(meta, req)
+	default:
+		return openai.ConvertRequest(meta, store, req)
+	}
 }
 
 func (a *Adaptor) DoResponse(
@@ -170,6 +101,13 @@ func (a *Adaptor) DoResponse(
 			usage, err = openai.Handler(meta, c, resp, newHandlerPreHandler(&websearchCount))
 		}
 		usage.WebSearchCount += model.ZeroNullInt64(websearchCount)
+	case mode.Embeddings:
+		usage, err = openai.EmbeddingsHandler(
+			meta,
+			c,
+			resp,
+			embeddingPreHandler,
+		)
 	default:
 		return openai.DoResponse(meta, store, c, resp)
 	}

+ 3 - 3
core/relay/adaptor/gemini/embeddings.go

@@ -81,7 +81,7 @@ func EmbeddingHandler(
 	fullTextResponse := embeddingResponse2OpenAI(meta, &geminiEmbeddingResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -90,7 +90,7 @@ func EmbeddingHandler(
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }
 
 func embeddingResponse2OpenAI(
@@ -101,7 +101,7 @@ func embeddingResponse2OpenAI(
 		Object: "list",
 		Data:   make([]*relaymodel.EmbeddingResponseItem, 0, len(response.Embeddings)),
 		Model:  meta.OriginModel,
-		Usage: relaymodel.Usage{
+		Usage: relaymodel.EmbeddingUsage{
 			TotalTokens:  int64(meta.RequestUsage.InputTokens),
 			PromptTokens: int64(meta.RequestUsage.InputTokens),
 		},

+ 5 - 5
core/relay/adaptor/gemini/main.go

@@ -367,8 +367,8 @@ type UsageMetadata struct {
 	PromptTokensDetails  []PromptTokensDetail `json:"promptTokensDetails"`
 }
 
-func (u *UsageMetadata) ToUsage() relaymodel.Usage {
-	return relaymodel.Usage{
+func (u *UsageMetadata) ToUsage() relaymodel.ChatUsage {
+	return relaymodel.ChatUsage{
 		PromptTokens: u.PromptTokenCount,
 		CompletionTokens: u.CandidatesTokenCount +
 			u.ThoughtsTokenCount,
@@ -664,7 +664,7 @@ func StreamHandler(
 		scanner.Buffer(*buf, cap(*buf))
 	}
 
-	usage := relaymodel.Usage{
+	usage := relaymodel.ChatUsage{
 		PromptTokens: int64(meta.RequestUsage.InputTokens),
 	}
 
@@ -719,7 +719,7 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	fullTextResponse := responseChat2OpenAI(meta, &geminiResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -728,5 +728,5 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }

+ 1 - 1
core/relay/adaptor/jina/rerank.go

@@ -44,7 +44,7 @@ func RerankHandler(
 			http.StatusInternalServerError,
 		)
 	}
-	var usage relaymodel.Usage
+	var usage relaymodel.ChatUsage
 	usageNode := node.Get("usage")
 	usageStr, err := usageNode.Raw()
 	if err != nil {

+ 5 - 2
core/relay/adaptor/minimax/adaptor.go

@@ -10,6 +10,7 @@ import (
 	"github.com/labring/aiproxy/core/relay/adaptor/openai"
 	"github.com/labring/aiproxy/core/relay/meta"
 	"github.com/labring/aiproxy/core/relay/mode"
+	"github.com/labring/aiproxy/core/relay/utils"
 )
 
 type Adaptor struct {
@@ -97,8 +98,10 @@ func (a *Adaptor) DoResponse(
 	case mode.AudioSpeech:
 		return TTSHandler(meta, c, resp)
 	default:
-		if err := TryErrorHanlder(resp); err != nil {
-			return model.Usage{}, err
+		if !utils.IsStreamResponse(resp) {
+			if err := TryErrorHanlder(resp); err != nil {
+				return model.Usage{}, err
+			}
 		}
 		return a.Adaptor.DoResponse(meta, store, c, resp)
 	}

+ 1 - 1
core/relay/adaptor/minimax/error.go

@@ -37,7 +37,7 @@ func TryErrorHanlder(resp *http.Response) adaptor.Error {
 	if err := sonic.Unmarshal(respBody, &result); err != nil {
 		return relaymodel.WrapperOpenAIError(
 			err,
-			"TTS_ERROR",
+			"unmarshal_response_body_failed",
 			http.StatusInternalServerError,
 		)
 	}

+ 5 - 11
core/relay/adaptor/minimax/tts.go

@@ -7,7 +7,6 @@ import (
 	"io"
 	"net/http"
 	"strconv"
-	"strings"
 
 	"github.com/bytedance/sonic"
 	"github.com/gin-gonic/gin"
@@ -72,10 +71,6 @@ func ConvertTTSRequest(meta *meta.Meta, req *http.Request) (adaptor.ConvertResul
 
 	if responseFormat == "wav" {
 		reqMap["stream"] = false
-		meta.Set("stream", false)
-	} else {
-		stream, _ := reqMap["stream"].(bool)
-		meta.Set("stream", stream)
 	}
 
 	body, err := sonic.Marshal(reqMap)
@@ -112,13 +107,12 @@ func TTSHandler(
 	c *gin.Context,
 	resp *http.Response,
 ) (model.Usage, adaptor.Error) {
-	if err := TryErrorHanlder(resp); err != nil {
-		return model.Usage{}, err
+	if utils.IsStreamResponse(resp) {
+		return ttsStreamHandler(meta, c, resp)
 	}
 
-	if !strings.Contains(resp.Header.Get("Content-Type"), "application/json") &&
-		meta.GetBool("stream") {
-		return ttsStreamHandler(meta, c, resp)
+	if err := TryErrorHanlder(resp); err != nil {
+		return model.Usage{}, err
 	}
 
 	defer resp.Body.Close()
@@ -179,7 +173,7 @@ func ttsStreamHandler(
 ) (model.Usage, adaptor.Error) {
 	defer resp.Body.Close()
 
-	resp.Header.Set("Content-Type", "application/octet-stream")
+	c.Writer.Header().Set("Content-Type", "application/octet-stream")
 
 	log := common.GetLogger(c)
 

+ 8 - 8
core/relay/adaptor/ollama/main.go

@@ -159,7 +159,7 @@ func response2OpenAI(meta *meta.Meta, response *ChatResponse) *relaymodel.TextRe
 		Object:  relaymodel.ChatCompletionObject,
 		Created: time.Now().Unix(),
 		Choices: []*relaymodel.TextResponseChoice{&choice},
-		Usage: relaymodel.Usage{
+		Usage: relaymodel.ChatUsage{
 			PromptTokens:     response.PromptEvalCount,
 			CompletionTokens: response.EvalCount,
 			TotalTokens:      response.PromptEvalCount + response.EvalCount,
@@ -194,7 +194,7 @@ func streamResponse2OpenAI(
 	}
 
 	if ollamaResponse.EvalCount != 0 {
-		response.Usage = &relaymodel.Usage{
+		response.Usage = &relaymodel.ChatUsage{
 			PromptTokens:     ollamaResponse.PromptEvalCount,
 			CompletionTokens: ollamaResponse.EvalCount,
 			TotalTokens:      ollamaResponse.PromptEvalCount + ollamaResponse.EvalCount,
@@ -217,7 +217,7 @@ func StreamHandler(
 
 	log := common.GetLogger(c)
 
-	var usage *relaymodel.Usage
+	var usage *relaymodel.ChatUsage
 	scanner := bufio.NewScanner(resp.Body)
 	buf := openai.GetScannerBuffer()
 	defer openai.PutScannerBuffer(buf)
@@ -318,7 +318,7 @@ func EmbeddingHandler(
 	fullTextResponse := embeddingResponseOllama2OpenAI(meta, &ollamaResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -327,7 +327,7 @@ func EmbeddingHandler(
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }
 
 func embeddingResponseOllama2OpenAI(
@@ -338,7 +338,7 @@ func embeddingResponseOllama2OpenAI(
 		Object: "list",
 		Data:   make([]*relaymodel.EmbeddingResponseItem, 0, len(response.Embeddings)),
 		Model:  meta.OriginModel,
-		Usage: relaymodel.Usage{
+		Usage: relaymodel.EmbeddingUsage{
 			PromptTokens: response.PromptEvalCount,
 			TotalTokens:  response.PromptEvalCount,
 		},
@@ -376,7 +376,7 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	fullTextResponse := response2OpenAI(meta, &ollamaResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -385,5 +385,5 @@ func Handler(meta *meta.Meta, c *gin.Context, resp *http.Response) (model.Usage,
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }

+ 3 - 3
core/relay/adaptor/openai/adaptor.go

@@ -188,9 +188,9 @@ func DoResponse(
 		usage, err = RerankHandler(meta, c, resp)
 	case mode.Moderations:
 		usage, err = ModerationsHandler(meta, c, resp)
-	case mode.Embeddings, mode.Completions:
-		fallthrough
-	case mode.ChatCompletions:
+	case mode.Embeddings:
+		usage, err = EmbeddingsHandler(meta, c, resp, nil)
+	case mode.Completions, mode.ChatCompletions:
 		if utils.IsStreamResponse(resp) {
 			usage, err = StreamHandler(meta, c, resp, nil)
 		} else {

+ 7 - 6
core/relay/adaptor/openai/chat.go

@@ -167,14 +167,14 @@ func patchStreamOptions(node *ast.Node) error {
 
 func GetUsageOrChatChoicesResponseFromNode(
 	node *ast.Node,
-) (*relaymodel.Usage, []*relaymodel.ChatCompletionsStreamResponseChoice, error) {
+) (*relaymodel.ChatUsage, []*relaymodel.ChatCompletionsStreamResponseChoice, error) {
 	usageNode, err := node.Get("usage").Raw()
 	if err != nil {
 		if !errors.Is(err, ast.ErrNotExist) {
 			return nil, nil, err
 		}
 	} else {
-		var usage relaymodel.Usage
+		var usage relaymodel.ChatUsage
 		err = sonic.UnmarshalString(usageNode, &usage)
 		if err != nil {
 			return nil, nil, err
@@ -220,7 +220,7 @@ func StreamHandler(
 	defer PutScannerBuffer(buf)
 	scanner.Buffer(*buf, cap(*buf))
 
-	var usage relaymodel.Usage
+	var usage relaymodel.ChatUsage
 
 	for scanner.Scan() {
 		data := scanner.Bytes()
@@ -304,14 +304,14 @@ func StreamHandler(
 
 func GetUsageOrChoicesResponseFromNode(
 	node *ast.Node,
-) (*relaymodel.Usage, []*relaymodel.TextResponseChoice, error) {
+) (*relaymodel.ChatUsage, []*relaymodel.TextResponseChoice, error) {
 	usageNode, err := node.Get("usage").Raw()
 	if err != nil {
 		if !errors.Is(err, ast.ErrNotExist) {
 			return nil, nil, err
 		}
 	} else {
-		var usage relaymodel.Usage
+		var usage relaymodel.ChatUsage
 		err = sonic.UnmarshalString(usageNode, &usage)
 		if err != nil {
 			return nil, nil, err
@@ -375,6 +375,7 @@ func Handler(
 			)
 		}
 	}
+
 	usage, choices, err := GetUsageOrChoicesResponseFromNode(&node)
 	if err != nil {
 		return model.Usage{}, relaymodel.WrapperOpenAIError(
@@ -395,7 +396,7 @@ func Handler(
 			}
 			completionTokens += CountTokenText(choice.Message.StringContent(), meta.ActualModel)
 		}
-		usage = &relaymodel.Usage{
+		usage = &relaymodel.ChatUsage{
 			PromptTokens:     int64(meta.RequestUsage.InputTokens),
 			CompletionTokens: completionTokens,
 			TotalTokens:      int64(meta.RequestUsage.InputTokens) + completionTokens,

+ 137 - 10
core/relay/adaptor/openai/embeddings.go

@@ -3,12 +3,18 @@ package openai
 import (
 	"bytes"
 	"errors"
+	"io"
 	"net/http"
+	"strconv"
 
+	"github.com/bytedance/sonic"
 	"github.com/bytedance/sonic/ast"
+	"github.com/gin-gonic/gin"
 	"github.com/labring/aiproxy/core/common"
+	"github.com/labring/aiproxy/core/model"
 	"github.com/labring/aiproxy/core/relay/adaptor"
 	"github.com/labring/aiproxy/core/relay/meta"
+	relaymodel "github.com/labring/aiproxy/core/relay/model"
 )
 
 func ConvertEmbeddingsRequest(
@@ -36,17 +42,14 @@ func ConvertEmbeddingsRequest(
 
 	if inputToSlices {
 		inputNode := node.Get("input")
-		if inputNode.Exists() {
+		if inputNode.Exists() && inputNode.TypeSafe() == ast.V_STRING {
 			inputString, err := inputNode.String()
 			if err != nil {
-				if !errors.Is(err, ast.ErrUnsupportType) {
-					return adaptor.ConvertResult{}, err
-				}
-			} else {
-				_, err = node.SetAny("input", []string{inputString})
-				if err != nil {
-					return adaptor.ConvertResult{}, err
-				}
+				return adaptor.ConvertResult{}, err
+			}
+			_, err = node.SetAny("input", []string{inputString})
+			if err != nil {
+				return adaptor.ConvertResult{}, err
 			}
 		}
 	}
@@ -57,8 +60,132 @@ func ConvertEmbeddingsRequest(
 	}
 	return adaptor.ConvertResult{
 		Header: http.Header{
-			"Content-Type": {"application/json"},
+			"Content-Type":   {"application/json"},
+			"Content-Length": {strconv.Itoa(len(jsonData))},
 		},
 		Body: bytes.NewReader(jsonData),
 	}, nil
 }
+
+func GetEmbeddingsUsageFromNode(
+	node *ast.Node,
+) (*relaymodel.EmbeddingUsage, error) {
+	usageNode, err := node.Get("usage").Raw()
+	if err != nil {
+		if !errors.Is(err, ast.ErrNotExist) {
+			return nil, err
+		}
+		return nil, nil
+	}
+	var usage relaymodel.EmbeddingUsage
+	err = sonic.UnmarshalString(usageNode, &usage)
+	if err != nil {
+		return nil, err
+	}
+	return &usage, nil
+}
+
+func EmbeddingsHandler(
+	meta *meta.Meta,
+	c *gin.Context,
+	resp *http.Response,
+	preHandler PreHandler,
+) (model.Usage, adaptor.Error) {
+	if resp.StatusCode != http.StatusOK {
+		return model.Usage{}, ErrorHanlder(resp)
+	}
+
+	defer resp.Body.Close()
+
+	log := common.GetLogger(c)
+
+	responseBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return model.Usage{}, relaymodel.WrapperOpenAIError(
+			err,
+			"read_response_body_failed",
+			http.StatusInternalServerError,
+		)
+	}
+
+	node, err := sonic.Get(responseBody)
+	if err != nil {
+		return model.Usage{}, relaymodel.WrapperOpenAIError(
+			err,
+			"unmarshal_response_body_failed",
+			http.StatusInternalServerError,
+		)
+	}
+	if preHandler != nil {
+		err := preHandler(meta, &node)
+		if err != nil {
+			return model.Usage{}, relaymodel.WrapperOpenAIError(
+				err,
+				"pre_handler_failed",
+				http.StatusInternalServerError,
+			)
+		}
+	}
+
+	usage, err := GetEmbeddingsUsageFromNode(&node)
+	if err != nil {
+		return model.Usage{}, relaymodel.WrapperOpenAIError(
+			err,
+			"unmarshal_response_body_failed",
+			http.StatusInternalServerError,
+		)
+	}
+
+	if usage == nil ||
+		(usage.TotalTokens == 0 && usage.PromptTokens == 0) {
+		usage = &relaymodel.EmbeddingUsage{
+			PromptTokens: int64(meta.RequestUsage.InputTokens),
+			TotalTokens:  int64(meta.RequestUsage.InputTokens),
+		}
+		if meta.RequestUsage.ImageInputTokens != 0 {
+			usage.PromptTokensDetails = &relaymodel.EmbeddingPromptTokensDetails{
+				ImageTokens: int64(meta.RequestUsage.ImageInputTokens),
+			}
+		}
+		_, err = node.Set("usage", ast.NewAny(usage))
+		if err != nil {
+			return usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
+				err,
+				"set_usage_failed",
+				http.StatusInternalServerError,
+			)
+		}
+	} else if usage.TotalTokens != 0 && usage.PromptTokens == 0 { // some channels don't return prompt tokens
+		usage.PromptTokens = usage.TotalTokens
+		_, err = node.Set("usage", ast.NewAny(usage))
+		if err != nil {
+			return usage.ToModelUsage(), relaymodel.WrapperOpenAIError(err, "set_usage_failed", http.StatusInternalServerError)
+		}
+	}
+
+	_, err = node.Set("model", ast.NewString(meta.OriginModel))
+	if err != nil {
+		return usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
+			err,
+			"set_model_failed",
+			http.StatusInternalServerError,
+		)
+	}
+
+	newData, err := node.MarshalJSON()
+	if err != nil {
+		return usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
+			err,
+			"marshal_response_body_failed",
+			http.StatusInternalServerError,
+		)
+	}
+
+	c.Writer.Header().Set("Content-Type", "application/json")
+	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(newData)))
+	_, err = c.Writer.Write(newData)
+	if err != nil {
+		log.Warnf("write response body failed: %v", err)
+	}
+	return usage.ToModelUsage(), nil
+}
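
The exported helper above can also be exercised on its own; a minimal sketch (sample body assumed) of the case EmbeddingsHandler compensates for, where a channel reports total_tokens but omits prompt_tokens:

package main

import (
	"fmt"

	"github.com/bytedance/sonic"
	"github.com/labring/aiproxy/core/relay/adaptor/openai"
)

func main() {
	// Upstream embeddings response whose usage lacks prompt_tokens.
	body := []byte(`{"object":"list","data":[],"usage":{"total_tokens":42}}`)
	node, err := sonic.Get(body)
	if err != nil {
		panic(err)
	}
	usage, err := openai.GetEmbeddingsUsageFromNode(&node)
	if err != nil {
		panic(err)
	}
	// EmbeddingsHandler treats PromptTokens == 0 with TotalTokens != 0 as a
	// channel that does not report prompt tokens and copies TotalTokens over.
	fmt.Println(usage.PromptTokens, usage.TotalTokens) // 0 42
}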

+ 2 - 2
core/relay/adaptor/openai/helper.go

@@ -5,8 +5,8 @@ import (
 	"github.com/labring/aiproxy/core/relay/model"
 )
 
-func ResponseText2Usage(responseText, modeName string, promptTokens int64) model.Usage {
-	usage := model.Usage{
+func ResponseText2Usage(responseText, modeName string, promptTokens int64) model.ChatUsage {
+	usage := model.ChatUsage{
 		PromptTokens:     promptTokens,
 		CompletionTokens: CountTokenText(responseText, modeName),
 	}

+ 1 - 1
core/relay/adaptor/openai/stt.go

@@ -138,7 +138,7 @@ func STTHandler(
 		promptTokens = CountTokenText(text, meta.ActualModel)
 	}
 
-	usage := relaymodel.Usage{
+	usage := relaymodel.ChatUsage{
 		PromptTokens: promptTokens,
 		TotalTokens:  promptTokens,
 	}

+ 3 - 7
core/relay/adaptor/zhipu/main.go

@@ -36,7 +36,7 @@ func EmbeddingsHandler(c *gin.Context, resp *http.Response) (model.Usage, adapto
 	fullTextResponse := embeddingResponseZhipu2OpenAI(&zhipuResponse)
 	jsonResponse, err := sonic.Marshal(fullTextResponse)
 	if err != nil {
-		return fullTextResponse.ToModelUsage(), relaymodel.WrapperOpenAIError(
+		return fullTextResponse.Usage.ToModelUsage(), relaymodel.WrapperOpenAIError(
 			err,
 			"marshal_response_body_failed",
 			http.StatusInternalServerError,
@@ -45,7 +45,7 @@ func EmbeddingsHandler(c *gin.Context, resp *http.Response) (model.Usage, adapto
 	c.Writer.Header().Set("Content-Type", "application/json")
 	c.Writer.Header().Set("Content-Length", strconv.Itoa(len(jsonResponse)))
 	_, _ = c.Writer.Write(jsonResponse)
-	return fullTextResponse.ToModelUsage(), nil
+	return fullTextResponse.Usage.ToModelUsage(), nil
 }
 
 func embeddingResponseZhipu2OpenAI(response *EmbeddingResponse) *relaymodel.EmbeddingResponse {
@@ -53,11 +53,7 @@ func embeddingResponseZhipu2OpenAI(response *EmbeddingResponse) *relaymodel.Embe
 		Object: "list",
 		Data:   make([]*relaymodel.EmbeddingResponseItem, 0, len(response.Embeddings)),
 		Model:  response.Model,
-		Usage: relaymodel.Usage{
-			PromptTokens:     response.PromptTokens,
-			CompletionTokens: response.CompletionTokens,
-			TotalTokens:      response.TotalTokens,
-		},
+		Usage:  response.Usage,
 	}
 
 	for _, item := range response.Embeddings {

+ 4 - 4
core/relay/adaptor/zhipu/model.go

@@ -18,10 +18,10 @@ type EmbeddingRequest struct {
 }
 
 type EmbeddingResponse struct {
-	Model       string          `json:"model"`
-	Object      string          `json:"object"`
-	Embeddings  []EmbeddingData `json:"data"`
-	model.Usage `json:"usage"`
+	Model      string               `json:"model"`
+	Object     string               `json:"object"`
+	Embeddings []EmbeddingData      `json:"data"`
+	Usage      model.EmbeddingUsage `json:"usage"`
 }
 
 type EmbeddingData struct {

+ 3 - 3
core/relay/model/chat.go

@@ -5,7 +5,7 @@ import (
 	"github.com/labring/aiproxy/core/relay/adaptor"
 )
 
-type Usage struct {
+type ChatUsage struct {
 	PromptTokens     int64 `json:"prompt_tokens,omitempty"`
 	CompletionTokens int64 `json:"completion_tokens,omitempty"`
 	TotalTokens      int64 `json:"total_tokens"`
@@ -16,7 +16,7 @@ type Usage struct {
 	CompletionTokensDetails *CompletionTokensDetails `json:"completion_tokens_details,omitempty"`
 }
 
-func (u Usage) ToModelUsage() model.Usage {
+func (u ChatUsage) ToModelUsage() model.Usage {
 	usage := model.Usage{
 		InputTokens:    model.ZeroNullInt64(u.PromptTokens),
 		OutputTokens:   model.ZeroNullInt64(u.CompletionTokens),
@@ -33,7 +33,7 @@ func (u Usage) ToModelUsage() model.Usage {
 	return usage
 }
 
-func (u *Usage) Add(other *Usage) {
+func (u *ChatUsage) Add(other *ChatUsage) {
 	if other == nil {
 		return
 	}

+ 3 - 3
core/relay/model/completions.go

@@ -80,7 +80,7 @@ type ChatCompletionsStreamResponseChoice struct {
 }
 
 type ChatCompletionsStreamResponse struct {
-	Usage   *Usage                                 `json:"usage,omitempty"`
+	Usage   *ChatUsage                             `json:"usage,omitempty"`
 	ID      string                                 `json:"id"`
 	Object  string                                 `json:"object"`
 	Model   string                                 `json:"model"`
@@ -100,8 +100,8 @@ type TextResponse struct {
 	Model   string                `json:"model,omitempty"`
 	Object  string                `json:"object"`
 	Choices []*TextResponseChoice `json:"choices"`
-	Usage   `json:"usage"`
-	Created int64 `json:"created"`
+	Usage   ChatUsage             `json:"usage"`
+	Created int64                 `json:"created"`
 }
 
 type Message struct {

+ 25 - 1
core/relay/model/embed.go

@@ -1,5 +1,7 @@
 package model
 
+import "github.com/labring/aiproxy/core/model"
+
 type EmbeddingRequest struct {
 	Input          string `json:"input"`
 	Model          string `json:"model"`
@@ -17,5 +19,27 @@ type EmbeddingResponse struct {
 	Object string                   `json:"object"`
 	Model  string                   `json:"model"`
 	Data   []*EmbeddingResponseItem `json:"data"`
-	Usage  `json:"usage"`
+	Usage  EmbeddingUsage           `json:"usage"`
+}
+
+type EmbeddingUsage struct {
+	PromptTokens        int64                         `json:"prompt_tokens,omitempty"`
+	TotalTokens         int64                         `json:"total_tokens"`
+	PromptTokensDetails *EmbeddingPromptTokensDetails `json:"prompt_tokens_details,omitempty"`
+}
+
+type EmbeddingPromptTokensDetails struct {
+	TextTokens  int64 `json:"text_tokens,omitempty"`
+	ImageTokens int64 `json:"image_tokens,omitempty"`
+}
+
+func (u EmbeddingUsage) ToModelUsage() model.Usage {
+	usage := model.Usage{
+		InputTokens: model.ZeroNullInt64(u.PromptTokens),
+		TotalTokens: model.ZeroNullInt64(u.TotalTokens),
+	}
+	if u.PromptTokensDetails != nil {
+		usage.ImageInputTokens = model.ZeroNullInt64(u.PromptTokensDetails.ImageTokens)
+	}
+	return usage
 }
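
A minimal sketch (token counts assumed) of how the new EmbeddingUsage converts into the billing-side model.Usage: image tokens flow from PromptTokensDetails.ImageTokens into ImageInputTokens.

package main

import (
	"fmt"

	relaymodel "github.com/labring/aiproxy/core/relay/model"
)

func main() {
	u := relaymodel.EmbeddingUsage{
		PromptTokens: 1034,
		TotalTokens:  1034,
		PromptTokensDetails: &relaymodel.EmbeddingPromptTokensDetails{
			TextTokens:  10,
			ImageTokens: 1024,
		},
	}
	mu := u.ToModelUsage()
	// InputTokens/TotalTokens come from the prompt and total counts,
	// ImageInputTokens from the image detail.
	fmt.Println(mu.InputTokens, mu.ImageInputTokens, mu.TotalTokens) // 1034 1024 1034
}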

+ 1 - 0
core/relay/model/tts.go

@@ -6,4 +6,5 @@ type TextToSpeechRequest struct {
 	Voice          string  `json:"voice"           binding:"required"`
 	ResponseFormat string  `json:"response_format"`
 	Speed          float64 `json:"speed"`
+	StreamFormat   string  `json:"stream_format"`
 }