
feat: plugin streamfake (#306)

* feat: add stream fake plugin

* feat: add logprobs support

* feat: sort tool calls

* docs: add stream fake plugin docs

* fix: ci lint

* fix: ignore WriteHeaderNow

* feat: add stream fake readme
zijiren 6 months ago
commit 66fdd79fa8

+ 15 - 1
README.md

@@ -49,6 +49,8 @@ AI Proxy is a powerful, production-ready AI gateway that provides intelligent re
 
 - **Cache Plugin**: High-performance caching for identical requests with Redis/memory storage
 - **Web Search Plugin**: Real-time web search capabilities with support for Google, Bing, and Arxiv
+- **Think Split Plugin**: Support for reasoning models with content splitting, automatically handling `<think>` tags
+- **Stream Fake Plugin**: Avoids non-streaming request timeouts through internal streaming transmission
 - **Extensible Architecture**: Easy to add custom plugins for additional functionality
 
 ### 🔧 **Advanced Capabilities**
@@ -59,7 +61,7 @@ AI Proxy is a powerful, production-ready AI gateway that provides intelligent re
 - **Think Mode**: Support for reasoning models with content splitting
 - **Built-in Tokenizer**: No external tiktoken dependencies
 
-### 📊 **Management Panel**
+## 📊 Management Panel
 
 AI Proxy provides a management panel for configuring and monitoring AI Proxy.
 
@@ -79,6 +81,7 @@ graph TB
     Plugins --> CachePlugin[Cache Plugin]
     Plugins --> SearchPlugin[Web Search Plugin]
     Plugins --> ThinkSplitPlugin[Think Split Plugin]
+    Plugins --> StreamFakePlugin[Stream Fake Plugin]
 
     Router --> Provider1[OpenAI]
     Router --> Provider2[Anthropic]
@@ -217,6 +220,17 @@ The Think Split Plugin supports content splitting for reasoning models:
 
 [View Think Split Plugin Documentation](./core/relay/plugin/thinksplit/README.md)
 
+### Stream Fake Plugin
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests:
+
+- **Timeout Avoidance**: Prevents request timeouts through internal streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into complete non-streaming responses
+- **Connection Keep-Alive**: Maintains active connections through streaming transmission to avoid network timeouts
+
+[View Stream Fake Plugin Documentation](./core/relay/plugin/streamfake/README.md)
+
 ## 📚 API Documentation
 
 ### Interactive API Explorer

+ 14 - 0
README.zh.md

@@ -49,6 +49,8 @@ AI Proxy is a powerful, production-ready AI gateway that provides intelligent re
 
 - **Cache Plugin**: High-performance caching for identical requests with Redis/memory storage
 - **Web Search Plugin**: Real-time web search capabilities with support for Google, Bing, and Arxiv
+- **Think Split Plugin**: Support for reasoning models with content splitting, automatically handling `<think>` tags
+- **Stream Fake Plugin**: Avoids non-streaming request timeouts through internal streaming transmission
 - **Extensible Architecture**: Easy to add custom plugins for additional functionality
 
 ### 🔧 **Advanced Capabilities**
@@ -79,6 +81,7 @@ graph TB
     Plugins --> CachePlugin[Cache Plugin]
     Plugins --> SearchPlugin[Web Search Plugin]
     Plugins --> ThinkSplitPlugin[Think Split Plugin]
+    Plugins --> StreamFakePlugin[Stream Fake Plugin]
     
     Router --> Provider1[OpenAI]
     Router --> Provider2[Anthropic]
@@ -217,6 +220,17 @@ AI Proxy supports a plugin system to extend its functionality. Currently available plugins:
 
 [View Think Split Plugin Documentation](./core/relay/plugin/thinksplit/README.zh.md)
 
+### Stream Fake Plugin
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests:
+
+- **Timeout Avoidance**: Prevents timeouts caused by long waits through internal streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into a complete non-streaming response
+- **Connection Keep-Alive**: Keeps the connection active through streaming transmission to avoid network timeouts
+
+[View Stream Fake Plugin Documentation](./core/relay/plugin/streamfake/README.cn.md)
+
 ## 📚 API Documentation
 
 ### Interactive API Explorer

+ 8 - 15
core/controller/relay-channel.go

@@ -1,6 +1,7 @@
 package controller
 
 import (
+	"context"
 	"errors"
 	"fmt"
 	"math/rand/v2"
@@ -343,21 +344,13 @@ func getInitialChannel(c *gin.Context, modelName string, m mode.Mode) (*initialC
 	}, nil
 }
 
-func getWebSearchChannel(c *gin.Context, modelName string) (*model.Channel, error) {
-	log := common.GetLogger(c)
-	mc := middleware.GetModelCaches(c)
-
-	ids, err := monitor.GetBannedChannelsWithModel(c.Request.Context(), modelName)
-	if err != nil {
-		log.Errorf("get %s auto banned channels failed: %+v", modelName, err)
-	}
-	log.Debugf("%s model banned channels: %+v", modelName, ids)
-
-	errorRates, err := monitor.GetModelChannelErrorRate(c.Request.Context(), modelName)
-	if err != nil {
-		log.Errorf("get channel model error rates failed: %+v", err)
-	}
-
+func getWebSearchChannel(
+	ctx context.Context,
+	mc *model.ModelCaches,
+	modelName string,
+) (*model.Channel, error) {
+	ids, _ := monitor.GetBannedChannelsWithModel(ctx, modelName)
+	errorRates, _ := monitor.GetModelChannelErrorRate(ctx, modelName)
 	channel, _, err := getChannelWithFallback(
 		mc,
 		nil,

+ 17 - 10
core/controller/relay-controller.go

@@ -2,6 +2,7 @@ package controller
 
 import (
 	"bytes"
+	"context"
 	"errors"
 	"fmt"
 	"io"
@@ -28,6 +29,7 @@ import (
 	"github.com/labring/aiproxy/core/relay/plugin"
 	"github.com/labring/aiproxy/core/relay/plugin/cache"
 	monitorplugin "github.com/labring/aiproxy/core/relay/plugin/monitor"
+	"github.com/labring/aiproxy/core/relay/plugin/streamfake"
 	"github.com/labring/aiproxy/core/relay/plugin/thinksplit"
 	websearch "github.com/labring/aiproxy/core/relay/plugin/web-search"
 )
@@ -77,6 +79,19 @@ func (s *storeImpl) SaveStore(store adaptor.StoreCache) error {
 	return err
 }
 
+func wrapPlugin(ctx context.Context, mc *model.ModelCaches, a adaptor.Adaptor) adaptor.Adaptor {
+	return plugin.WrapperAdaptor(a,
+		monitorplugin.NewGroupMonitorPlugin(),
+		cache.NewCachePlugin(common.RDB),
+		streamfake.NewStreamFakePlugin(),
+		websearch.NewWebSearchPlugin(func(modelName string) (*model.Channel, error) {
+			return getWebSearchChannel(ctx, mc, modelName)
+		}),
+		thinksplit.NewThinkPlugin(),
+		monitorplugin.NewChannelMonitorPlugin(),
+	)
+}
+
 func relayHandler(c *gin.Context, meta *meta.Meta) *controller.HandleResult {
 	log := common.GetLogger(c)
 	middleware.SetLogFieldsFromMeta(meta, log.Data)
@@ -92,17 +107,9 @@ func relayHandler(c *gin.Context, meta *meta.Meta) *controller.HandleResult {
 		}
 	}
 
-	a := plugin.WrapperAdaptor(adaptor,
-		monitorplugin.NewGroupMonitorPlugin(),
-		cache.NewCachePlugin(common.RDB),
-		websearch.NewWebSearchPlugin(func(modelName string) (*model.Channel, error) {
-			return getWebSearchChannel(c, modelName)
-		}),
-		thinksplit.NewThinkPlugin(),
-		monitorplugin.NewChannelMonitorPlugin(),
-	)
+	adaptor = wrapPlugin(c.Request.Context(), middleware.GetModelCaches(c), adaptor)
 
-	return controller.Handle(a, c, meta, adaptorStore)
+	return controller.Handle(adaptor, c, meta, adaptorStore)
 }
 
 func relayController(m mode.Mode) RelayController {

+ 5 - 5
core/relay/adaptor/anthropic/openai.go

@@ -285,7 +285,7 @@ func StreamResponse2OpenAI(
 	var content string
 	var thinking string
 	var stopReason string
-	tools := make([]*relaymodel.Tool, 0)
+	tools := make([]*relaymodel.ToolCall, 0)
 
 	var claudeResponse StreamResponse
 	err := sonic.Unmarshal(respData, &claudeResponse)
@@ -309,7 +309,7 @@ func StreamResponse2OpenAI(
 		if claudeResponse.ContentBlock != nil {
 			content = claudeResponse.ContentBlock.Text
 			if claudeResponse.ContentBlock.Type == toolUseType {
-				tools = append(tools, &relaymodel.Tool{
+				tools = append(tools, &relaymodel.ToolCall{
 					ID:   claudeResponse.ContentBlock.ID,
 					Type: "function",
 					Function: relaymodel.Function{
@@ -322,7 +322,7 @@ func StreamResponse2OpenAI(
 		if claudeResponse.Delta != nil {
 			switch claudeResponse.Delta.Type {
 			case "input_json_delta":
-				tools = append(tools, &relaymodel.Tool{
+				tools = append(tools, &relaymodel.ToolCall{
 					Type: "function",
 					Function: relaymodel.Function{
 						Arguments: claudeResponse.Delta.PartialJSON,
@@ -396,7 +396,7 @@ func Response2OpenAI(
 
 	var content string
 	var thinking string
-	tools := make([]*relaymodel.Tool, 0)
+	tools := make([]*relaymodel.ToolCall, 0)
 	for _, v := range claudeResponse.Content {
 		switch v.Type {
 		case conetentTypeText:
@@ -405,7 +405,7 @@ func Response2OpenAI(
 			thinking = v.Thinking
 		case toolUseType:
 			args, _ := sonic.MarshalString(v.Input)
-			tools = append(tools, &relaymodel.Tool{
+			tools = append(tools, &relaymodel.ToolCall{
 				ID:   v.ID,
 				Type: "function",
 				Function: relaymodel.Function{

+ 2 - 2
core/relay/adaptor/gemini/main.go

@@ -428,7 +428,7 @@ type ChatPromptFeedback struct {
 	SafetyRatings []ChatSafetyRating `json:"safetyRatings"`
 }
 
-func getToolCall(item *Part) (*relaymodel.Tool, error) {
+func getToolCall(item *Part) (*relaymodel.ToolCall, error) {
 	if item.FunctionCall == nil {
 		return nil, nil
 	}
@@ -436,7 +436,7 @@ func getToolCall(item *Part) (*relaymodel.Tool, error) {
 	if err != nil {
 		return nil, err
 	}
-	toolCall := relaymodel.Tool{
+	toolCall := relaymodel.ToolCall{
 		ID:   openai.CallID(),
 		Type: "function",
 		Function: relaymodel.Function{

+ 3 - 3
core/relay/adaptor/ollama/main.go

@@ -116,17 +116,17 @@ func ConvertRequest(meta *meta.Meta, req *http.Request) (adaptor.ConvertResult,
 	}, nil
 }
 
-func getToolCalls(ollamaResponse *ChatResponse) []*relaymodel.Tool {
+func getToolCalls(ollamaResponse *ChatResponse) []*relaymodel.ToolCall {
 	if ollamaResponse.Message == nil || len(ollamaResponse.Message.ToolCalls) == 0 {
 		return nil
 	}
-	toolCalls := make([]*relaymodel.Tool, 0, len(ollamaResponse.Message.ToolCalls))
+	toolCalls := make([]*relaymodel.ToolCall, 0, len(ollamaResponse.Message.ToolCalls))
 	for _, tool := range ollamaResponse.Message.ToolCalls {
 		argString, err := sonic.MarshalString(tool.Function.Arguments)
 		if err != nil {
 			continue
 		}
-		toolCalls = append(toolCalls, &relaymodel.Tool{
+		toolCalls = append(toolCalls, &relaymodel.ToolCall{
 			ID:   openai.CallID(),
 			Type: "function",
 			Function: relaymodel.Function{

+ 6 - 6
core/relay/model/completions.go

@@ -105,12 +105,12 @@ type TextResponse struct {
 }
 
 type Message struct {
-	Content          any     `json:"content,omitempty"`
-	ReasoningContent string  `json:"reasoning_content,omitempty"`
-	Name             *string `json:"name,omitempty"`
-	Role             string  `json:"role,omitempty"`
-	ToolCallID       string  `json:"tool_call_id,omitempty"`
-	ToolCalls        []*Tool `json:"tool_calls,omitempty"`
+	Content          any         `json:"content,omitempty"`
+	ReasoningContent string      `json:"reasoning_content,omitempty"`
+	Name             *string     `json:"name,omitempty"`
+	Role             string      `json:"role,omitempty"`
+	ToolCallID       string      `json:"tool_call_id,omitempty"`
+	ToolCalls        []*ToolCall `json:"tool_calls,omitempty"`
 }
 
 func (m *Message) IsStringContent() bool {

+ 8 - 2
core/relay/model/tool.go

@@ -1,8 +1,7 @@
 package model
 
 type Tool struct {
-	ID       string   `json:"id,omitempty"`
-	Type     string   `json:"type,omitempty"`
+	Type     string   `json:"type"`
 	Function Function `json:"function"`
 }
 
@@ -12,3 +11,10 @@ type Function struct {
 	Description string `json:"description,omitempty"`
 	Name        string `json:"name,omitempty"`
 }
+
+type ToolCall struct {
+	Index    int      `json:"index"`
+	ID       string   `json:"id"`
+	Type     string   `json:"type"`
+	Function Function `json:"function"`
+}

+ 136 - 0
core/relay/plugin/streamfake/README.cn.md

@@ -0,0 +1,136 @@
+# Stream Fake Plugin Configuration Guide
+
+## Overview
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests. When an AI model takes a long time to respond, a non-streaming request may time out while waiting for the complete response. The plugin avoids this by internally converting the non-streaming request to a streaming request and then reassembling the streaming response into the non-streaming format expected by the client, so timeouts are avoided while client compatibility is preserved.
+
+## Features
+
+- **Timeout Avoidance**: Prevents request timeouts caused by long waits through streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into a complete non-streaming response
+- **Content Integrity**: Ensures all content types are properly processed and aggregated:
+  - Regular content
+  - Reasoning content (for models that support thinking processes)
+  - Tool calls and their proper merging
+  - Log probabilities
+- **Connection Keep-Alive**: Keeps the connection active through streaming transmission to avoid network timeouts
+
+## Problems Solved
+
+### Primary Issue: Upstream Request Timeout
+
+- **Long Response Timeout**: When AI models generate long texts or complex responses, non-streaming requests are prone to time out
+- **Network Timeout**: In unstable network environments, long waits for a complete response cause connection timeouts
+- **Proxy Timeout**: When going through proxy servers, the proxy may disconnect after a prolonged period without data
+
+### Solution
+
+Through internal streaming transmission the connection stays active at all times, avoiding the various timeout issues while the client still receives the expected non-streaming response format.
+
+## Use Cases
+
+1. **Long Text Generation**: Avoiding timeouts when generating long articles, reports, or code
+2. **Complex Reasoning Tasks**: Handling complex problems that require extended thinking time
+3. **Unstable Network Environments**: Environments with high latency or unstable networks
+4. **Strict Timeout Restrictions**: Clients or middleware with strict timeout limits
+5. **Legacy System Compatibility**: Legacy systems where client timeout settings cannot be modified
+
+## How It Works
+
+### Problem Identification
+
+1. Detects non-streaming chat completion requests (`"stream": false` or not set)
+2. Identifies long-response scenarios that may cause timeouts
+
+### Internal Conversion
+
+1. Modifies the request to streaming format (`"stream": true`)
+2. Forwards the modified request to the upstream API
+3. Begins receiving streaming response data
+
+### Response Processing
+
+1. Receives streaming data chunks in real time, keeping the connection active
+2. Aggregates all response content
+3. Processes the different types of content fragments
+4. Reconstructs the complete non-streaming response format
+5. Sets the correct response headers and returns the response to the client
+
+### Timeout Avoidance Mechanism
+
+- **Continuous Data Flow**: The streaming response ensures the connection always has data in transit
+- **Connection Keep-Alive**: Avoids disconnection caused by a prolonged period without a response
+- **Progressive Processing**: Receives and processes data as it arrives, reducing overall wait time
+
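+In a streaming response, tool calls arrive as indexed fragments whose `arguments` text is split across chunks; the plugin folds fragments that share the same `index` into a single tool call. Below is a simplified, illustrative sketch of that aggregation (the real logic lives in `mergeToolCalls` in `fake.go`; the helper name here is hypothetical):
+
+```go
+package streamfake
+
+import relaymodel "github.com/labring/aiproxy/core/relay/model"
+
+// appendFragment folds one streaming tool call delta into the accumulated
+// list: fragments with the same Index have their argument text concatenated,
+// while a new Index starts a new tool call entry.
+func appendFragment(calls []*relaymodel.ToolCall, delta *relaymodel.ToolCall) []*relaymodel.ToolCall {
+	for _, call := range calls {
+		if call.Index == delta.Index {
+			call.Function.Arguments += delta.Function.Arguments
+			return calls
+		}
+	}
+	return append(calls, delta)
+}
+```
+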
+## Configuration Example
+
+```json
+{
+    "model": "gpt-4",
+    "type": 1,
+    "plugin": {
+        "stream-fake": {
+            "enable": true
+        }
+    }
+}
+```
+
+## Configuration Fields
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `enable` | bool | Yes | false | Whether to enable the Stream Fake Plugin to avoid timeout issues |
+
+## Timeout Scenario Examples
+
+### Scenario 1: Long Text Generation Timeout
+
+**Problem**: A request to generate a 5000-word technical document; the non-streaming request times out after 60 seconds
+
+**Original Request**:
+
+```json
+{
+    "model": "gpt-4",
+    "messages": [
+        {
+            "role": "user",
+            "content": "Please write a detailed 5000-word technical document introducing microservice architecture design principles and best practices"
+        }
+    ],
+    "stream": false,
+    "max_tokens": 4000
+}
+```
+
+**Plugin Processing**:
+
+1. Automatically converts the request to `"stream": true`
+2. Receives response fragments in real time, avoiding the timeout
+3. Reconstructs a complete non-streaming response and returns it
+
+### Scenario 2: Complex Reasoning Task Timeout
+
+**Problem**: A complex mathematical problem requires long thinking time, causing the request to time out
+
+**Solution**:
+
+- The plugin keeps the connection active while the model is thinking
+- Even very long reasoning time does not cause a timeout
+- The client ultimately receives the complete reasoning result
+
+## Performance Benefits
+
+### Timeout Avoidance
+
+- **Eliminates Connection Timeouts**: Streaming transmission keeps the connection active
+- **Avoids Proxy Timeouts**: Intermediate proxies do not disconnect due to prolonged periods without data
+- **Reduces Retry Attempts**: Avoids request retries caused by timeouts
+
+### Response Time
+
+- **Faster Perceived Response**: While total time remains essentially the same, timeout retries are avoided
+- **Better User Experience**: Avoids request failures and the need to reissue requests
+- **Improved Resource Utilization**: Reduces resource waste caused by timeouts

+ 136 - 0
core/relay/plugin/streamfake/README.md

@@ -0,0 +1,136 @@
+# Stream Fake Plugin Configuration Guide
+
+## Overview
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests. When an AI model takes a long time to respond, a non-streaming request may time out while waiting for the complete response. The plugin avoids this by internally converting the non-streaming request to a streaming request and then reassembling the streaming response into the non-streaming format expected by the client, so timeouts are avoided while client compatibility is preserved.
+
+## Features
+
+- **Timeout Avoidance**: Prevents request timeouts caused by long waits through streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into complete non-streaming responses
+- **Content Integrity**: Ensures all content types are properly processed and aggregated:
+  - Regular content
+  - Reasoning content (for models that support thinking processes)
+  - Tool calls and their proper merging
+  - Log probabilities
+- **Connection Keep-Alive**: Maintains active connections through streaming transmission to avoid network timeouts
+
+## Problems Solved
+
+### Primary Issue: Upstream Request Timeout
+
+- **Long Response Timeout**: When AI models generate long texts or complex responses, non-streaming requests are prone to time out
+- **Network Timeout**: In unstable network environments, long waits for complete responses cause connection timeouts
+- **Proxy Timeout**: When going through proxy servers, proxies may disconnect due to prolonged periods without data
+
+### Solution
+
+Through internal streaming transmission, connections remain active at all times, avoiding various timeout issues while clients still receive the expected non-streaming response format.
+
+## Use Cases
+
+1. **Long Text Generation**: Avoiding timeouts when generating long articles, reports, or code
+2. **Complex Reasoning Tasks**: Handling complex problems that require extended thinking time
+3. **Unstable Network Environments**: Environments with high latency or unstable networks
+4. **Strict Timeout Restrictions**: Clients or middleware with strict timeout limitations
+5. **Legacy System Compatibility**: Legacy systems where client timeout settings cannot be modified
+
+## How It Works
+
+### Problem Identification
+
+1. Detects non-streaming chat completion requests (`"stream": false` or not set)
+2. Identifies scenarios with long responses that may cause timeouts
+
+### Internal Conversion
+
+1. Modifies the request to streaming format (`"stream": true`)
+2. Forwards the modified request to the upstream API
+3. Begins receiving streaming response data
+
+### Response Processing
+
+1. Receives streaming data chunks in real time, keeping the connection active
+2. Aggregates all response content
+3. Processes the different types of content fragments
+4. Reconstructs the complete non-streaming response format
+5. Sets the correct response headers and returns the response to the client
+
+### Timeout Avoidance Mechanism
+
+- **Continuous Data Flow**: The streaming response ensures the connection always has data in transit
+- **Connection Keep-Alive**: Avoids disconnection caused by a prolonged period without a response
+- **Progressive Processing**: Receives and processes data as it arrives, reducing overall wait time
+
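+The internal conversion step boils down to rewriting the request body before it is forwarded. Below is a minimal, simplified sketch of that rewrite using the `sonic` AST API that the plugin itself uses (see `core/relay/plugin/streamfake/fake.go` for the full implementation; the helper name is illustrative):
+
+```go
+package streamfake
+
+import (
+	"github.com/bytedance/sonic"
+	"github.com/bytedance/sonic/ast"
+)
+
+// enableStream forces "stream": true on a chat completion request body and
+// returns the rewritten JSON. The real plugin also records that it performed
+// the rewrite so the streaming response can be reassembled afterwards.
+func enableStream(body []byte) ([]byte, error) {
+	node, err := sonic.Get(body)
+	if err != nil {
+		return nil, err
+	}
+	if stream, _ := node.Get("stream").Bool(); stream {
+		// Already a streaming request: nothing to fake.
+		return body, nil
+	}
+	if _, err := node.Set("stream", ast.NewBool(true)); err != nil {
+		return nil, err
+	}
+	return node.MarshalJSON()
+}
+```
+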
+## Configuration Example
+
+```json
+{
+    "model": "gpt-4",
+    "type": 1,
+    "plugin": {
+        "stream-fake": {
+            "enable": true
+        }
+    }
+}
+```
+
+## Configuration Fields
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `enable` | bool | Yes | false | Whether to enable Stream Fake Plugin to avoid timeout issues |
+
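+For reference, the `enable` flag maps onto a one-field configuration struct. The sketch below shows roughly how the plugin resolves it from the model's plugin configuration, mirroring `config.go` and the config lookup in `fake.go` (the helper name is illustrative):
+
+```go
+package streamfake
+
+import "github.com/labring/aiproxy/core/relay/meta"
+
+// Config mirrors the "stream-fake" block in the configuration example above.
+type Config struct {
+	Enable bool `json:"enable"`
+}
+
+// loadConfig decodes the "stream-fake" key of the model's plugin config;
+// the plugin stays inactive unless Enable is true.
+func loadConfig(m *meta.Meta) (*Config, error) {
+	cfg := &Config{}
+	if err := m.ModelConfig.LoadPluginConfig("stream-fake", cfg); err != nil {
+		return nil, err
+	}
+	return cfg, nil
+}
+```
+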
+## Timeout Scenario Examples
+
+### Scenario 1: Long Text Generation Timeout
+
+**Problem**: A request to generate a 5000-word technical document; the non-streaming request times out after 60 seconds
+
+**Original Request**:
+
+```json
+{
+    "model": "gpt-4",
+    "messages": [
+        {
+            "role": "user",
+            "content": "Please write a detailed 5000-word technical document introducing microservice architecture design principles and best practices"
+        }
+    ],
+    "stream": false,
+    "max_tokens": 4000
+}
+```
+
+**Plugin Processing**:
+
+1. Automatically converts to `"stream": true`
+2. Receives response fragments in real time, avoiding the timeout
+3. Reconstructs a complete non-streaming response and returns it
+
+### Scenario 2: Complex Reasoning Task Timeout
+
+**Problem**: A complex mathematical problem requires long thinking time, causing the request to time out
+
+**Solution**:
+
+- The plugin keeps the connection active while the model is thinking
+- No timeout occurs even with extended reasoning time
+- The client ultimately receives the complete reasoning result
+
+## Performance Benefits
+
+### Timeout Avoidance
+
+- **Eliminates Connection Timeouts**: Streaming transmission keeps connections active
+- **Avoids Proxy Timeouts**: Intermediate proxies won't disconnect due to prolonged periods without data
+- **Reduces Retry Attempts**: Avoids request retries caused by timeouts
+
+### Response Time
+
+- **Faster Perceived Response**: While total time remains essentially the same, timeout retries are avoided
+- **Better User Experience**: Avoids request failures and the need to reinitiate requests
+- **Improved Resource Utilization**: Reduces resource waste caused by timeouts

+ 6 - 0
core/relay/plugin/streamfake/config.go

@@ -0,0 +1,6 @@
+package streamfake
+
+// Config represents the plugin configuration
+type Config struct {
+	Enable bool `json:"enable"`
+}

+ 366 - 0
core/relay/plugin/streamfake/fake.go

@@ -0,0 +1,366 @@
+package streamfake
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"net/http"
+	"slices"
+	"strconv"
+
+	"github.com/bytedance/sonic"
+	"github.com/bytedance/sonic/ast"
+	"github.com/gin-gonic/gin"
+	"github.com/labring/aiproxy/core/common"
+	"github.com/labring/aiproxy/core/common/conv"
+	"github.com/labring/aiproxy/core/model"
+	"github.com/labring/aiproxy/core/relay/adaptor"
+	"github.com/labring/aiproxy/core/relay/meta"
+	"github.com/labring/aiproxy/core/relay/mode"
+	relaymodel "github.com/labring/aiproxy/core/relay/model"
+	"github.com/labring/aiproxy/core/relay/plugin"
+	"github.com/labring/aiproxy/core/relay/plugin/noop"
+)
+
+var _ plugin.Plugin = (*StreamFake)(nil)
+
+// StreamFake implements the stream fake functionality
+type StreamFake struct {
+	noop.Noop
+}
+
+// NewStreamFakePlugin creates a new stream fake plugin instance
+func NewStreamFakePlugin() plugin.Plugin {
+	return &StreamFake{}
+}
+
+// Constants for metadata keys
+const (
+	fakeStreamKey = "fake_stream"
+)
+
+// getConfig retrieves the plugin configuration
+func (p *StreamFake) getConfig(meta *meta.Meta) (*Config, error) {
+	pluginConfig := &Config{}
+	if err := meta.ModelConfig.LoadPluginConfig("stream-fake", pluginConfig); err != nil {
+		return nil, err
+	}
+	return pluginConfig, nil
+}
+
+// ConvertRequest modifies the request to enable streaming if it's originally non-streaming
+func (p *StreamFake) ConvertRequest(
+	meta *meta.Meta,
+	store adaptor.Store,
+	req *http.Request,
+	do adaptor.ConvertRequest,
+) (adaptor.ConvertResult, error) {
+	// Only process chat completions
+	if meta.Mode != mode.ChatCompletions {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Check if stream fake is enabled
+	pluginConfig, err := p.getConfig(meta)
+	if err != nil || !pluginConfig.Enable {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	body, err := common.GetRequestBodyReusable(req)
+	if err != nil {
+		return adaptor.ConvertResult{}, fmt.Errorf("failed to read request body: %w", err)
+	}
+
+	node, err := sonic.Get(body)
+	if err != nil {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	stream, _ := node.Get("stream").Bool()
+	if stream {
+		// Already streaming, no need to fake
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Modify request to enable streaming
+	_, err = node.Set("stream", ast.NewBool(true))
+	if err != nil {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Create new request body
+	modifiedBody, err := node.MarshalJSON()
+	if err != nil {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Update the request
+	common.SetRequestBody(req, modifiedBody)
+	defer common.SetRequestBody(req, body)
+
+	meta.Set(fakeStreamKey, true)
+
+	return do.ConvertRequest(meta, store, req)
+}
+
+// DoResponse handles the response processing to collect streaming data and convert back to non-streaming
+func (p *StreamFake) DoResponse(
+	meta *meta.Meta,
+	store adaptor.Store,
+	c *gin.Context,
+	resp *http.Response,
+	do adaptor.DoResponse,
+) (model.Usage, adaptor.Error) {
+	// Only process chat completions
+	if meta.Mode != mode.ChatCompletions {
+		return do.DoResponse(meta, store, c, resp)
+	}
+
+	// Check if this is a fake stream request
+	isFakeStream, ok := meta.Get(fakeStreamKey)
+	if !ok {
+		return do.DoResponse(meta, store, c, resp)
+	}
+	isFakeStreamBool, ok := isFakeStream.(bool)
+	if !ok || !isFakeStreamBool {
+		return do.DoResponse(meta, store, c, resp)
+	}
+
+	return p.handleFakeStreamResponse(meta, store, c, resp, do)
+}
+
+// handleFakeStreamResponse processes the streaming response and converts it back to non-streaming
+func (p *StreamFake) handleFakeStreamResponse(
+	meta *meta.Meta,
+	store adaptor.Store,
+	c *gin.Context,
+	resp *http.Response,
+	do adaptor.DoResponse,
+) (model.Usage, adaptor.Error) {
+	log := common.GetLogger(c)
+	// Create a custom response writer to collect streaming data
+	rw := &fakeStreamResponseWriter{
+		ResponseWriter: c.Writer,
+	}
+	c.Writer = rw
+	defer func() {
+		c.Writer = rw.ResponseWriter
+	}()
+
+	// Process the streaming response
+	usage, relayErr := do.DoResponse(meta, store, c, resp)
+	if relayErr != nil {
+		return usage, relayErr
+	}
+
+	// Convert collected streaming chunks to non-streaming response
+	respBody, err := rw.convertToNonStream()
+	if err != nil {
+		log.Errorf("failed to convert to non-streaming response: %v", err)
+		return usage, relayErr
+	}
+
+	// Set appropriate headers for non-streaming response
+	c.Header("Content-Type", "application/json")
+	c.Header("Content-Length", strconv.Itoa(len(respBody)))
+
+	// Remove streaming-specific headers
+	c.Header("Cache-Control", "")
+	c.Header("Connection", "")
+	c.Header("Transfer-Encoding", "")
+	c.Header("X-Accel-Buffering", "")
+
+	// Write the non-streaming response
+	_, _ = rw.ResponseWriter.Write(respBody)
+
+	return usage, nil
+}
+
+// fakeStreamResponseWriter captures streaming response data
+type fakeStreamResponseWriter struct {
+	gin.ResponseWriter
+
+	lastChunk        *ast.Node
+	usageNode        *ast.Node
+	contentBuilder   bytes.Buffer
+	reasoningContent bytes.Buffer
+	finishReason     relaymodel.FinishReason
+	logprobsContent  []ast.Node
+	toolCalls        []*relaymodel.ToolCall
+}
+
+// ignore flush
+func (rw *fakeStreamResponseWriter) Flush() {}
+
+// ignore WriteHeaderNow
+func (rw *fakeStreamResponseWriter) WriteHeaderNow() {}
+
+func (rw *fakeStreamResponseWriter) Write(b []byte) (int, error) {
+	// Parse streaming data
+	_ = rw.parseStreamingData(b)
+
+	return len(b), nil
+}
+
+func (rw *fakeStreamResponseWriter) WriteString(s string) (int, error) {
+	return rw.Write(conv.StringToBytes(s))
+}
+
+// parseStreamingData extracts individual chunks from streaming response
+func (rw *fakeStreamResponseWriter) parseStreamingData(data []byte) error {
+	node, err := sonic.Get(data)
+	if err != nil {
+		return err
+	}
+	rw.lastChunk = &node
+	usageNode := node.Get("usage")
+	if err := usageNode.Check(); err != nil {
+		if !errors.Is(err, ast.ErrNotExist) {
+			return err
+		}
+	} else {
+		rw.usageNode = usageNode
+	}
+
+	choicesNode := node.Get("choices")
+	if err := choicesNode.Check(); err != nil {
+		return err
+	}
+
+	return choicesNode.ForEach(func(_ ast.Sequence, choiceNode *ast.Node) bool {
+		deltaNode := choiceNode.Get("delta")
+		if err := deltaNode.Check(); err != nil {
+			return true
+		}
+		content, err := deltaNode.Get("content").String()
+		if err == nil {
+			rw.contentBuilder.WriteString(content)
+		}
+		reasoningContent, err := deltaNode.Get("reasoning_content").String()
+		if err == nil {
+			rw.reasoningContent.WriteString(reasoningContent)
+		}
+		_ = deltaNode.Get("tool_calls").
+			ForEach(func(_ ast.Sequence, toolCallNode *ast.Node) bool {
+				toolCallRaw, err := toolCallNode.Raw()
+				if err != nil {
+					return true
+				}
+				var toolCall relaymodel.ToolCall
+				if err := sonic.UnmarshalString(toolCallRaw, &toolCall); err != nil {
+					return true
+				}
+				rw.toolCalls = mergeToolCalls(rw.toolCalls, &toolCall)
+				return true
+			})
+		finishReason, err := choiceNode.Get("finish_reason").String()
+		if err == nil && finishReason != "" {
+			rw.finishReason = finishReason
+		}
+		logprobsContentNode := choiceNode.Get("logprobs").Get("content")
+		if err := logprobsContentNode.Check(); err == nil {
+			l, err := logprobsContentNode.Len()
+			if err != nil {
+				return true
+			}
+			rw.logprobsContent = slices.Grow(rw.logprobsContent, l)
+			_ = logprobsContentNode.ForEach(
+				func(_ ast.Sequence, logprobsContentNode *ast.Node) bool {
+					rw.logprobsContent = append(rw.logprobsContent, *logprobsContentNode)
+					return true
+				},
+			)
+		}
+		return true
+	})
+}
+
+func (rw *fakeStreamResponseWriter) convertToNonStream() ([]byte, error) {
+	lastChunk := rw.lastChunk
+	if lastChunk == nil {
+		return nil, errors.New("last chunk is nil")
+	}
+
+	_, err := lastChunk.Set("object", ast.NewString(relaymodel.ChatCompletionObject))
+	if err != nil {
+		return nil, err
+	}
+	if rw.usageNode != nil {
+		_, err = lastChunk.Set("usage", *rw.usageNode)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	message := map[string]any{
+		"role":    "assistant",
+		"content": rw.contentBuilder.String(),
+	}
+
+	reasoningContent := rw.reasoningContent.String()
+	if reasoningContent != "" {
+		message["reasoning_content"] = reasoningContent
+	}
+
+	if len(rw.toolCalls) > 0 {
+		slices.SortFunc(rw.toolCalls, func(a, b *relaymodel.ToolCall) int {
+			return a.Index - b.Index
+		})
+		message["tool_calls"] = rw.toolCalls
+	}
+	if len(rw.logprobsContent) > 0 {
+		message["logprobs"] = map[string]any{
+			"content": rw.logprobsContent,
+		}
+	}
+
+	_, err = lastChunk.SetAny("choices", []any{
+		map[string]any{
+			"index":         0,
+			"message":       message,
+			"finish_reason": rw.finishReason,
+		},
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return lastChunk.MarshalJSON()
+}
+
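+// mergeToolCalls folds a streaming tool call delta into the accumulated list, matching existing entries by Index.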
+func mergeToolCalls(
+	oldToolCalls []*relaymodel.ToolCall,
+	newToolCall *relaymodel.ToolCall,
+) []*relaymodel.ToolCall {
+	foundToolCallIndex := slices.IndexFunc(oldToolCalls, func(t *relaymodel.ToolCall) bool {
+		return t.Index == newToolCall.Index
+	})
+	if foundToolCallIndex != -1 {
+		oldToolCall := oldToolCalls[foundToolCallIndex]
+		oldToolCalls[foundToolCallIndex] = mergeToolCall(oldToolCall, newToolCall)
+	} else {
+		oldToolCalls = append(oldToolCalls, newToolCall)
+	}
+	return oldToolCalls
+}
+
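+// mergeToolCall combines two deltas for the same tool call, keeping the earlier metadata and concatenating the function argument fragments.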
+func mergeToolCall(oldToolCall, newToolCall *relaymodel.ToolCall) *relaymodel.ToolCall {
+	if oldToolCall == nil {
+		return newToolCall
+	}
+
+	if newToolCall == nil {
+		return oldToolCall
+	}
+
+	merged := &relaymodel.ToolCall{
+		Index:    oldToolCall.Index,
+		ID:       oldToolCall.ID,
+		Type:     oldToolCall.Type,
+		Function: oldToolCall.Function,
+	}
+
+	merged.Function.Arguments += newToolCall.Function.Arguments
+
+	return merged
+}

+ 3 - 0
core/relay/plugin/thinksplit/split.go

@@ -95,6 +95,9 @@ func (rw *thinkResponseWriter) getThinkSplitter() *splitter.Splitter {
 	return rw.thinkSplitter
 }
 
+// ignore WriteHeaderNow
+func (rw *thinkResponseWriter) WriteHeaderNow() {}
+
 func (rw *thinkResponseWriter) Write(b []byte) (int, error) {
 	if rw.done {
 		return rw.ResponseWriter.Write(b)