
feat: plugin streamfake (#306)

* feat: add stream fake plugin

* feat: add logprobs support

* feat: sort tool calls

* docs: add stream fake plugin docs

* fix: ci lint

* fix: ignore WriteHeaderNow

* feat: add stream fake readme
zijiren 6 months ago
commit 66fdd79fa8

+ 15 - 1
README.md

@@ -49,6 +49,8 @@ AI Proxy is a powerful, production-ready AI gateway that provides intelligent re
 
 - **Cache Plugin**: High-performance caching for identical requests with Redis/memory storage
 - **Web Search Plugin**: Real-time web search capabilities with support for Google, Bing, and Arxiv
+- **Think Split Plugin**: Support for reasoning models with content splitting, automatically handling `<think>` tags
+- **Stream Fake Plugin**: Avoids non-streaming request timeouts through internal streaming transmission
 - **Extensible Architecture**: Easy to add custom plugins for additional functionality
 
 ### 🔧 **Advanced Capabilities**
@@ -59,7 +61,7 @@ AI Proxy is a powerful, production-ready AI gateway that provides intelligent re
 - **Think Mode**: Support for reasoning models with content splitting
 - **Built-in Tokenizer**: No external tiktoken dependencies
 
-### 📊 **Management Panel**
+## 📊 Management Panel
 
 AI Proxy provides a management panel for configuring and monitoring AI Proxy.
 
@@ -79,6 +81,7 @@ graph TB
     Plugins --> CachePlugin[Cache Plugin]
     Plugins --> SearchPlugin[Web Search Plugin]
     Plugins --> ThinkSplitPlugin[Think Split Plugin]
+    Plugins --> StreamFakePlugin[Stream Fake Plugin]
 
     Router --> Provider1[OpenAI]
     Router --> Provider2[Anthropic]
@@ -217,6 +220,17 @@ The Think Split Plugin supports content splitting for reasoning models:
 
 [View Think Split Plugin Documentation](./core/relay/plugin/thinksplit/README.md)
 
+### Stream Fake Plugin
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests:
+
+- **Timeout Avoidance**: Prevents request timeouts through internal streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into complete non-streaming responses
+- **Connection Keep-Alive**: Maintains active connections through streaming transmission to avoid network timeouts
+
+[View Stream Fake Plugin Documentation](./core/relay/plugin/streamfake/README.md)
+
 ## 📚 API Documentation
 
 ### Interactive API Explorer

+ 14 - 0
README.zh.md

@@ -49,6 +49,8 @@ AI Proxy is a powerful, production-ready AI gateway that provides intelligent re
 
 - **Cache Plugin**: High-performance caching for identical requests with Redis/memory storage
 - **Web Search Plugin**: Real-time web search capabilities with support for Google, Bing, and Arxiv
+- **Think Split Plugin**: Support for reasoning models with content splitting, automatically handling `<think>` tags
+- **Stream Fake Plugin**: Avoids non-streaming request timeouts through internal streaming transmission
 - **Extensible Architecture**: Easy to add custom plugins for additional functionality
 
 ### 🔧 **Advanced Capabilities**
@@ -79,6 +81,7 @@ graph TB
     Plugins --> CachePlugin[Cache Plugin]
     Plugins --> SearchPlugin[Web Search Plugin]
     Plugins --> ThinkSplitPlugin[Think Split Plugin]
+    Plugins --> StreamFakePlugin[Stream Fake Plugin]
     
     Router --> Provider1[OpenAI]
     Router --> Provider2[Anthropic]
@@ -217,6 +220,17 @@ AI Proxy supports a plugin system to extend its functionality. Currently available plugins:
 
 [View Think Split Plugin Documentation](./core/relay/plugin/thinksplit/README.zh.md)
 
+### Stream Fake Plugin
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests:
+
+- **Timeout Avoidance**: Prevents timeouts caused by long waits through internal streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into a complete non-streaming response
+- **Connection Keep-Alive**: Keeps the connection active through streaming transmission to avoid network timeouts
+
+[View Stream Fake Plugin Documentation](./core/relay/plugin/streamfake/README.cn.md)
+
 ## 📚 API Documentation
 
 ### Interactive API Explorer

+ 8 - 15
core/controller/relay-channel.go

@@ -1,6 +1,7 @@
 package controller
 
 import (
+	"context"
 	"errors"
 	"fmt"
 	"math/rand/v2"
@@ -343,21 +344,13 @@ func getInitialChannel(c *gin.Context, modelName string, m mode.Mode) (*initialC
 	}, nil
 }
 
-func getWebSearchChannel(c *gin.Context, modelName string) (*model.Channel, error) {
-	log := common.GetLogger(c)
-	mc := middleware.GetModelCaches(c)
-
-	ids, err := monitor.GetBannedChannelsWithModel(c.Request.Context(), modelName)
-	if err != nil {
-		log.Errorf("get %s auto banned channels failed: %+v", modelName, err)
-	}
-	log.Debugf("%s model banned channels: %+v", modelName, ids)
-
-	errorRates, err := monitor.GetModelChannelErrorRate(c.Request.Context(), modelName)
-	if err != nil {
-		log.Errorf("get channel model error rates failed: %+v", err)
-	}
-
+func getWebSearchChannel(
+	ctx context.Context,
+	mc *model.ModelCaches,
+	modelName string,
+) (*model.Channel, error) {
+	ids, _ := monitor.GetBannedChannelsWithModel(ctx, modelName)
+	errorRates, _ := monitor.GetModelChannelErrorRate(ctx, modelName)
 	channel, _, err := getChannelWithFallback(
 		mc,
 		nil,

+ 17 - 10
core/controller/relay-controller.go

@@ -2,6 +2,7 @@ package controller
 
 import (
 	"bytes"
+	"context"
 	"errors"
 	"fmt"
 	"io"
@@ -28,6 +29,7 @@ import (
 	"github.com/labring/aiproxy/core/relay/plugin"
 	"github.com/labring/aiproxy/core/relay/plugin/cache"
 	monitorplugin "github.com/labring/aiproxy/core/relay/plugin/monitor"
+	"github.com/labring/aiproxy/core/relay/plugin/streamfake"
 	"github.com/labring/aiproxy/core/relay/plugin/thinksplit"
 	websearch "github.com/labring/aiproxy/core/relay/plugin/web-search"
 )
@@ -77,6 +79,19 @@ func (s *storeImpl) SaveStore(store adaptor.StoreCache) error {
 	return err
 }
 
+func wrapPlugin(ctx context.Context, mc *model.ModelCaches, a adaptor.Adaptor) adaptor.Adaptor {
+	return plugin.WrapperAdaptor(a,
+		monitorplugin.NewGroupMonitorPlugin(),
+		cache.NewCachePlugin(common.RDB),
+		streamfake.NewStreamFakePlugin(),
+		websearch.NewWebSearchPlugin(func(modelName string) (*model.Channel, error) {
+			return getWebSearchChannel(ctx, mc, modelName)
+		}),
+		thinksplit.NewThinkPlugin(),
+		monitorplugin.NewChannelMonitorPlugin(),
+	)
+}
+
 func relayHandler(c *gin.Context, meta *meta.Meta) *controller.HandleResult {
 	log := common.GetLogger(c)
 	middleware.SetLogFieldsFromMeta(meta, log.Data)
@@ -92,17 +107,9 @@ func relayHandler(c *gin.Context, meta *meta.Meta) *controller.HandleResult {
 		}
 	}
 
-	a := plugin.WrapperAdaptor(adaptor,
-		monitorplugin.NewGroupMonitorPlugin(),
-		cache.NewCachePlugin(common.RDB),
-		websearch.NewWebSearchPlugin(func(modelName string) (*model.Channel, error) {
-			return getWebSearchChannel(c, modelName)
-		}),
-		thinksplit.NewThinkPlugin(),
-		monitorplugin.NewChannelMonitorPlugin(),
-	)
+	adaptor = wrapPlugin(c.Request.Context(), middleware.GetModelCaches(c), adaptor)
 
-	return controller.Handle(a, c, meta, adaptorStore)
+	return controller.Handle(adaptor, c, meta, adaptorStore)
 }
 
 func relayController(m mode.Mode) RelayController {

+ 5 - 5
core/relay/adaptor/anthropic/openai.go

@@ -285,7 +285,7 @@ func StreamResponse2OpenAI(
 	var content string
 	var thinking string
 	var stopReason string
-	tools := make([]*relaymodel.Tool, 0)
+	tools := make([]*relaymodel.ToolCall, 0)
 
 	var claudeResponse StreamResponse
 	err := sonic.Unmarshal(respData, &claudeResponse)
@@ -309,7 +309,7 @@ func StreamResponse2OpenAI(
 		if claudeResponse.ContentBlock != nil {
 			content = claudeResponse.ContentBlock.Text
 			if claudeResponse.ContentBlock.Type == toolUseType {
-				tools = append(tools, &relaymodel.Tool{
+				tools = append(tools, &relaymodel.ToolCall{
 					ID:   claudeResponse.ContentBlock.ID,
 					Type: "function",
 					Function: relaymodel.Function{
@@ -322,7 +322,7 @@ func StreamResponse2OpenAI(
 		if claudeResponse.Delta != nil {
 			switch claudeResponse.Delta.Type {
 			case "input_json_delta":
-				tools = append(tools, &relaymodel.Tool{
+				tools = append(tools, &relaymodel.ToolCall{
 					Type: "function",
 					Function: relaymodel.Function{
 						Arguments: claudeResponse.Delta.PartialJSON,
@@ -396,7 +396,7 @@ func Response2OpenAI(
 
 	var content string
 	var thinking string
-	tools := make([]*relaymodel.Tool, 0)
+	tools := make([]*relaymodel.ToolCall, 0)
 	for _, v := range claudeResponse.Content {
 		switch v.Type {
 		case conetentTypeText:
@@ -405,7 +405,7 @@ func Response2OpenAI(
 			thinking = v.Thinking
 		case toolUseType:
 			args, _ := sonic.MarshalString(v.Input)
-			tools = append(tools, &relaymodel.Tool{
+			tools = append(tools, &relaymodel.ToolCall{
 				ID:   v.ID,
 				Type: "function",
 				Function: relaymodel.Function{

+ 2 - 2
core/relay/adaptor/gemini/main.go

@@ -428,7 +428,7 @@ type ChatPromptFeedback struct {
 	SafetyRatings []ChatSafetyRating `json:"safetyRatings"`
 }
 
-func getToolCall(item *Part) (*relaymodel.Tool, error) {
+func getToolCall(item *Part) (*relaymodel.ToolCall, error) {
 	if item.FunctionCall == nil {
 		return nil, nil
 	}
@@ -436,7 +436,7 @@ func getToolCall(item *Part) (*relaymodel.Tool, error) {
 	if err != nil {
 		return nil, err
 	}
-	toolCall := relaymodel.Tool{
+	toolCall := relaymodel.ToolCall{
 		ID:   openai.CallID(),
 		Type: "function",
 		Function: relaymodel.Function{

+ 3 - 3
core/relay/adaptor/ollama/main.go

@@ -116,17 +116,17 @@ func ConvertRequest(meta *meta.Meta, req *http.Request) (adaptor.ConvertResult,
 	}, nil
 }
 
-func getToolCalls(ollamaResponse *ChatResponse) []*relaymodel.Tool {
+func getToolCalls(ollamaResponse *ChatResponse) []*relaymodel.ToolCall {
 	if ollamaResponse.Message == nil || len(ollamaResponse.Message.ToolCalls) == 0 {
 		return nil
 	}
-	toolCalls := make([]*relaymodel.Tool, 0, len(ollamaResponse.Message.ToolCalls))
+	toolCalls := make([]*relaymodel.ToolCall, 0, len(ollamaResponse.Message.ToolCalls))
 	for _, tool := range ollamaResponse.Message.ToolCalls {
 		argString, err := sonic.MarshalString(tool.Function.Arguments)
 		if err != nil {
 			continue
 		}
-		toolCalls = append(toolCalls, &relaymodel.Tool{
+		toolCalls = append(toolCalls, &relaymodel.ToolCall{
 			ID:   openai.CallID(),
 			Type: "function",
 			Function: relaymodel.Function{

+ 6 - 6
core/relay/model/completions.go

@@ -105,12 +105,12 @@ type TextResponse struct {
 }
 
 type Message struct {
-	Content          any     `json:"content,omitempty"`
-	ReasoningContent string  `json:"reasoning_content,omitempty"`
-	Name             *string `json:"name,omitempty"`
-	Role             string  `json:"role,omitempty"`
-	ToolCallID       string  `json:"tool_call_id,omitempty"`
-	ToolCalls        []*Tool `json:"tool_calls,omitempty"`
+	Content          any         `json:"content,omitempty"`
+	ReasoningContent string      `json:"reasoning_content,omitempty"`
+	Name             *string     `json:"name,omitempty"`
+	Role             string      `json:"role,omitempty"`
+	ToolCallID       string      `json:"tool_call_id,omitempty"`
+	ToolCalls        []*ToolCall `json:"tool_calls,omitempty"`
 }
 
 func (m *Message) IsStringContent() bool {

+ 8 - 2
core/relay/model/tool.go

@@ -1,8 +1,7 @@
 package model
 
 type Tool struct {
-	ID       string   `json:"id,omitempty"`
-	Type     string   `json:"type,omitempty"`
+	Type     string   `json:"type"`
 	Function Function `json:"function"`
 }
 
@@ -12,3 +11,10 @@ type Function struct {
 	Description string `json:"description,omitempty"`
 	Name        string `json:"name,omitempty"`
 }
+
+type ToolCall struct {
+	Index    int      `json:"index"`
+	ID       string   `json:"id"`
+	Type     string   `json:"type"`
+	Function Function `json:"function"`
+}

+ 136 - 0
core/relay/plugin/streamfake/README.cn.md

@@ -0,0 +1,136 @@
+# Stream Fake Plugin Configuration Guide
+
+## Overview
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests. When an AI model takes a long time to respond, a non-streaming request may time out while waiting for the complete response. The plugin avoids this by internally converting the non-streaming request to a streaming request and then reassembling the streaming response into the non-streaming format expected by the client, so timeouts are avoided while client compatibility is preserved.
+
+## Features
+
+- **Timeout Avoidance**: Prevents request timeouts caused by long waits through streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into a complete non-streaming response
+- **Content Integrity**: Ensures all content types are properly processed and aggregated:
+  - Regular content
+  - Reasoning content (for models that support thinking processes)
+  - Tool calls and their proper merging
+  - Log probabilities
+- **Connection Keep-Alive**: Keeps the connection active through streaming transmission to avoid network timeouts
+
+## Problems Solved
+
+### Primary Issue: Upstream Request Timeout
+
+- **Long Response Timeout**: When AI models generate long texts or complex responses, non-streaming requests are prone to time out
+- **Network Timeout**: In unstable network environments, long waits for a complete response cause connection timeouts
+- **Proxy Timeout**: When going through proxy servers, the proxy may disconnect after a prolonged period without data
+
+### Solution
+
+Through internal streaming transmission the connection stays active at all times, avoiding the various timeout issues while the client still receives the expected non-streaming response format.
+
+## Use Cases
+
+1. **Long Text Generation**: Avoiding timeouts when generating long articles, reports, or code
+2. **Complex Reasoning Tasks**: Handling complex problems that require extended thinking time
+3. **Unstable Network Environments**: Environments with high latency or unstable networks
+4. **Strict Timeout Restrictions**: Clients or middleware with strict timeout limits
+5. **Legacy System Compatibility**: Legacy systems where client timeout settings cannot be modified
+
+## How It Works
+
+### Problem Identification
+
+1. Detects non-streaming chat completion requests (`"stream": false` or not set)
+2. Identifies long-response scenarios that may cause timeouts
+
+### Internal Conversion
+
+1. Modifies the request to streaming format (`"stream": true`)
+2. Forwards the modified request to the upstream API
+3. Begins receiving streaming response data
+
+### Response Processing
+
+1. Receives streaming data chunks in real time, keeping the connection active
+2. Aggregates all response content
+3. Processes the different types of content fragments
+4. Reconstructs the complete non-streaming response format
+5. Sets the correct response headers and returns the response to the client
+
+### Timeout Avoidance Mechanism
+
+- **Continuous Data Flow**: The streaming response ensures the connection always has data in transit
+- **Connection Keep-Alive**: Avoids disconnection caused by a prolonged period without a response
+- **Progressive Processing**: Receives and processes data as it arrives, reducing overall wait time
+
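+In a streaming response, tool calls arrive as indexed fragments whose `arguments` text is split across chunks; the plugin folds fragments that share the same `index` into a single tool call. Below is a simplified, illustrative sketch of that aggregation (the real logic lives in `mergeToolCalls` in `fake.go`; the helper name here is hypothetical):
+
+```go
+package streamfake
+
+import relaymodel "github.com/labring/aiproxy/core/relay/model"
+
+// appendFragment folds one streaming tool call delta into the accumulated
+// list: fragments with the same Index have their argument text concatenated,
+// while a new Index starts a new tool call entry.
+func appendFragment(calls []*relaymodel.ToolCall, delta *relaymodel.ToolCall) []*relaymodel.ToolCall {
+	for _, call := range calls {
+		if call.Index == delta.Index {
+			call.Function.Arguments += delta.Function.Arguments
+			return calls
+		}
+	}
+	return append(calls, delta)
+}
+```
+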
+## Configuration Example
+
+```json
+{
+    "model": "gpt-4",
+    "type": 1,
+    "plugin": {
+        "stream-fake": {
+            "enable": true
+        }
+    }
+}
+```
+
+## Configuration Fields
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `enable` | bool | Yes | false | Whether to enable the Stream Fake Plugin to avoid timeout issues |
+
+## Timeout Scenario Examples
+
+### Scenario 1: Long Text Generation Timeout
+
+**Problem**: A request to generate a 5000-word technical document; the non-streaming request times out after 60 seconds
+
+**Original Request**:
+
+```json
+{
+    "model": "gpt-4",
+    "messages": [
+        {
+            "role": "user",
+            "content": "Please write a detailed 5000-word technical document introducing microservice architecture design principles and best practices"
+        }
+    ],
+    "stream": false,
+    "max_tokens": 4000
+}
+```
+
+**Plugin Processing**:
+
+1. Automatically converts the request to `"stream": true`
+2. Receives response fragments in real time, avoiding the timeout
+3. Reconstructs a complete non-streaming response and returns it
+
+### Scenario 2: Complex Reasoning Task Timeout
+
+**Problem**: A complex mathematical problem requires long thinking time, causing the request to time out
+
+**Solution**:
+
+- The plugin keeps the connection active while the model is thinking
+- Even very long reasoning time does not cause a timeout
+- The client ultimately receives the complete reasoning result
+
+## Performance Benefits
+
+### Timeout Avoidance
+
+- **Eliminates Connection Timeouts**: Streaming transmission keeps the connection active
+- **Avoids Proxy Timeouts**: Intermediate proxies do not disconnect due to prolonged periods without data
+- **Reduces Retry Attempts**: Avoids request retries caused by timeouts
+
+### Response Time
+
+- **Faster Perceived Response**: While total time remains essentially the same, timeout retries are avoided
+- **Better User Experience**: Avoids request failures and the need to reissue requests
+- **Improved Resource Utilization**: Reduces resource waste caused by timeouts

+ 136 - 0
core/relay/plugin/streamfake/README.md

@@ -0,0 +1,136 @@
+# Stream Fake Plugin Configuration Guide
+
+## Overview
+
+The Stream Fake Plugin solves timeout issues with non-streaming requests. When an AI model takes a long time to respond, a non-streaming request may time out while waiting for the complete response. The plugin avoids this by internally converting the non-streaming request to a streaming request and then reassembling the streaming response into the non-streaming format expected by the client, so timeouts are avoided while client compatibility is preserved.
+
+## Features
+
+- **Timeout Avoidance**: Prevents request timeouts caused by long waits through streaming transmission
+- **Transparent Conversion**: Automatically converts non-streaming requests to a streaming format, transparently to the client
+- **Response Reconstruction**: Collects all streaming data chunks and reconstructs them into complete non-streaming responses
+- **Content Integrity**: Ensures all content types are properly processed and aggregated:
+  - Regular content
+  - Reasoning content (for models that support thinking processes)
+  - Tool calls and their proper merging
+  - Log probabilities
+- **Connection Keep-Alive**: Maintains active connections through streaming transmission to avoid network timeouts
+
+## Problems Solved
+
+### Primary Issue: Upstream Request Timeout
+
+- **Long Response Timeout**: When AI models generate long texts or complex responses, non-streaming requests are prone to time out
+- **Network Timeout**: In unstable network environments, long waits for complete responses cause connection timeouts
+- **Proxy Timeout**: When going through proxy servers, proxies may disconnect due to prolonged periods without data
+
+### Solution
+
+Through internal streaming transmission, connections remain active at all times, avoiding various timeout issues while clients still receive the expected non-streaming response format.
+
+## Use Cases
+
+1. **Long Text Generation**: Avoiding timeouts when generating long articles, reports, or code
+2. **Complex Reasoning Tasks**: Handling complex problems that require extended thinking time
+3. **Unstable Network Environments**: Environments with high latency or unstable networks
+4. **Strict Timeout Restrictions**: Clients or middleware with strict timeout limitations
+5. **Legacy System Compatibility**: Legacy systems where client timeout settings cannot be modified
+
+## How It Works
+
+### Problem Identification
+
+1. Detects non-streaming chat completion requests (`"stream": false` or not set)
+2. Identifies scenarios with long responses that may cause timeouts
+
+### Internal Conversion
+
+1. Modifies the request to streaming format (`"stream": true`)
+2. Forwards the modified request to the upstream API
+3. Begins receiving streaming response data
+
+### Response Processing
+
+1. Receives streaming data chunks in real time, keeping the connection active
+2. Aggregates all response content
+3. Processes the different types of content fragments
+4. Reconstructs the complete non-streaming response format
+5. Sets the correct response headers and returns the response to the client
+
+### Timeout Avoidance Mechanism
+
+- **Continuous Data Flow**: The streaming response ensures the connection always has data in transit
+- **Connection Keep-Alive**: Avoids disconnection caused by a prolonged period without a response
+- **Progressive Processing**: Receives and processes data as it arrives, reducing overall wait time
+
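+The internal conversion step boils down to rewriting the request body before it is forwarded. Below is a minimal, simplified sketch of that rewrite using the `sonic` AST API that the plugin itself uses (see `core/relay/plugin/streamfake/fake.go` for the full implementation; the helper name is illustrative):
+
+```go
+package streamfake
+
+import (
+	"github.com/bytedance/sonic"
+	"github.com/bytedance/sonic/ast"
+)
+
+// enableStream forces "stream": true on a chat completion request body and
+// returns the rewritten JSON. The real plugin also records that it performed
+// the rewrite so the streaming response can be reassembled afterwards.
+func enableStream(body []byte) ([]byte, error) {
+	node, err := sonic.Get(body)
+	if err != nil {
+		return nil, err
+	}
+	if stream, _ := node.Get("stream").Bool(); stream {
+		// Already a streaming request: nothing to fake.
+		return body, nil
+	}
+	if _, err := node.Set("stream", ast.NewBool(true)); err != nil {
+		return nil, err
+	}
+	return node.MarshalJSON()
+}
+```
+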
+## Configuration Example
+
+```json
+{
+    "model": "gpt-4",
+    "type": 1,
+    "plugin": {
+        "stream-fake": {
+            "enable": true
+        }
+    }
+}
+```
+
+## Configuration Fields
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `enable` | bool | Yes | false | Whether to enable Stream Fake Plugin to avoid timeout issues |
+
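+For reference, the `enable` flag maps onto a one-field configuration struct. The sketch below shows roughly how the plugin resolves it from the model's plugin configuration, mirroring `config.go` and the config lookup in `fake.go` (the helper name is illustrative):
+
+```go
+package streamfake
+
+import "github.com/labring/aiproxy/core/relay/meta"
+
+// Config mirrors the "stream-fake" block in the configuration example above.
+type Config struct {
+	Enable bool `json:"enable"`
+}
+
+// loadConfig decodes the "stream-fake" key of the model's plugin config;
+// the plugin stays inactive unless Enable is true.
+func loadConfig(m *meta.Meta) (*Config, error) {
+	cfg := &Config{}
+	if err := m.ModelConfig.LoadPluginConfig("stream-fake", cfg); err != nil {
+		return nil, err
+	}
+	return cfg, nil
+}
+```
+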
+## Timeout Scenario Examples
+
+### Scenario 1: Long Text Generation Timeout
+
+**Problem**: A request to generate a 5000-word technical document; the non-streaming request times out after 60 seconds
+
+**Original Request**:
+
+```json
+{
+    "model": "gpt-4",
+    "messages": [
+        {
+            "role": "user",
+            "content": "Please write a detailed 5000-word technical document introducing microservice architecture design principles and best practices"
+        }
+    ],
+    "stream": false,
+    "max_tokens": 4000
+}
+```
+
+**Plugin Processing**:
+
+1. Automatically converts to `"stream": true`
+2. Receives response fragments in real time, avoiding the timeout
+3. Reconstructs a complete non-streaming response and returns it
+
+### Scenario 2: Complex Reasoning Task Timeout
+
+**Problem**: A complex mathematical problem requires long thinking time, causing the request to time out
+
+**Solution**:
+
+- The plugin keeps the connection active while the model is thinking
+- No timeout occurs even with extended reasoning time
+- The client ultimately receives the complete reasoning result
+
+## Performance Benefits
+
+### Timeout Avoidance
+
+- **Eliminates Connection Timeouts**: Streaming transmission keeps connections active
+- **Avoids Proxy Timeouts**: Intermediate proxies won't disconnect due to prolonged periods without data
+- **Reduces Retry Attempts**: Avoids request retries caused by timeouts
+
+### Response Time
+
+- **Faster Perceived Response**: While total time remains essentially the same, timeout retries are avoided
+- **Better User Experience**: Avoids request failures and the need to reinitiate requests
+- **Improved Resource Utilization**: Reduces resource waste caused by timeouts

+ 6 - 0
core/relay/plugin/streamfake/config.go

@@ -0,0 +1,6 @@
+package streamfake
+
+// Config represents the plugin configuration
+type Config struct {
+	Enable bool `json:"enable"`
+}

+ 366 - 0
core/relay/plugin/streamfake/fake.go

@@ -0,0 +1,366 @@
+package streamfake
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"net/http"
+	"slices"
+	"strconv"
+
+	"github.com/bytedance/sonic"
+	"github.com/bytedance/sonic/ast"
+	"github.com/gin-gonic/gin"
+	"github.com/labring/aiproxy/core/common"
+	"github.com/labring/aiproxy/core/common/conv"
+	"github.com/labring/aiproxy/core/model"
+	"github.com/labring/aiproxy/core/relay/adaptor"
+	"github.com/labring/aiproxy/core/relay/meta"
+	"github.com/labring/aiproxy/core/relay/mode"
+	relaymodel "github.com/labring/aiproxy/core/relay/model"
+	"github.com/labring/aiproxy/core/relay/plugin"
+	"github.com/labring/aiproxy/core/relay/plugin/noop"
+)
+
+var _ plugin.Plugin = (*StreamFake)(nil)
+
+// StreamFake implements the stream fake functionality
+type StreamFake struct {
+	noop.Noop
+}
+
+// NewStreamFakePlugin creates a new stream fake plugin instance
+func NewStreamFakePlugin() plugin.Plugin {
+	return &StreamFake{}
+}
+
+// Constants for metadata keys
+const (
+	fakeStreamKey = "fake_stream"
+)
+
+// getConfig retrieves the plugin configuration
+func (p *StreamFake) getConfig(meta *meta.Meta) (*Config, error) {
+	pluginConfig := &Config{}
+	if err := meta.ModelConfig.LoadPluginConfig("stream-fake", pluginConfig); err != nil {
+		return nil, err
+	}
+	return pluginConfig, nil
+}
+
+// ConvertRequest modifies the request to enable streaming if it's originally non-streaming
+func (p *StreamFake) ConvertRequest(
+	meta *meta.Meta,
+	store adaptor.Store,
+	req *http.Request,
+	do adaptor.ConvertRequest,
+) (adaptor.ConvertResult, error) {
+	// Only process chat completions
+	if meta.Mode != mode.ChatCompletions {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Check if stream fake is enabled
+	pluginConfig, err := p.getConfig(meta)
+	if err != nil || !pluginConfig.Enable {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	body, err := common.GetRequestBodyReusable(req)
+	if err != nil {
+		return adaptor.ConvertResult{}, fmt.Errorf("failed to read request body: %w", err)
+	}
+
+	node, err := sonic.Get(body)
+	if err != nil {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	stream, _ := node.Get("stream").Bool()
+	if stream {
+		// Already streaming, no need to fake
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Modify request to enable streaming
+	_, err = node.Set("stream", ast.NewBool(true))
+	if err != nil {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Create new request body
+	modifiedBody, err := node.MarshalJSON()
+	if err != nil {
+		return do.ConvertRequest(meta, store, req)
+	}
+
+	// Update the request
+	common.SetRequestBody(req, modifiedBody)
+	defer common.SetRequestBody(req, body)
+
+	meta.Set(fakeStreamKey, true)
+
+	return do.ConvertRequest(meta, store, req)
+}
+
+// DoResponse handles the response processing to collect streaming data and convert back to non-streaming
+func (p *StreamFake) DoResponse(
+	meta *meta.Meta,
+	store adaptor.Store,
+	c *gin.Context,
+	resp *http.Response,
+	do adaptor.DoResponse,
+) (model.Usage, adaptor.Error) {
+	// Only process chat completions
+	if meta.Mode != mode.ChatCompletions {
+		return do.DoResponse(meta, store, c, resp)
+	}
+
+	// Check if this is a fake stream request
+	isFakeStream, ok := meta.Get(fakeStreamKey)
+	if !ok {
+		return do.DoResponse(meta, store, c, resp)
+	}
+	isFakeStreamBool, ok := isFakeStream.(bool)
+	if !ok || !isFakeStreamBool {
+		return do.DoResponse(meta, store, c, resp)
+	}
+
+	return p.handleFakeStreamResponse(meta, store, c, resp, do)
+}
+
+// handleFakeStreamResponse processes the streaming response and converts it back to non-streaming
+func (p *StreamFake) handleFakeStreamResponse(
+	meta *meta.Meta,
+	store adaptor.Store,
+	c *gin.Context,
+	resp *http.Response,
+	do adaptor.DoResponse,
+) (model.Usage, adaptor.Error) {
+	log := common.GetLogger(c)
+	// Create a custom response writer to collect streaming data
+	rw := &fakeStreamResponseWriter{
+		ResponseWriter: c.Writer,
+	}
+	c.Writer = rw
+	defer func() {
+		c.Writer = rw.ResponseWriter
+	}()
+
+	// Process the streaming response
+	usage, relayErr := do.DoResponse(meta, store, c, resp)
+	if relayErr != nil {
+		return usage, relayErr
+	}
+
+	// Convert collected streaming chunks to non-streaming response
+	respBody, err := rw.convertToNonStream()
+	if err != nil {
+		log.Errorf("failed to convert to non-streaming response: %v", err)
+		return usage, relayErr
+	}
+
+	// Set appropriate headers for non-streaming response
+	c.Header("Content-Type", "application/json")
+	c.Header("Content-Length", strconv.Itoa(len(respBody)))
+
+	// Remove streaming-specific headers
+	c.Header("Cache-Control", "")
+	c.Header("Connection", "")
+	c.Header("Transfer-Encoding", "")
+	c.Header("X-Accel-Buffering", "")
+
+	// Write the non-streaming response
+	_, _ = rw.ResponseWriter.Write(respBody)
+
+	return usage, nil
+}
+
+// fakeStreamResponseWriter captures streaming response data
+type fakeStreamResponseWriter struct {
+	gin.ResponseWriter
+
+	lastChunk        *ast.Node
+	usageNode        *ast.Node
+	contentBuilder   bytes.Buffer
+	reasoningContent bytes.Buffer
+	finishReason     relaymodel.FinishReason
+	logprobsContent  []ast.Node
+	toolCalls        []*relaymodel.ToolCall
+}
+
+// ignore flush
+func (rw *fakeStreamResponseWriter) Flush() {}
+
+// ignore WriteHeaderNow
+func (rw *fakeStreamResponseWriter) WriteHeaderNow() {}
+
+func (rw *fakeStreamResponseWriter) Write(b []byte) (int, error) {
+	// Parse streaming data
+	_ = rw.parseStreamingData(b)
+
+	return len(b), nil
+}
+
+func (rw *fakeStreamResponseWriter) WriteString(s string) (int, error) {
+	return rw.Write(conv.StringToBytes(s))
+}
+
+// parseStreamingData extracts individual chunks from streaming response
+func (rw *fakeStreamResponseWriter) parseStreamingData(data []byte) error {
+	node, err := sonic.Get(data)
+	if err != nil {
+		return err
+	}
+	rw.lastChunk = &node
+	usageNode := node.Get("usage")
+	if err := usageNode.Check(); err != nil {
+		if !errors.Is(err, ast.ErrNotExist) {
+			return err
+		}
+	} else {
+		rw.usageNode = usageNode
+	}
+
+	choicesNode := node.Get("choices")
+	if err := choicesNode.Check(); err != nil {
+		return err
+	}
+
+	return choicesNode.ForEach(func(_ ast.Sequence, choiceNode *ast.Node) bool {
+		deltaNode := choiceNode.Get("delta")
+		if err := deltaNode.Check(); err != nil {
+			return true
+		}
+		content, err := deltaNode.Get("content").String()
+		if err == nil {
+			rw.contentBuilder.WriteString(content)
+		}
+		reasoningContent, err := deltaNode.Get("reasoning_content").String()
+		if err == nil {
+			rw.reasoningContent.WriteString(reasoningContent)
+		}
+		_ = deltaNode.Get("tool_calls").
+			ForEach(func(_ ast.Sequence, toolCallNode *ast.Node) bool {
+				toolCallRaw, err := toolCallNode.Raw()
+				if err != nil {
+					return true
+				}
+				var toolCall relaymodel.ToolCall
+				if err := sonic.UnmarshalString(toolCallRaw, &toolCall); err != nil {
+					return true
+				}
+				rw.toolCalls = mergeToolCalls(rw.toolCalls, &toolCall)
+				return true
+			})
+		finishReason, err := choiceNode.Get("finish_reason").String()
+		if err == nil && finishReason != "" {
+			rw.finishReason = finishReason
+		}
+		logprobsContentNode := choiceNode.Get("logprobs").Get("content")
+		if err := logprobsContentNode.Check(); err == nil {
+			l, err := logprobsContentNode.Len()
+			if err != nil {
+				return true
+			}
+			rw.logprobsContent = slices.Grow(rw.logprobsContent, l)
+			_ = logprobsContentNode.ForEach(
+				func(_ ast.Sequence, logprobsContentNode *ast.Node) bool {
+					rw.logprobsContent = append(rw.logprobsContent, *logprobsContentNode)
+					return true
+				},
+			)
+		}
+		return true
+	})
+}
+
+func (rw *fakeStreamResponseWriter) convertToNonStream() ([]byte, error) {
+	lastChunk := rw.lastChunk
+	if lastChunk == nil {
+		return nil, errors.New("last chunk is nil")
+	}
+
+	_, err := lastChunk.Set("object", ast.NewString(relaymodel.ChatCompletionObject))
+	if err != nil {
+		return nil, err
+	}
+	if rw.usageNode != nil {
+		_, err = lastChunk.Set("usage", *rw.usageNode)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	message := map[string]any{
+		"role":    "assistant",
+		"content": rw.contentBuilder.String(),
+	}
+
+	reasoningContent := rw.reasoningContent.String()
+	if reasoningContent != "" {
+		message["reasoning_content"] = reasoningContent
+	}
+
+	if len(rw.toolCalls) > 0 {
+		slices.SortFunc(rw.toolCalls, func(a, b *relaymodel.ToolCall) int {
+			return a.Index - b.Index
+		})
+		message["tool_calls"] = rw.toolCalls
+	}
+	if len(rw.logprobsContent) > 0 {
+		message["logprobs"] = map[string]any{
+			"content": rw.logprobsContent,
+		}
+	}
+
+	_, err = lastChunk.SetAny("choices", []any{
+		map[string]any{
+			"index":         0,
+			"message":       message,
+			"finish_reason": rw.finishReason,
+		},
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return lastChunk.MarshalJSON()
+}
+
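+// mergeToolCalls folds a streaming tool call delta into the accumulated list, matching existing entries by Index.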
+func mergeToolCalls(
+	oldToolCalls []*relaymodel.ToolCall,
+	newToolCall *relaymodel.ToolCall,
+) []*relaymodel.ToolCall {
+	foundToolCallIndex := slices.IndexFunc(oldToolCalls, func(t *relaymodel.ToolCall) bool {
+		return t.Index == newToolCall.Index
+	})
+	if foundToolCallIndex != -1 {
+		oldToolCall := oldToolCalls[foundToolCallIndex]
+		oldToolCalls[foundToolCallIndex] = mergeToolCall(oldToolCall, newToolCall)
+	} else {
+		oldToolCalls = append(oldToolCalls, newToolCall)
+	}
+	return oldToolCalls
+}
+
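+// mergeToolCall combines two deltas for the same tool call, keeping the earlier metadata and concatenating the function argument fragments.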
+func mergeToolCall(oldToolCall, newToolCall *relaymodel.ToolCall) *relaymodel.ToolCall {
+	if oldToolCall == nil {
+		return newToolCall
+	}
+
+	if newToolCall == nil {
+		return oldToolCall
+	}
+
+	merged := &relaymodel.ToolCall{
+		Index:    oldToolCall.Index,
+		ID:       oldToolCall.ID,
+		Type:     oldToolCall.Type,
+		Function: oldToolCall.Function,
+	}
+
+	merged.Function.Arguments += newToolCall.Function.Arguments
+
+	return merged
+}

+ 3 - 0
core/relay/plugin/thinksplit/split.go

@@ -95,6 +95,9 @@ func (rw *thinkResponseWriter) getThinkSplitter() *splitter.Splitter {
 	return rw.thinkSplitter
 }
 
+// ignore WriteHeaderNow
+func (rw *thinkResponseWriter) WriteHeaderNow() {}
+
 func (rw *thinkResponseWriter) Write(b []byte) (int, error) {
 	if rw.done {
 		return rw.ResponseWriter.Write(b)