package dto

import (
	"encoding/json"

	"github.com/QuantumNous/new-api/types"
	"github.com/gin-gonic/gin"
)

// AudioRequest is the JSON-serializable payload of an audio request.
// Fields tagged omitempty are left out of the encoded JSON when they hold
// their zero value.
type AudioRequest struct {
	Model          string  `json:"model"`
	Input          string  `json:"input"`
	Voice          string  `json:"voice"`
	Instructions   string  `json:"instructions,omitempty"`
	ResponseFormat string  `json:"response_format,omitempty"`
	Speed          float64 `json:"speed,omitempty"`
	StreamFormat   string  `json:"stream_format,omitempty"`
	// Metadata is held as raw JSON; this type does not decode it.
	Metadata json.RawMessage `json:"metadata,omitempty"`
}

// GetTokenCountMeta returns a new TokenCountMeta whose CombineText is the
// request's Input and whose TokenType is types.TokenTypeTextNumber.
func (r *AudioRequest) GetTokenCountMeta() *types.TokenCountMeta {
	return &types.TokenCountMeta{
		CombineText: r.Input,
		TokenType:   types.TokenTypeTextNumber,
	}
}

// IsStream always reports false; the gin context is not consulted.
func (r *AudioRequest) IsStream(c *gin.Context) bool {
	return false
}

// SetModelName replaces Model with modelName. An empty modelName is ignored
// and leaves the current Model untouched.
func (r *AudioRequest) SetModelName(modelName string) {
	if modelName == "" {
		return
	}
	r.Model = modelName
}

// AudioResponse is a JSON payload carrying a single text field.
type AudioResponse struct {
	Text string `json:"text"`
}

// WhisperVerboseJSONResponse is a JSON payload holding a task, language,
// duration, text and an optional list of segments. Every field is omitted
// from the encoded JSON when it holds its zero value.
// NOTE(review): presumably mirrors Whisper's "verbose_json" response
// format — confirm against the upstream API.
type WhisperVerboseJSONResponse struct {
	Task     string    `json:"task,omitempty"`
	Language string    `json:"language,omitempty"`
	Duration float64   `json:"duration,omitempty"`
	Text     string    `json:"text,omitempty"`
	Segments []Segment `json:"segments,omitempty"`
}

// Segment is one element of WhisperVerboseJSONResponse.Segments. All of its
// fields are always present in the encoded JSON (none use omitempty).
// NOTE(review): Start/End look like time offsets for the segment; units are
// not established in this file — confirm.
type Segment struct {
	Id               int     `json:"id"`
	Seek             int     `json:"seek"`
	Start            float64 `json:"start"`
	End              float64 `json:"end"`
	Text             string  `json:"text"`
	Tokens           []int   `json:"tokens"`
	Temperature      float64 `json:"temperature"`
	AvgLogprob       float64 `json:"avg_logprob"`
	CompressionRatio float64 `json:"compression_ratio"`
	NoSpeechProb     float64 `json:"no_speech_prob"`
}