| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501 |
- package doc2x
- import (
- "context"
- "encoding/base64"
- "errors"
- "fmt"
- "net/http"
- "net/url"
- "regexp"
- "strconv"
- "strings"
- "sync"
- "time"
- "github.com/bytedance/sonic"
- "github.com/gin-gonic/gin"
- "github.com/labring/aiproxy/core/common"
- "github.com/labring/aiproxy/core/model"
- "github.com/labring/aiproxy/core/relay/adaptor"
- "github.com/labring/aiproxy/core/relay/meta"
- relaymodel "github.com/labring/aiproxy/core/relay/model"
- log "github.com/sirupsen/logrus"
- )
- func ConvertParsePdfRequest(
- meta *meta.Meta,
- req *http.Request,
- ) (adaptor.ConvertResult, error) {
- err := req.ParseMultipartForm(1024 * 1024 * 4)
- if err != nil {
- return adaptor.ConvertResult{}, err
- }
- file, _, err := req.FormFile("file")
- if err != nil {
- return adaptor.ConvertResult{}, err
- }
- responseFormat := req.FormValue("response_format")
- meta.Set("response_format", responseFormat)
- return adaptor.ConvertResult{
- Header: http.Header{
- "Content-Type": {"multipart/form-data"},
- },
- Body: file,
- }, nil
- }
// Wire types for the reply to a PDF parse submission.
type (
	// ParsePdfResponse is the JSON envelope decoded from the parse reply.
	ParsePdfResponse struct {
		// Code is compared against "success" by HandleParsePdfResponse.
		Code string               `json:"code"`
		Data ParsePdfResponseData `json:"data"`
		// Msg is appended to the error text when Code is not "success".
		Msg  string               `json:"msg"`
	}

	// ParsePdfResponseData carries the UID that is passed to GetStatus when
	// polling for the parse result.
	ParsePdfResponseData struct {
		UID string `json:"uid"`
	}
)
- func HandleParsePdfResponse(
- meta *meta.Meta,
- c *gin.Context,
- resp *http.Response,
- ) (model.Usage, adaptor.Error) {
- var response ParsePdfResponse
- err := sonic.ConfigDefault.NewDecoder(resp.Body).Decode(&response)
- if err != nil {
- return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
- "decode response failed: "+err.Error(),
- "decode_response_failed",
- http.StatusBadRequest,
- )
- }
- if response.Code != "success" {
- return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
- "parse pdf failed: "+response.Msg,
- "parse_pdf_failed",
- http.StatusBadRequest,
- )
- }
- for {
- status, err := GetStatus(context.Background(), meta, response.Data.UID)
- if err != nil {
- return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
- "get status failed: "+err.Error(),
- "get_status_failed",
- http.StatusInternalServerError,
- )
- }
- switch status.Status {
- case StatusResponseDataStatusSuccess:
- return handleParsePdfResponse(meta, c, status.Result)
- case StatusResponseDataStatusProcessing:
- time.Sleep(1 * time.Second)
- case StatusResponseDataStatusFailed:
- return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
- "parse pdf failed: "+status.Detail,
- "parse_pdf_failed",
- http.StatusBadRequest,
- )
- }
- }
- }
// Patterns used by HTMLTable2Md to pick an HTML table apart.
var (
	// tableRegex matches one <table>...</table> fragment, non-greedily and
	// across newlines.
	tableRegex = regexp.MustCompile(`<table>[\s\S]*?</table>`)

	// rowRegex matches a single <tr>...</tr> row.
	rowRegex = regexp.MustCompile(`<tr>(.*?)</tr>`)

	// cellRegex matches either a self-closing <td .../> or a <td ...>...</td> cell.
	cellRegex = regexp.MustCompile(`<td[^>]*/>|<td[^>]*>(.*?)</td>`)

	// whitespaceRegex matches a newline together with the whitespace after it.
	whitespaceRegex = regexp.MustCompile(`\n\s*`)

	// tdCleanRegex matches the opening and closing td tags around a cell's text.
	tdCleanRegex = regexp.MustCompile(`<td.*?>|</td>`)

	// colspanRegex captures the numeric value of a colspan attribute.
	colspanRegex = regexp.MustCompile(`colspan="(\d+)"`)

	// rowspanRegex captures the numeric value of a rowspan attribute.
	rowspanRegex = regexp.MustCompile(`rowspan="(\d+)"`)
)

// Patterns used when rewriting images and stripping marker comments.
var (
	// htmlImageRegex matches an HTML <img> tag and captures its src value.
	htmlImageRegex = regexp.MustCompile(`<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)`)

	// imageRegex matches a markdown image whose target starts with "http",
	// capturing the alt text and the URL.
	imageRegex = regexp.MustCompile(`!\[(.*?)\]\((http[^)]+)\)`)

	// mediaCommentRegex matches the literal "<!-- Media -->" marker.
	mediaCommentRegex = regexp.MustCompile(`<!-- Media -->`)

	// footnoteCommentRegex matches the literal "<!-- Footnote -->" marker.
	footnoteCommentRegex = regexp.MustCompile(`<!-- Footnote -->`)
)
- func HTMLTable2Md(content string) string {
- return tableRegex.ReplaceAllStringFunc(content, func(htmlTable string) string {
- cleanHTML := whitespaceRegex.ReplaceAllString(htmlTable, "")
- rows := rowRegex.FindAllString(cleanHTML, -1)
- if len(rows) == 0 {
- return htmlTable
- }
- var tableData [][]string
- maxColumns := 0
- for rowIndex, row := range rows {
- for len(tableData) <= rowIndex {
- tableData = append(tableData, []string{})
- }
- colIndex := 0
- cells := cellRegex.FindAllString(row, -1)
- if len(cells) > maxColumns {
- maxColumns = len(cells)
- }
- for _, cell := range cells {
- colspan := 1
- if matches := colspanRegex.FindStringSubmatch(cell); len(matches) > 1 {
- colspan, _ = strconv.Atoi(matches[1])
- }
- rowspan := 1
- if matches := rowspanRegex.FindStringSubmatch(cell); len(matches) > 1 {
- rowspan, _ = strconv.Atoi(matches[1])
- }
- content := strings.TrimSpace(tdCleanRegex.ReplaceAllString(cell, ""))
- for i := range rowspan {
- for j := range colspan {
- for len(tableData) <= rowIndex+i {
- tableData = append(tableData, []string{})
- }
- for len(tableData[rowIndex+i]) <= colIndex+j {
- tableData[rowIndex+i] = append(tableData[rowIndex+i], "")
- }
- if i == 0 && j == 0 {
- tableData[rowIndex+i][colIndex+j] = content
- } else {
- tableData[rowIndex+i][colIndex+j] = "^^"
- }
- }
- }
- colIndex += colspan
- }
- }
- for i := range tableData {
- for len(tableData[i]) < maxColumns {
- tableData[i] = append(tableData[i], " ")
- }
- }
- var chunks []string
- headerCells := make([]string, maxColumns)
- for i := range maxColumns {
- if i < len(tableData[0]) {
- headerCells[i] = tableData[0][i]
- } else {
- headerCells[i] = " "
- }
- }
- chunks = append(chunks, fmt.Sprintf("| %s |", strings.Join(headerCells, " | ")))
- separatorCells := make([]string, maxColumns)
- for i := range maxColumns {
- separatorCells[i] = "---"
- }
- chunks = append(chunks, fmt.Sprintf("| %s |", strings.Join(separatorCells, " | ")))
- for _, row := range tableData[1:] {
- chunks = append(chunks, fmt.Sprintf("| %s |", strings.Join(row, " | ")))
- }
- return strings.Join(chunks, "\n")
- })
- }
- func HTMLImage2Md(content string) string {
- return htmlImageRegex.ReplaceAllString(content, "")
- }
- func InlineMdImage(ctx context.Context, text string) string {
- text = HTMLImage2Md(text)
- matches := imageRegex.FindAllStringSubmatchIndex(text, -1)
- if len(matches) == 0 {
- return text
- }
- var (
- resultText strings.Builder
- wg sync.WaitGroup
- mutex sync.Mutex
- )
- type imageInfo struct {
- startPos int
- endPos int
- altText string
- url string
- replacement string
- }
- imageInfos := make([]imageInfo, len(matches))
- for i, match := range matches {
- altTextStart, altTextEnd := match[2], match[3]
- urlStart, urlEnd := match[4], match[5]
- imageInfos[i] = imageInfo{
- startPos: match[0],
- endPos: match[1],
- altText: text[altTextStart:altTextEnd],
- url: text[urlStart:urlEnd],
- }
- }
- for i := range imageInfos {
- wg.Add(1)
- go func(index int) {
- defer wg.Done()
- info := &imageInfos[index]
- replacement, err := imageURL2MdBase64(ctx, info.url, info.altText)
- if err != nil {
- log.Printf("failed to process image %s: %v", info.url, err)
- // when the image is not found, keep the original link
- mutex.Lock()
- info.replacement = text[info.startPos:info.endPos]
- mutex.Unlock()
- return
- }
- mutex.Lock()
- info.replacement = replacement
- mutex.Unlock()
- }(i)
- }
- wg.Wait()
- lastPos := 0
- for _, info := range imageInfos {
- resultText.WriteString(text[lastPos:info.startPos])
- resultText.WriteString(info.replacement)
- lastPos = info.endPos
- }
- resultText.WriteString(text[lastPos:])
- return resultText.String()
- }
- func imageURL2MdBase64(ctx context.Context, url, altText string) (string, error) {
- req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
- if err != nil {
- return "", fmt.Errorf("failed to create request: %w", err)
- }
- var (
- resp *http.Response
- downloadErr error
- )
- retries := 0
- maxRetries := 3
- for retries <= maxRetries {
- resp, downloadErr = http.DefaultClient.Do(req)
- if downloadErr != nil {
- return "", fmt.Errorf("failed to download image: %w", downloadErr)
- }
- if resp.StatusCode == http.StatusNotFound {
- resp.Body.Close()
- if retries == maxRetries {
- return "", fmt.Errorf(
- "failed to download image, status code: %d after %d retries",
- resp.StatusCode,
- retries,
- )
- }
- retries++
- time.Sleep(1 * time.Second)
- continue
- }
- if resp.StatusCode != http.StatusOK {
- resp.Body.Close()
- return "", fmt.Errorf("failed to download image, status code: %d", resp.StatusCode)
- }
- break
- }
- defer resp.Body.Close()
- data, err := common.GetResponseBody(resp)
- if err != nil {
- return "", fmt.Errorf("failed to read image data: %w", err)
- }
- mime := resp.Header.Get("Content-Type")
- if mime == "" {
- mime = inferMimeType(url)
- }
- base64Data := base64.StdEncoding.EncodeToString(data)
- return fmt.Sprintf("", altText, mime, base64Data), nil
- }
// inferMimeType guesses an image MIME type from the extension of the path
// component of u, ignoring case. The recognised suffixes are .png, .gif,
// .webp and .svg; anything else, including a URL that cannot be parsed,
// yields "image/jpeg".
func inferMimeType(u string) string {
	const fallback = "image/jpeg"

	parsed, err := url.Parse(u)
	if err != nil {
		return fallback
	}

	path := strings.ToLower(parsed.Path)

	known := [...]struct{ suffix, mime string }{
		{".png", "image/png"},
		{".gif", "image/gif"},
		{".webp", "image/webp"},
		{".svg", "image/svg+xml"},
	}
	for _, k := range known {
		if strings.HasSuffix(path, k.suffix) {
			return k.mime
		}
	}

	return fallback
}
- func handleConvertPdfToMd(ctx context.Context, str string) string {
- result := InlineMdImage(ctx, str)
- result = HTMLTable2Md(result)
- result = mediaCommentRegex.ReplaceAllString(result, "")
- result = footnoteCommentRegex.ReplaceAllString(result, "")
- return result
- }
- func handleParsePdfResponse(
- meta *meta.Meta,
- c *gin.Context,
- response *StatusResponseDataResult,
- ) (model.Usage, adaptor.Error) {
- mds := make([]string, 0, len(response.Pages))
- totalLength := 0
- for _, page := range response.Pages {
- mds = append(mds, page.MD)
- totalLength += len(page.MD)
- }
- pages := int64(len(response.Pages))
- switch meta.GetString("response_format") {
- case "list":
- for i, md := range mds {
- result := handleConvertPdfToMd(c.Request.Context(), md)
- mds[i] = result
- }
- c.JSON(http.StatusOK, relaymodel.ParsePdfListResponse{
- Markdowns: mds,
- })
- default:
- builder := strings.Builder{}
- builder.Grow(totalLength)
- for _, md := range mds {
- builder.WriteString(md)
- }
- result := handleConvertPdfToMd(c.Request.Context(), builder.String())
- c.JSON(http.StatusOK, relaymodel.ParsePdfResponse{
- Pages: pages,
- Markdown: result,
- })
- }
- return model.Usage{
- InputTokens: model.ZeroNullInt64(pages),
- TotalTokens: model.ZeroNullInt64(pages),
- }, nil
- }
// Values of StatusResponseData.Status that HandleParsePdfResponse acts on.
const (
	StatusResponseDataStatusSuccess    = "success"
	StatusResponseDataStatusProcessing = "processing"
	StatusResponseDataStatusFailed     = "failed"
)

// Wire types for the parse status endpoint queried by GetStatus.
type (
	// StatusResponse is the JSON envelope of a status reply. GetStatus
	// treats any Code other than "success" as an error and reports Msg.
	StatusResponse struct {
		Code string              `json:"code"`
		Msg  string              `json:"msg"`
		Data *StatusResponseData `json:"data"`
	}

	// StatusResponseData describes the state of one parse job.
	StatusResponseData struct {
		Progress int    `json:"progress"`
		// Status is compared against the StatusResponseDataStatus* constants.
		Status   string `json:"status"`
		// Detail is reported to the client when Status is "failed".
		Detail   string `json:"detail"`
		// Result is rendered to the client when Status is "success".
		Result   *StatusResponseDataResult `json:"result"`
	}

	// StatusResponseDataResult is the payload of a finished parse job.
	StatusResponseDataResult struct {
		Version string                         `json:"version"`
		Pages   []StatusResponseDataResultPage `json:"pages"`
	}

	// StatusResponseDataResultPage is one parsed page; MD holds its markdown.
	StatusResponseDataResultPage struct {
		URL        string `json:"url"`
		PageIdx    int    `json:"page_idx"`
		PageWidth  int    `json:"page_width"`
		PageHeight int    `json:"page_height"`
		MD         string `json:"md"`
	}
)
- func GetStatus(ctx context.Context, meta *meta.Meta, uid string) (*StatusResponseData, error) {
- url := fmt.Sprintf("%s/api/v2/parse/status?uid=%s", meta.Channel.BaseURL, uid)
- req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
- if err != nil {
- return nil, err
- }
- req.Header.Set("Authorization", "Bearer "+meta.Channel.Key)
- resp, err := http.DefaultClient.Do(req)
- if err != nil {
- return nil, err
- }
- defer resp.Body.Close()
- var response StatusResponse
- err = sonic.ConfigDefault.NewDecoder(resp.Body).Decode(&response)
- if err != nil {
- return nil, err
- }
- if response.Code != "success" {
- return nil, errors.New("get status failed: " + response.Msg)
- }
- return response.Data, nil
- }
|