pdf.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. package doc2x
  2. import (
  3. "context"
  4. "encoding/base64"
  5. "errors"
  6. "fmt"
  7. "net/http"
  8. "net/url"
  9. "regexp"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "time"
  14. "github.com/bytedance/sonic"
  15. "github.com/gin-gonic/gin"
  16. "github.com/labring/aiproxy/core/common"
  17. "github.com/labring/aiproxy/core/model"
  18. "github.com/labring/aiproxy/core/relay/adaptor"
  19. "github.com/labring/aiproxy/core/relay/meta"
  20. relaymodel "github.com/labring/aiproxy/core/relay/model"
  21. log "github.com/sirupsen/logrus"
  22. )
  23. func ConvertParsePdfRequest(
  24. meta *meta.Meta,
  25. req *http.Request,
  26. ) (adaptor.ConvertResult, error) {
  27. err := req.ParseMultipartForm(1024 * 1024 * 4)
  28. if err != nil {
  29. return adaptor.ConvertResult{}, err
  30. }
  31. file, _, err := req.FormFile("file")
  32. if err != nil {
  33. return adaptor.ConvertResult{}, err
  34. }
  35. responseFormat := req.FormValue("response_format")
  36. meta.Set("response_format", responseFormat)
  37. return adaptor.ConvertResult{
  38. Header: http.Header{
  39. "Content-Type": {"multipart/form-data"},
  40. },
  41. Body: file,
  42. }, nil
  43. }
  44. type ParsePdfResponse struct {
  45. Code string `json:"code"`
  46. Data ParsePdfResponseData `json:"data"`
  47. Msg string `json:"msg"`
  48. }
  49. type ParsePdfResponseData struct {
  50. UID string `json:"uid"`
  51. }
  52. func HandleParsePdfResponse(
  53. meta *meta.Meta,
  54. c *gin.Context,
  55. resp *http.Response,
  56. ) (model.Usage, adaptor.Error) {
  57. var response ParsePdfResponse
  58. err := sonic.ConfigDefault.NewDecoder(resp.Body).Decode(&response)
  59. if err != nil {
  60. return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
  61. "decode response failed: "+err.Error(),
  62. "decode_response_failed",
  63. http.StatusBadRequest,
  64. )
  65. }
  66. if response.Code != "success" {
  67. return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
  68. "parse pdf failed: "+response.Msg,
  69. "parse_pdf_failed",
  70. http.StatusBadRequest,
  71. )
  72. }
  73. for {
  74. status, err := GetStatus(context.Background(), meta, response.Data.UID)
  75. if err != nil {
  76. return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
  77. "get status failed: "+err.Error(),
  78. "get_status_failed",
  79. http.StatusInternalServerError,
  80. )
  81. }
  82. switch status.Status {
  83. case StatusResponseDataStatusSuccess:
  84. return handleParsePdfResponse(meta, c, status.Result)
  85. case StatusResponseDataStatusProcessing:
  86. time.Sleep(1 * time.Second)
  87. case StatusResponseDataStatusFailed:
  88. return model.Usage{}, relaymodel.WrapperOpenAIErrorWithMessage(
  89. "parse pdf failed: "+status.Detail,
  90. "parse_pdf_failed",
  91. http.StatusBadRequest,
  92. )
  93. }
  94. }
  95. }
  96. // Start of Selection
  97. var (
  98. tableRegex = regexp.MustCompile(`<table>[\s\S]*?</table>`)
  99. rowRegex = regexp.MustCompile(`<tr>(.*?)</tr>`)
  100. cellRegex = regexp.MustCompile(`<td[^>]*/>|<td[^>]*>(.*?)</td>`)
  101. whitespaceRegex = regexp.MustCompile(`\n\s*`)
  102. tdCleanRegex = regexp.MustCompile(`<td.*?>|</td>`)
  103. colspanRegex = regexp.MustCompile(`colspan="(\d+)"`)
  104. rowspanRegex = regexp.MustCompile(`rowspan="(\d+)"`)
  105. htmlImageRegex = regexp.MustCompile(`<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)`)
  106. imageRegex = regexp.MustCompile(`!\[(.*?)\]\((http[^)]+)\)`)
  107. mediaCommentRegex = regexp.MustCompile(`<!-- Media -->`)
  108. footnoteCommentRegex = regexp.MustCompile(`<!-- Footnote -->`)
  109. )
  110. func HTMLTable2Md(content string) string {
  111. return tableRegex.ReplaceAllStringFunc(content, func(htmlTable string) string {
  112. cleanHTML := whitespaceRegex.ReplaceAllString(htmlTable, "")
  113. rows := rowRegex.FindAllString(cleanHTML, -1)
  114. if len(rows) == 0 {
  115. return htmlTable
  116. }
  117. var tableData [][]string
  118. maxColumns := 0
  119. for rowIndex, row := range rows {
  120. for len(tableData) <= rowIndex {
  121. tableData = append(tableData, []string{})
  122. }
  123. colIndex := 0
  124. cells := cellRegex.FindAllString(row, -1)
  125. if len(cells) > maxColumns {
  126. maxColumns = len(cells)
  127. }
  128. for _, cell := range cells {
  129. colspan := 1
  130. if matches := colspanRegex.FindStringSubmatch(cell); len(matches) > 1 {
  131. colspan, _ = strconv.Atoi(matches[1])
  132. }
  133. rowspan := 1
  134. if matches := rowspanRegex.FindStringSubmatch(cell); len(matches) > 1 {
  135. rowspan, _ = strconv.Atoi(matches[1])
  136. }
  137. content := strings.TrimSpace(tdCleanRegex.ReplaceAllString(cell, ""))
  138. for i := range rowspan {
  139. for j := range colspan {
  140. for len(tableData) <= rowIndex+i {
  141. tableData = append(tableData, []string{})
  142. }
  143. for len(tableData[rowIndex+i]) <= colIndex+j {
  144. tableData[rowIndex+i] = append(tableData[rowIndex+i], "")
  145. }
  146. if i == 0 && j == 0 {
  147. tableData[rowIndex+i][colIndex+j] = content
  148. } else {
  149. tableData[rowIndex+i][colIndex+j] = "^^"
  150. }
  151. }
  152. }
  153. colIndex += colspan
  154. }
  155. }
  156. for i := range tableData {
  157. for len(tableData[i]) < maxColumns {
  158. tableData[i] = append(tableData[i], " ")
  159. }
  160. }
  161. var chunks []string
  162. headerCells := make([]string, maxColumns)
  163. for i := range maxColumns {
  164. if i < len(tableData[0]) {
  165. headerCells[i] = tableData[0][i]
  166. } else {
  167. headerCells[i] = " "
  168. }
  169. }
  170. chunks = append(chunks, fmt.Sprintf("| %s |", strings.Join(headerCells, " | ")))
  171. separatorCells := make([]string, maxColumns)
  172. for i := range maxColumns {
  173. separatorCells[i] = "---"
  174. }
  175. chunks = append(chunks, fmt.Sprintf("| %s |", strings.Join(separatorCells, " | ")))
  176. for _, row := range tableData[1:] {
  177. chunks = append(chunks, fmt.Sprintf("| %s |", strings.Join(row, " | ")))
  178. }
  179. return strings.Join(chunks, "\n")
  180. })
  181. }
  182. func HTMLImage2Md(content string) string {
  183. return htmlImageRegex.ReplaceAllString(content, "![img]($1)")
  184. }
  185. func InlineMdImage(ctx context.Context, text string) string {
  186. text = HTMLImage2Md(text)
  187. matches := imageRegex.FindAllStringSubmatchIndex(text, -1)
  188. if len(matches) == 0 {
  189. return text
  190. }
  191. var (
  192. resultText strings.Builder
  193. wg sync.WaitGroup
  194. mutex sync.Mutex
  195. )
  196. type imageInfo struct {
  197. startPos int
  198. endPos int
  199. altText string
  200. url string
  201. replacement string
  202. }
  203. imageInfos := make([]imageInfo, len(matches))
  204. for i, match := range matches {
  205. altTextStart, altTextEnd := match[2], match[3]
  206. urlStart, urlEnd := match[4], match[5]
  207. imageInfos[i] = imageInfo{
  208. startPos: match[0],
  209. endPos: match[1],
  210. altText: text[altTextStart:altTextEnd],
  211. url: text[urlStart:urlEnd],
  212. }
  213. }
  214. for i := range imageInfos {
  215. wg.Add(1)
  216. go func(index int) {
  217. defer wg.Done()
  218. info := &imageInfos[index]
  219. replacement, err := imageURL2MdBase64(ctx, info.url, info.altText)
  220. if err != nil {
  221. log.Printf("failed to process image %s: %v", info.url, err)
  222. // when the image is not found, keep the original link
  223. mutex.Lock()
  224. info.replacement = text[info.startPos:info.endPos]
  225. mutex.Unlock()
  226. return
  227. }
  228. mutex.Lock()
  229. info.replacement = replacement
  230. mutex.Unlock()
  231. }(i)
  232. }
  233. wg.Wait()
  234. lastPos := 0
  235. for _, info := range imageInfos {
  236. resultText.WriteString(text[lastPos:info.startPos])
  237. resultText.WriteString(info.replacement)
  238. lastPos = info.endPos
  239. }
  240. resultText.WriteString(text[lastPos:])
  241. return resultText.String()
  242. }
  243. func imageURL2MdBase64(ctx context.Context, url, altText string) (string, error) {
  244. req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
  245. if err != nil {
  246. return "", fmt.Errorf("failed to create request: %w", err)
  247. }
  248. var (
  249. resp *http.Response
  250. downloadErr error
  251. )
  252. retries := 0
  253. maxRetries := 3
  254. for retries <= maxRetries {
  255. resp, downloadErr = http.DefaultClient.Do(req)
  256. if downloadErr != nil {
  257. return "", fmt.Errorf("failed to download image: %w", downloadErr)
  258. }
  259. if resp.StatusCode == http.StatusNotFound {
  260. resp.Body.Close()
  261. if retries == maxRetries {
  262. return "", fmt.Errorf(
  263. "failed to download image, status code: %d after %d retries",
  264. resp.StatusCode,
  265. retries,
  266. )
  267. }
  268. retries++
  269. time.Sleep(1 * time.Second)
  270. continue
  271. }
  272. if resp.StatusCode != http.StatusOK {
  273. resp.Body.Close()
  274. return "", fmt.Errorf("failed to download image, status code: %d", resp.StatusCode)
  275. }
  276. break
  277. }
  278. defer resp.Body.Close()
  279. data, err := common.GetResponseBody(resp)
  280. if err != nil {
  281. return "", fmt.Errorf("failed to read image data: %w", err)
  282. }
  283. mime := resp.Header.Get("Content-Type")
  284. if mime == "" {
  285. mime = inferMimeType(url)
  286. }
  287. base64Data := base64.StdEncoding.EncodeToString(data)
  288. return fmt.Sprintf("![%s](data:%s;base64,%s)", altText, mime, base64Data), nil
  289. }
  290. func inferMimeType(u string) string {
  291. p, err := url.Parse(u)
  292. if err != nil {
  293. return "image/jpeg"
  294. }
  295. lowerURL := strings.ToLower(p.Path)
  296. switch {
  297. case strings.HasSuffix(lowerURL, ".png"):
  298. return "image/png"
  299. case strings.HasSuffix(lowerURL, ".gif"):
  300. return "image/gif"
  301. case strings.HasSuffix(lowerURL, ".webp"):
  302. return "image/webp"
  303. case strings.HasSuffix(lowerURL, ".svg"):
  304. return "image/svg+xml"
  305. default:
  306. return "image/jpeg"
  307. }
  308. }
  309. func handleConvertPdfToMd(ctx context.Context, str string) string {
  310. result := InlineMdImage(ctx, str)
  311. result = HTMLTable2Md(result)
  312. result = mediaCommentRegex.ReplaceAllString(result, "")
  313. result = footnoteCommentRegex.ReplaceAllString(result, "")
  314. return result
  315. }
  316. func handleParsePdfResponse(
  317. meta *meta.Meta,
  318. c *gin.Context,
  319. response *StatusResponseDataResult,
  320. ) (model.Usage, adaptor.Error) {
  321. mds := make([]string, 0, len(response.Pages))
  322. totalLength := 0
  323. for _, page := range response.Pages {
  324. mds = append(mds, page.MD)
  325. totalLength += len(page.MD)
  326. }
  327. pages := int64(len(response.Pages))
  328. switch meta.GetString("response_format") {
  329. case "list":
  330. for i, md := range mds {
  331. result := handleConvertPdfToMd(c.Request.Context(), md)
  332. mds[i] = result
  333. }
  334. c.JSON(http.StatusOK, relaymodel.ParsePdfListResponse{
  335. Markdowns: mds,
  336. })
  337. default:
  338. builder := strings.Builder{}
  339. builder.Grow(totalLength)
  340. for _, md := range mds {
  341. builder.WriteString(md)
  342. }
  343. result := handleConvertPdfToMd(c.Request.Context(), builder.String())
  344. c.JSON(http.StatusOK, relaymodel.ParsePdfResponse{
  345. Pages: pages,
  346. Markdown: result,
  347. })
  348. }
  349. return model.Usage{
  350. InputTokens: model.ZeroNullInt64(pages),
  351. TotalTokens: model.ZeroNullInt64(pages),
  352. }, nil
  353. }
  354. type StatusResponse struct {
  355. Code string `json:"code"`
  356. Msg string `json:"msg"`
  357. Data *StatusResponseData `json:"data"`
  358. }
  359. const (
  360. StatusResponseDataStatusSuccess = "success"
  361. StatusResponseDataStatusProcessing = "processing"
  362. StatusResponseDataStatusFailed = "failed"
  363. )
  364. type StatusResponseData struct {
  365. Progress int `json:"progress"`
  366. Status string `json:"status"`
  367. Detail string `json:"detail"`
  368. Result *StatusResponseDataResult `json:"result"`
  369. }
  370. type StatusResponseDataResult struct {
  371. Version string `json:"version"`
  372. Pages []StatusResponseDataResultPage `json:"pages"`
  373. }
  374. type StatusResponseDataResultPage struct {
  375. URL string `json:"url"`
  376. PageIdx int `json:"page_idx"`
  377. PageWidth int `json:"page_width"`
  378. PageHeight int `json:"page_height"`
  379. MD string `json:"md"`
  380. }
  381. func GetStatus(ctx context.Context, meta *meta.Meta, uid string) (*StatusResponseData, error) {
  382. url := fmt.Sprintf("%s/api/v2/parse/status?uid=%s", meta.Channel.BaseURL, uid)
  383. req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
  384. if err != nil {
  385. return nil, err
  386. }
  387. req.Header.Set("Authorization", "Bearer "+meta.Channel.Key)
  388. resp, err := http.DefaultClient.Do(req)
  389. if err != nil {
  390. return nil, err
  391. }
  392. defer resp.Body.Close()
  393. var response StatusResponse
  394. err = sonic.ConfigDefault.NewDecoder(resp.Body).Decode(&response)
  395. if err != nil {
  396. return nil, err
  397. }
  398. if response.Code != "success" {
  399. return nil, errors.New("get status failed: " + response.Msg)
  400. }
  401. return response.Data, nil
  402. }