fetch_helpers.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. package tools
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "net/http"
  10. "regexp"
  11. "strings"
  12. "unicode/utf8"
  13. md "github.com/JohannesKaufmann/html-to-markdown"
  14. "golang.org/x/net/html"
  15. )
// BrowserUserAgent is a realistic browser User-Agent for better compatibility.
// The value identifies itself as Chrome 120 on 64-bit Windows 10.
const BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

// multipleNewlinesRe matches any run of three or more consecutive newline
// characters. It is compiled once at package initialization.
var multipleNewlinesRe = regexp.MustCompile(`\n{3,}`)
  19. // FetchURLAndConvert fetches a URL and converts HTML content to markdown.
  20. func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
  21. req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
  22. if err != nil {
  23. return "", fmt.Errorf("failed to create request: %w", err)
  24. }
  25. // Use realistic browser headers for better compatibility.
  26. req.Header.Set("User-Agent", BrowserUserAgent)
  27. req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
  28. req.Header.Set("Accept-Language", "en-US,en;q=0.5")
  29. resp, err := client.Do(req)
  30. if err != nil {
  31. return "", fmt.Errorf("failed to fetch URL: %w", err)
  32. }
  33. defer resp.Body.Close()
  34. if resp.StatusCode != http.StatusOK {
  35. return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
  36. }
  37. maxSize := int64(5 * 1024 * 1024) // 5MB
  38. body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
  39. if err != nil {
  40. return "", fmt.Errorf("failed to read response body: %w", err)
  41. }
  42. content := string(body)
  43. if !utf8.ValidString(content) {
  44. return "", errors.New("response content is not valid UTF-8")
  45. }
  46. contentType := resp.Header.Get("Content-Type")
  47. // Convert HTML to markdown for better AI processing.
  48. if strings.Contains(contentType, "text/html") {
  49. // Remove noisy elements before conversion.
  50. cleanedHTML := removeNoisyElements(content)
  51. markdown, err := ConvertHTMLToMarkdown(cleanedHTML)
  52. if err != nil {
  53. return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
  54. }
  55. content = cleanupMarkdown(markdown)
  56. } else if strings.Contains(contentType, "application/json") || strings.Contains(contentType, "text/json") {
  57. // Format JSON for better readability.
  58. formatted, err := FormatJSON(content)
  59. if err == nil {
  60. content = formatted
  61. }
  62. // If formatting fails, keep original content.
  63. }
  64. return content, nil
  65. }
  66. // removeNoisyElements removes script, style, nav, header, footer, and other
  67. // noisy elements from HTML to improve content extraction.
  68. func removeNoisyElements(htmlContent string) string {
  69. doc, err := html.Parse(strings.NewReader(htmlContent))
  70. if err != nil {
  71. // If parsing fails, return original content.
  72. return htmlContent
  73. }
  74. // Elements to remove entirely.
  75. noisyTags := map[string]bool{
  76. "script": true,
  77. "style": true,
  78. "nav": true,
  79. "header": true,
  80. "footer": true,
  81. "aside": true,
  82. "noscript": true,
  83. "iframe": true,
  84. "svg": true,
  85. }
  86. var removeNodes func(*html.Node)
  87. removeNodes = func(n *html.Node) {
  88. var toRemove []*html.Node
  89. for c := n.FirstChild; c != nil; c = c.NextSibling {
  90. if c.Type == html.ElementNode && noisyTags[c.Data] {
  91. toRemove = append(toRemove, c)
  92. } else {
  93. removeNodes(c)
  94. }
  95. }
  96. for _, node := range toRemove {
  97. n.RemoveChild(node)
  98. }
  99. }
  100. removeNodes(doc)
  101. var buf bytes.Buffer
  102. if err := html.Render(&buf, doc); err != nil {
  103. return htmlContent
  104. }
  105. return buf.String()
  106. }
  107. // cleanupMarkdown removes excessive whitespace and blank lines from markdown.
  108. func cleanupMarkdown(content string) string {
  109. // Collapse multiple blank lines into at most two.
  110. content = multipleNewlinesRe.ReplaceAllString(content, "\n\n")
  111. // Remove trailing whitespace from each line.
  112. lines := strings.Split(content, "\n")
  113. for i, line := range lines {
  114. lines[i] = strings.TrimRight(line, " \t")
  115. }
  116. content = strings.Join(lines, "\n")
  117. // Trim leading/trailing whitespace.
  118. content = strings.TrimSpace(content)
  119. return content
  120. }
  121. // ConvertHTMLToMarkdown converts HTML content to markdown format.
  122. func ConvertHTMLToMarkdown(htmlContent string) (string, error) {
  123. converter := md.NewConverter("", true, nil)
  124. markdown, err := converter.ConvertString(htmlContent)
  125. if err != nil {
  126. return "", err
  127. }
  128. return markdown, nil
  129. }
  130. // FormatJSON formats JSON content with proper indentation.
  131. func FormatJSON(content string) (string, error) {
  132. var data any
  133. if err := json.Unmarshal([]byte(content), &data); err != nil {
  134. return "", err
  135. }
  136. var buf bytes.Buffer
  137. encoder := json.NewEncoder(&buf)
  138. encoder.SetIndent("", " ")
  139. if err := encoder.Encode(data); err != nil {
  140. return "", err
  141. }
  142. return buf.String(), nil
  143. }