2
0

fetch_helpers.go 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. package tools
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "net/http"
  10. "strings"
  11. "unicode/utf8"
  12. md "github.com/JohannesKaufmann/html-to-markdown"
  13. )
  14. // FetchURLAndConvert fetches a URL and converts HTML content to markdown.
  15. func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
  16. req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
  17. if err != nil {
  18. return "", fmt.Errorf("failed to create request: %w", err)
  19. }
  20. req.Header.Set("User-Agent", "crush/1.0")
  21. resp, err := client.Do(req)
  22. if err != nil {
  23. return "", fmt.Errorf("failed to fetch URL: %w", err)
  24. }
  25. defer resp.Body.Close()
  26. if resp.StatusCode != http.StatusOK {
  27. return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
  28. }
  29. maxSize := int64(5 * 1024 * 1024) // 5MB
  30. body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
  31. if err != nil {
  32. return "", fmt.Errorf("failed to read response body: %w", err)
  33. }
  34. content := string(body)
  35. if !utf8.ValidString(content) {
  36. return "", errors.New("response content is not valid UTF-8")
  37. }
  38. contentType := resp.Header.Get("Content-Type")
  39. // Convert HTML to markdown for better AI processing.
  40. if strings.Contains(contentType, "text/html") {
  41. markdown, err := ConvertHTMLToMarkdown(content)
  42. if err != nil {
  43. return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
  44. }
  45. content = markdown
  46. } else if strings.Contains(contentType, "application/json") || strings.Contains(contentType, "text/json") {
  47. // Format JSON for better readability.
  48. formatted, err := FormatJSON(content)
  49. if err == nil {
  50. content = formatted
  51. }
  52. // If formatting fails, keep original content.
  53. }
  54. return content, nil
  55. }
  56. // ConvertHTMLToMarkdown converts HTML content to markdown format.
  57. func ConvertHTMLToMarkdown(html string) (string, error) {
  58. converter := md.NewConverter("", true, nil)
  59. markdown, err := converter.ConvertString(html)
  60. if err != nil {
  61. return "", err
  62. }
  63. return markdown, nil
  64. }
  65. // FormatJSON formats JSON content with proper indentation.
  66. func FormatJSON(content string) (string, error) {
  67. var data interface{}
  68. if err := json.Unmarshal([]byte(content), &data); err != nil {
  69. return "", err
  70. }
  71. var buf bytes.Buffer
  72. encoder := json.NewEncoder(&buf)
  73. encoder.SetIndent("", " ")
  74. if err := encoder.Encode(data); err != nil {
  75. return "", err
  76. }
  77. return buf.String(), nil
  78. }