// webfetch.ts (3.5 KB)
  1. import { z } from "zod"
  2. import { Tool } from "./tool"
  3. import TurndownService from "turndown"
  4. import DESCRIPTION from "./webfetch.txt"
  5. const MAX_RESPONSE_SIZE = 5 * 1024 * 1024 // 5MB
  6. const DEFAULT_TIMEOUT = 30 * 1000 // 30 seconds
  7. const MAX_TIMEOUT = 120 * 1000 // 2 minutes
  8. export const WebFetchTool = Tool.define({
  9. id: "opencode.webfetch",
  10. description: DESCRIPTION,
  11. parameters: z.object({
  12. url: z.string().describe("The URL to fetch content from"),
  13. format: z
  14. .enum(["text", "markdown", "html"])
  15. .describe(
  16. "The format to return the content in (text, markdown, or html)",
  17. ),
  18. timeout: z
  19. .number()
  20. .min(0)
  21. .max(MAX_TIMEOUT / 1000)
  22. .describe("Optional timeout in seconds (max 120)")
  23. .nullable(),
  24. }),
  25. async execute(params) {
  26. // Validate URL
  27. if (
  28. !params.url.startsWith("http://") &&
  29. !params.url.startsWith("https://")
  30. ) {
  31. throw new Error("URL must start with http:// or https://")
  32. }
  33. const timeout = Math.min(
  34. (params.timeout ?? DEFAULT_TIMEOUT / 1000) * 1000,
  35. MAX_TIMEOUT,
  36. )
  37. const controller = new AbortController()
  38. const timeoutId = setTimeout(() => controller.abort(), timeout)
  39. const response = await fetch(params.url, {
  40. signal: controller.signal,
  41. headers: {
  42. "User-Agent":
  43. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  44. Accept:
  45. "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
  46. "Accept-Language": "en-US,en;q=0.9",
  47. },
  48. })
  49. clearTimeout(timeoutId)
  50. if (!response.ok) {
  51. throw new Error(`Request failed with status code: ${response.status}`)
  52. }
  53. // Check content length
  54. const contentLength = response.headers.get("content-length")
  55. if (contentLength && parseInt(contentLength) > MAX_RESPONSE_SIZE) {
  56. throw new Error("Response too large (exceeds 5MB limit)")
  57. }
  58. const arrayBuffer = await response.arrayBuffer()
  59. if (arrayBuffer.byteLength > MAX_RESPONSE_SIZE) {
  60. throw new Error("Response too large (exceeds 5MB limit)")
  61. }
  62. const content = new TextDecoder().decode(arrayBuffer)
  63. const contentType = response.headers.get("content-type") || ""
  64. switch (params.format) {
  65. case "text":
  66. if (contentType.includes("text/html")) {
  67. const text = extractTextFromHTML(content)
  68. return { output: text, metadata: {} }
  69. }
  70. return { output: content, metadata: {} }
  71. case "markdown":
  72. if (contentType.includes("text/html")) {
  73. const markdown = convertHTMLToMarkdown(content)
  74. return { output: markdown, metadata: {} }
  75. }
  76. return { output: "```\n" + content + "\n```", metadata: {} }
  77. case "html":
  78. return { output: content, metadata: {} }
  79. default:
  80. return { output: content, metadata: {} }
  81. }
  82. },
  83. })
  84. function extractTextFromHTML(html: string): string {
  85. const doc = new DOMParser().parseFromString(html, "text/html")
  86. const text = doc.body.textContent || doc.body.innerText || ""
  87. return text.replace(/\s+/g, " ").trim()
  88. }
  89. function convertHTMLToMarkdown(html: string): string {
  90. const turndownService = new TurndownService({
  91. headingStyle: "atx",
  92. hr: "---",
  93. bulletListMarker: "-",
  94. codeBlockStyle: "fenced",
  95. emDelimiter: "*",
  96. })
  97. turndownService.remove(["script", "style", "meta", "link"])
  98. return turndownService.turndown(html)
  99. }