webfetch.ts 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. import z from "zod"
  2. import { Tool } from "./tool"
  3. import TurndownService from "turndown"
  4. import DESCRIPTION from "./webfetch.txt"
  5. const MAX_RESPONSE_SIZE = 5 * 1024 * 1024 // 5MB
  6. const DEFAULT_TIMEOUT = 30 * 1000 // 30 seconds
  7. const MAX_TIMEOUT = 120 * 1000 // 2 minutes
  8. export const WebFetchTool = Tool.define("webfetch", {
  9. description: DESCRIPTION,
  10. parameters: z.object({
  11. url: z.string().describe("The URL to fetch content from"),
  12. format: z
  13. .enum(["text", "markdown", "html"])
  14. .default("markdown")
  15. .describe("The format to return the content in (text, markdown, or html). Defaults to markdown."),
  16. timeout: z.number().describe("Optional timeout in seconds (max 120)").optional(),
  17. }),
  18. async execute(params, ctx) {
  19. // Validate URL
  20. if (!params.url.startsWith("http://") && !params.url.startsWith("https://")) {
  21. throw new Error("URL must start with http:// or https://")
  22. }
  23. await ctx.ask({
  24. permission: "webfetch",
  25. patterns: [params.url],
  26. always: ["*"],
  27. metadata: {
  28. url: params.url,
  29. format: params.format,
  30. timeout: params.timeout,
  31. },
  32. })
  33. const timeout = Math.min((params.timeout ?? DEFAULT_TIMEOUT / 1000) * 1000, MAX_TIMEOUT)
  34. const controller = new AbortController()
  35. const timeoutId = setTimeout(() => controller.abort(), timeout)
  36. // Build Accept header based on requested format with q parameters for fallbacks
  37. let acceptHeader = "*/*"
  38. switch (params.format) {
  39. case "markdown":
  40. acceptHeader = "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
  41. break
  42. case "text":
  43. acceptHeader = "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1"
  44. break
  45. case "html":
  46. acceptHeader = "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
  47. break
  48. default:
  49. acceptHeader =
  50. "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
  51. }
  52. const signal = AbortSignal.any([controller.signal, ctx.abort])
  53. const headers = {
  54. "User-Agent":
  55. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
  56. Accept: acceptHeader,
  57. "Accept-Language": "en-US,en;q=0.9",
  58. }
  59. const initial = await fetch(params.url, { signal, headers })
  60. // Retry with honest UA if blocked by Cloudflare bot detection (TLS fingerprint mismatch)
  61. const response =
  62. initial.status === 403 && initial.headers.get("cf-mitigated") === "challenge"
  63. ? await fetch(params.url, { signal, headers: { ...headers, "User-Agent": "opencode" } })
  64. : initial
  65. clearTimeout(timeoutId)
  66. if (!response.ok) {
  67. throw new Error(`Request failed with status code: ${response.status}`)
  68. }
  69. // Check content length
  70. const contentLength = response.headers.get("content-length")
  71. if (contentLength && parseInt(contentLength) > MAX_RESPONSE_SIZE) {
  72. throw new Error("Response too large (exceeds 5MB limit)")
  73. }
  74. const arrayBuffer = await response.arrayBuffer()
  75. if (arrayBuffer.byteLength > MAX_RESPONSE_SIZE) {
  76. throw new Error("Response too large (exceeds 5MB limit)")
  77. }
  78. const content = new TextDecoder().decode(arrayBuffer)
  79. const contentType = response.headers.get("content-type") || ""
  80. const title = `${params.url} (${contentType})`
  81. // Handle content based on requested format and actual content type
  82. switch (params.format) {
  83. case "markdown":
  84. if (contentType.includes("text/html")) {
  85. const markdown = convertHTMLToMarkdown(content)
  86. return {
  87. output: markdown,
  88. title,
  89. metadata: {},
  90. }
  91. }
  92. return {
  93. output: content,
  94. title,
  95. metadata: {},
  96. }
  97. case "text":
  98. if (contentType.includes("text/html")) {
  99. const text = await extractTextFromHTML(content)
  100. return {
  101. output: text,
  102. title,
  103. metadata: {},
  104. }
  105. }
  106. return {
  107. output: content,
  108. title,
  109. metadata: {},
  110. }
  111. case "html":
  112. return {
  113. output: content,
  114. title,
  115. metadata: {},
  116. }
  117. default:
  118. return {
  119. output: content,
  120. title,
  121. metadata: {},
  122. }
  123. }
  124. },
  125. })
  126. async function extractTextFromHTML(html: string) {
  127. let text = ""
  128. let skipContent = false
  129. const rewriter = new HTMLRewriter()
  130. .on("script, style, noscript, iframe, object, embed", {
  131. element() {
  132. skipContent = true
  133. },
  134. text() {
  135. // Skip text content inside these elements
  136. },
  137. })
  138. .on("*", {
  139. element(element) {
  140. // Reset skip flag when entering other elements
  141. if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
  142. skipContent = false
  143. }
  144. },
  145. text(input) {
  146. if (!skipContent) {
  147. text += input.text
  148. }
  149. },
  150. })
  151. .transform(new Response(html))
  152. await rewriter.text()
  153. return text.trim()
  154. }
  155. function convertHTMLToMarkdown(html: string): string {
  156. const turndownService = new TurndownService({
  157. headingStyle: "atx",
  158. hr: "---",
  159. bulletListMarker: "-",
  160. codeBlockStyle: "fenced",
  161. emDelimiter: "*",
  162. })
  163. turndownService.remove(["script", "style", "meta", "link"])
  164. return turndownService.turndown(html)
  165. }