webfetch.ts 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. import z from "zod"
  2. import { Tool } from "./tool"
  3. import TurndownService from "turndown"
  4. import DESCRIPTION from "./webfetch.txt"
  5. import { Config } from "../config/config"
  6. import { Permission } from "../permission"
  /** Largest response body accepted, in bytes (5 MiB). */
  const MAX_RESPONSE_SIZE = 5 * 1024 ** 2

  /** Timeout applied when the caller does not supply one, in milliseconds (30 seconds). */
  const DEFAULT_TIMEOUT = 30_000

  /** Upper bound for any caller-supplied timeout, in milliseconds (2 minutes). */
  const MAX_TIMEOUT = 120_000
  /**
   * Accept header sent for each requested output format. The requested type is
   * weighted q=1.0 and the alternatives carry lower weights as fallbacks.
   */
  const ACCEPT_HEADERS: Record<string, string> = {
    markdown: "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1",
    text: "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1",
    html: "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1",
  }

  /** Accept header used when the format has no entry in ACCEPT_HEADERS. */
  const FALLBACK_ACCEPT_HEADER =
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"

  /**
   * Issues the GET request and reads the complete body while a single timeout is armed.
   *
   * The timer is cleared in `finally`, so it is released when the request fails
   * as well as when it succeeds, and it stays active while the body is being
   * downloaded rather than only until the response headers arrive.
   *
   * @param url - Absolute URL to request.
   * @param acceptHeader - Value for the `Accept` request header.
   * @param timeoutMs - Milliseconds after which the request is aborted.
   * @param abort - External signal that also aborts the request.
   * @returns The raw `content-type` header (empty string when absent) and the body bytes.
   * @throws Error when the status is not OK or the body exceeds MAX_RESPONSE_SIZE.
   */
  async function fetchWithTimeout(
    url: string,
    acceptHeader: string,
    timeoutMs: number,
    abort: AbortSignal,
  ): Promise<{ contentType: string; body: ArrayBuffer }> {
    const controller = new AbortController()
    const timeoutId = setTimeout(() => controller.abort(), timeoutMs)
    try {
      const response = await fetch(url, {
        signal: AbortSignal.any([controller.signal, abort]),
        headers: {
          "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          Accept: acceptHeader,
          "Accept-Language": "en-US,en;q=0.9",
        },
      })
      if (!response.ok) {
        throw new Error(`Request failed with status code: ${response.status}`)
      }

      // Reject early when the server announces an oversized body.
      const contentLength = response.headers.get("content-length")
      if (contentLength && Number.parseInt(contentLength, 10) > MAX_RESPONSE_SIZE) {
        throw new Error("Response too large (exceeds 5MB limit)")
      }

      // Content-Length may be missing or wrong, so the real size is checked after reading.
      const body = await response.arrayBuffer()
      if (body.byteLength > MAX_RESPONSE_SIZE) {
        throw new Error("Response too large (exceeds 5MB limit)")
      }

      return { contentType: response.headers.get("content-type") || "", body }
    } finally {
      clearTimeout(timeoutId)
    }
  }

  /**
   * Tool that downloads a URL and returns its content as text, markdown or raw HTML.
   *
   * Flow: validate the URL scheme, ask for permission when the configuration
   * says "ask", fetch with a bounded timeout and size limit, then convert the
   * body when the response is HTML and the requested format is not "html".
   *
   * @throws Error when the URL is not http(s), the response status is not OK,
   *   or the response is larger than MAX_RESPONSE_SIZE.
   */
  export const WebFetchTool = Tool.define("webfetch", {
    description: DESCRIPTION,
    parameters: z.object({
      url: z.string().describe("The URL to fetch content from"),
      format: z
        .enum(["text", "markdown", "html"])
        .describe("The format to return the content in (text, markdown, or html)"),
      timeout: z.number().describe("Optional timeout in seconds (max 120)").optional(),
    }),
    async execute(params, ctx) {
      // Validate URL
      if (!params.url.startsWith("http://") && !params.url.startsWith("https://")) {
        throw new Error("URL must start with http:// or https://")
      }

      const cfg = await Config.get()
      if (cfg.permission?.webfetch === "ask")
        await Permission.ask({
          type: "webfetch",
          sessionID: ctx.sessionID,
          messageID: ctx.messageID,
          callID: ctx.callID,
          title: "Fetch content from: " + params.url,
          metadata: {
            url: params.url,
            format: params.format,
            timeout: params.timeout,
          },
        })

      // params.timeout is in seconds; the constants are in milliseconds.
      const timeout = Math.min(
        params.timeout !== undefined ? params.timeout * 1000 : DEFAULT_TIMEOUT,
        MAX_TIMEOUT,
      )
      const acceptHeader = ACCEPT_HEADERS[params.format] ?? FALLBACK_ACCEPT_HEADER

      const { contentType, body } = await fetchWithTimeout(params.url, acceptHeader, timeout, ctx.abort)

      const content = new TextDecoder().decode(body)
      const title = `${params.url} (${contentType})`

      // Media types are case-insensitive, so compare against a lowercased copy.
      const isHTML = contentType.toLowerCase().includes("text/html")

      // Only HTML responses are converted; everything else is returned as received.
      let output = content
      if (isHTML && params.format === "markdown") {
        output = convertHTMLToMarkdown(content)
      } else if (isHTML && params.format === "text") {
        output = await extractTextFromHTML(content)
      }

      return {
        output,
        title,
        metadata: {},
      }
    },
  })
  /**
   * Extracts the visible text of an HTML document using HTMLRewriter.
   *
   * Text inside script, style, noscript, iframe and object elements is left
   * out. A depth counter is raised on each such start tag and lowered by that
   * element's end-tag handler, so that:
   *   - text directly following a skipped element's end tag is kept, and
   *   - elements nested inside a skipped element do not re-enable output.
   *
   * @param html - HTML markup to process.
   * @returns The collected text with leading and trailing whitespace trimmed.
   */
  async function extractTextFromHTML(html: string): Promise<string> {
    let text = ""
    let skipDepth = 0

    const rewriter = new HTMLRewriter()
      .on("script, style, noscript, iframe, object, embed", {
        element(element) {
          // <embed> is a void element: it has no content and no end tag, so
          // there is nothing to skip and nothing that would lower the counter.
          if (element.tagName === "embed") return
          try {
            element.onEndTag(() => {
              if (skipDepth > 0) skipDepth--
            })
          } catch {
            // NOTE(review): assumes onEndTag throws when the element cannot
            // have an end tag (e.g. self-closing in SVG) — confirm against the
            // runtime. Such an element has no content, so it is not counted.
            return
          }
          skipDepth++
        },
      })
      .on("*", {
        text(chunk) {
          if (skipDepth === 0) {
            text += chunk.text
          }
        },
      })
      .transform(new Response(html))

    // Draining the transformed response is what runs the handlers above.
    await rewriter.text()
    return text.trim()
  }
  /**
   * Renders HTML as Markdown with Turndown.
   *
   * Output uses ATX headings, "---" rules, "-" bullets, fenced code blocks and
   * "*" for emphasis. script, style, meta and link elements are removed before
   * conversion.
   *
   * @param html - HTML markup to convert.
   * @returns The Markdown produced by Turndown.
   */
  function convertHTMLToMarkdown(html: string): string {
    return new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "*",
    })
      .remove(["script", "style", "meta", "link"])
      .turndown(html)
  }