Explorar o código

fix webfetch tool when returning html as text

Dax Raad hai 8 meses
pai
achega
7ca8334a8b
Modificouse 1 ficheiro con 40 adicións e 5 borrados
  1. packages/opencode/src/tool/webfetch.ts (+40 -5)

+ 40 - 5
packages/opencode/src/tool/webfetch.ts

@@ -76,7 +76,7 @@ export const WebFetchTool = Tool.define({
     switch (params.format) {
       case "text":
         if (contentType.includes("text/html")) {
-          const text = extractTextFromHTML(content)
+          const text = await extractTextFromHTML(content)
           return {
             output: text,
             metadata: {
@@ -127,10 +127,45 @@ export const WebFetchTool = Tool.define({
   },
 })
 
-function extractTextFromHTML(html: string): string {
-  const doc = new DOMParser().parseFromString(html, "text/html")
-  const text = doc.body.textContent || doc.body.innerText || ""
-  return text.replace(/\s+/g, " ").trim()
+async function extractTextFromHTML(html: string) {
+  let text = ""
+  let skipContent = false
+
+  const rewriter = new HTMLRewriter()
+    .on("script, style, noscript, iframe, object, embed", {
+      element() {
+        skipContent = true
+      },
+      text() {
+        // Skip text content inside these elements
+      },
+    })
+    .on("*", {
+      element(element) {
+        // Reset skip flag when entering other elements
+        if (
+          ![
+            "script",
+            "style",
+            "noscript",
+            "iframe",
+            "object",
+            "embed",
+          ].includes(element.tagName)
+        ) {
+          skipContent = false
+        }
+      },
+      text(input) {
+        if (!skipContent) {
+          text += input.text
+        }
+      },
+    })
+    .transform(new Response(html))
+
+  await rewriter.text()
+  return text.trim()
 }
 
 function convertHTMLToMarkdown(html: string): string {