1 anno fa · e3144996fb
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -151,8 +151,10 @@
 
				     "@google/generative-ai": "^0.18.0",
			
 
				     "@types/clone-deep": "^4.0.4",
			
 
				     "@types/pdf-parse": "^1.1.4",
			
 
				+    "@types/turndown": "^5.0.5",
			
 
				     "@vscode/codicons": "^0.0.36",
			
 
				     "axios": "^1.7.4",
			
 
				+    "cheerio": "^1.0.0",
			
 
				     "clone-deep": "^4.0.1",
			
 
				     "default-shell": "^2.2.0",
			
 
				     "delay": "^6.0.0",
			
@@ -165,9 +167,12 @@
 
				     "os-name": "^6.0.0",
			
 
				     "p-wait-for": "^5.0.2",
			
 
				     "pdf-parse": "^1.1.1",
			
 
				+    "puppeteer-chromium-resolver": "^23.0.0",
			
 
				+    "puppeteer-core": "^23.4.0",
			
 
				     "serialize-error": "^11.0.3",
			
 
				     "strip-ansi": "^7.1.0",
			
 
				     "tree-sitter-wasms": "^0.1.11",
			
 
				+    "turndown": "^7.2.0",
			
 
				     "web-tree-sitter": "^0.22.6"
			
 
				   }
			
 
				 }
			
--- a/src/extension.ts
+++ b/src/extension.ts
@@ -3,6 +3,7 @@
 
				 import * as vscode from "vscode"
			
 
				 import { ClaudeDevProvider } from "./providers/ClaudeDevProvider"
			
 
				 import delay from "delay"
			
 
				+import { UrlScraper } from "./utils/UrlScraper"
			
 
				 
			
 
				 /*
			
 
				 Built using https://github.com/microsoft/vscode-webview-ui-toolkit
			
@@ -39,6 +40,11 @@ export function activate(context: vscode.ExtensionContext) {
 
				 
			
 
				 	const sidebarProvider = new ClaudeDevProvider(context, outputChannel)
			
 
				 
			
 
				+	// Installs chromium for puppeteer url scraping
			
 
				+	UrlScraper.ensureChromiumExists(context).catch((error) => {
			
 
				+		outputChannel.appendLine(`Error installing Chromium: ${JSON.stringify(error)}`)
			
 
				+	})
			
 
				+
			
 
				 	context.subscriptions.push(
			
 
				 		vscode.window.registerWebviewViewProvider(ClaudeDevProvider.sideBarId, sidebarProvider, {
			
 
				 			webviewOptions: { retainContextWhenHidden: true },
			
--- a/src/utils/UrlScraper.ts
+++ b/src/utils/UrlScraper.ts
@@ -0,0 +1,120 @@
 
				+import * as vscode from "vscode"
			
 
				+import * as fs from "fs/promises"
			
 
				+import * as path from "path"
			
 
				+import { Page } from "puppeteer-core"
			
 
				+import * as cheerio from "cheerio"
			
 
				+import TurndownService from "turndown"
			
 
				+import delay from "delay"
			
 
				+// @ts-ignore
			
 
				+import PCR from "puppeteer-chromium-resolver"
			
 
				+
			
 
				+const PUPPETEER_DIR = "puppeteer"
			
 
				+
			
 
				/**
				 * Loads web pages in a headless Chromium and converts them to Markdown.
				 *
				 * The Chromium build is resolved (and downloaded when missing) by
				 * `puppeteer-chromium-resolver` into a `puppeteer` folder inside the
				 * extension's global storage directory.
				 */
				export class UrlScraper {
					/** Extension context remembered from the most recent call that supplied one. */
					private static context?: vscode.ExtensionContext

					/**
					 * Ensures the Chromium download directory exists and that Chromium has been downloaded into it.
					 *
					 * @param context Extension context whose `globalStorageUri` locates the download directory.
					 *   When omitted, the context stored by an earlier call is reused instead of being discarded.
					 * @throws Error with message "Global storage uri is invalid" when no storage path is available.
					 */
					static async ensureChromiumExists(context?: vscode.ExtensionContext): Promise<void> {
						// Only replace the stored context when a new one is actually supplied; otherwise a
						// bare `ensureChromiumExists()` call would wipe it and break later scrapes.
						if (context) {
							this.context = context
						}

						const puppeteerDir = this.getPuppeteerDir()

						// With `recursive: true`, mkdir succeeds when the directory already exists,
						// so no separate existence check is needed.
						await fs.mkdir(puppeteerDir, { recursive: true })

						const chromiumPath = path.join(puppeteerDir, ".chromium-browser-snapshots")
						if (!(await fileExists(chromiumPath))) {
							// If Chromium doesn't exist, download it
							await PCR({
								downloadPath: puppeteerDir,
							})
						}
					}

					/**
					 * Opens `url` in headless Chromium, waits for the rendered HTML to settle, strips
					 * `script`, `style`, `nav` and `footer` elements and returns the remainder as Markdown.
					 *
					 * @param url Address of the page to load.
					 * @returns Markdown produced from the cleaned-up page HTML.
					 * @throws Error with message "Global storage uri is invalid" when no extension context has been stored,
					 *   and any error raised while launching the browser or navigating to the page.
					 */
					static async urlToMarkdown(url: string): Promise<string> {
						await this.ensureChromiumExists(this.context)

						// The resolver is called again because its result carries the puppeteer
						// instance and the executable path needed to launch the browser.
						const stats = await PCR({
							downloadPath: this.getPuppeteerDir(),
						})
						const browser = await stats.puppeteer.launch({
							args: [
								"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
							],
							executablePath: stats.executablePath,
						})

						try {
							const page = await browser.newPage()
							await page.goto(url, { timeout: 5_000, waitUntil: "load" })
							await this.waitTillHTMLRendered(page)
							const content = await page.content()

							// Use Cheerio to parse the HTML and drop elements that carry no article content
							const $ = cheerio.load(content)
							$("script, style, nav, footer").remove()

							// Convert cleaned HTML to Markdown
							return new TurndownService().turndown($.html())
						} finally {
							// Always release the browser process, even when navigation or conversion fails
							await browser.close()
						}
					}

					/**
					 * Resolves the directory Chromium is downloaded into, based on the stored context.
					 *
					 * @throws Error with message "Global storage uri is invalid" when the stored context
					 *   is missing or exposes no global storage path.
					 */
					private static getPuppeteerDir(): string {
						const globalStoragePath = this.context?.globalStorageUri?.fsPath
						if (!globalStoragePath) {
							throw new Error("Global storage uri is invalid")
						}
						return path.join(globalStoragePath, PUPPETEER_DIR)
					}

					// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
					// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
					/**
					 * Polls the page's serialized HTML once per second and returns as soon as its length
					 * has stayed the same for three consecutive comparisons, or once `timeout` has elapsed.
					 *
					 * @param page Page to observe.
					 * @param timeout Upper bound for polling, in milliseconds.
					 */
					private static async waitTillHTMLRendered(page: Page, timeout = 10_000): Promise<void> {
						const checkDurationMsecs = 1000
						const maxChecks = timeout / checkDurationMsecs
						const minStableSizeIterations = 3
						let lastHTMLSize = 0
						let countStableSizeIterations = 0

						for (let check = 1; check <= maxChecks; check++) {
							const currentHTMLSize = (await page.content()).length

							// A size of 0 means "no previous sample yet", so the first poll never counts as stable
							if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
								countStableSizeIterations++
							} else {
								countStableSizeIterations = 0 // reset the counter
							}

							if (countStableSizeIterations >= minStableSizeIterations) {
								return
							}

							lastHTMLSize = currentHTMLSize
							await delay(checkDurationMsecs)
						}
					}
				}
			
 
				+
			
 
				/**
				 * Reports whether a file system entry is accessible at the given location.
				 *
				 * @param filePath Path to probe (file or directory).
				 * @returns `true` when `fs.access` succeeds, `false` when it rejects for any reason.
				 */
				async function fileExists(filePath: string): Promise<boolean> {
					// The parameter is deliberately not called `path`, which would shadow the `path` module import.
					try {
						await fs.access(filePath)
						return true
					} catch {
						return false
					}
				}