@@ -60,13 +60,20 @@ export class UrlScraper {

try {
const page = await browser.newPage()
- await page.goto(url, { timeout: 5_000, waitUntil: "load" })
- await this.waitTillHTMLRendered(page)
+
+ /*
+ - networkidle2 waits until there are no more than 2 network connections for at least 500 ms (puppeteer's closest analogue to playwright's networkidle, which waits for zero connections)
+ - domcontentloaded fires once the initial HTML document has been parsed, without waiting for stylesheets, images, or subframes
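+ - passing both in an array means puppeteer treats navigation as finished only after every listed event has fired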
+ this should be sufficient for most doc sites; we can fall back to the more elaborate waitTillHTMLRendered if we find users scraping more dynamic, complex sites
+ */
+ await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+ // await this.waitTillHTMLRendered(page)
const content = await page.content()

// Use Cheerio to parse and clean up the HTML
const $ = cheerio.load(content)
- $("script, style, nav, footer").remove() // Remove unnecessary elements
+ $("script, style, nav, footer").remove() // Remove unnecessary elements (TODO: make this more robust)

// Convert cleaned HTML to Markdown
const turndownService = new TurndownService()
@@ -80,6 +87,7 @@ export class UrlScraper {

// page.goto({ waitUntil: "networkidle0" }) may never resolve, and not waiting at all can return page content too early, before the JS has loaded
// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+ /*
private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
const checkDurationMsecs = 500 // 1000
const maxChecks = timeout / checkDurationMsecs
@@ -110,6 +118,7 @@ export class UrlScraper {
await delay(checkDurationMsecs)
}
}
+ */
}

async function fileExists(path: string): Promise<boolean> {
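
Note for reviewers: the diff context above elides the body of waitTillHTMLRendered. The linked Stack Overflow approach polls the rendered HTML's size until it stays unchanged for several consecutive checks. Here is a minimal sketch of that polling technique for reference; the name waitForStableHTML, the delay helper, and the thresholds are illustrative, not the exact elided implementation:

```ts
import type { Page } from "puppeteer"

const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))

// Treat the page as rendered once its HTML size has been non-zero and
// unchanged for a few consecutive checks, then stop polling.
async function waitForStableHTML(page: Page, timeout = 10_000) {
  const checkDurationMsecs = 500
  const maxChecks = timeout / checkDurationMsecs
  const minStableChecks = 3

  let lastSize = 0
  let stableChecks = 0

  for (let i = 0; i < maxChecks; i++) {
    const html = await page.content()
    stableChecks = html.length > 0 && html.length === lastSize ? stableChecks + 1 : 0
    if (stableChecks >= minStableChecks) return // size stable long enough
    lastSize = html.length
    await delay(checkDurationMsecs)
  }
  // Timed out without stabilizing: fall through so the caller can scrape best-effort content.
}
```

Since the call and the method are only commented out rather than deleted, re-enabling this path for heavily dynamic sites is just a matter of uncommenting both.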