Browse Source

Wait for domcontentloaded and networkidle2 to determine when page is loaded

Saoud Rizwan 1 year ago
parent
commit
eec51ad270
1 changed files with 11 additions and 3 deletions
  1. 11 3
      src/utils/UrlScraper.ts

+ 11 - 3
src/utils/UrlScraper.ts

@@ -60,13 +60,19 @@ export class UrlScraper {
 
 		try {
 			const page = await browser.newPage()
-			await page.goto(url, { timeout: 5_000, waitUntil: "load" })
-			await this.waitTillHTMLRendered(page)
+
+			/*
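+			- networkidle2 resolves once there have been no more than 2 open network connections for at least 500 ms. (It is similar to, but more lenient than, playwright's networkidle, which waits for zero connections and so corresponds to puppeteer's networkidle0.)
+			- domcontentloaded fires once the initial HTML has been parsed and the basic DOM is built, without waiting for stylesheets, images, or subframes.
+			Waiting for both should be sufficient for most doc sites; if we find users are scraping more dynamic, complex sites, we can bring back the more elaborate waitTillHTMLRendered.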
+			*/
+			await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
+			// await this.waitTillHTMLRendered(page)
 			const content = await page.content()
 
 			// Use Cheerio to parse and clean up the HTML
 			const $ = cheerio.load(content)
-			$("script, style, nav, footer").remove() // Remove unnecessary elements
+			$("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)
 
 			// Convert cleaned HTML to Markdown
 			const turndownService = new TurndownService()
@@ -80,6 +86,7 @@ export class UrlScraper {
 
 	// page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
 	// https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
+	/*
 	private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
 		const checkDurationMsecs = 500 // 1000
 		const maxChecks = timeout / checkDurationMsecs
@@ -110,6 +117,7 @@ export class UrlScraper {
 			await delay(checkDurationMsecs)
 		}
 	}
+	*/
 }
 
 async function fileExists(path: string): Promise<boolean> {