|
|
@@ -7,6 +7,7 @@ import TurndownService from "turndown"
|
|
|
// @ts-ignore
|
|
|
import PCR from "puppeteer-chromium-resolver"
|
|
|
import pWaitFor from "p-wait-for"
|
|
|
+import delay from "delay"
|
|
|
|
|
|
interface PCRStats {
|
|
|
puppeteer: { launch: typeof launch }
|
|
|
@@ -114,7 +115,9 @@ export class UrlContentFetcher {
|
|
|
|
|
|
try {
|
|
|
// networkidle2 isn't good enough since page may take some time to load. we can assume locally running dev sites will reach networkidle0 in a reasonable amount of time
|
|
|
- await this.page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle0"] })
|
|
|
+ await this.page.goto(url, { timeout: 7_000, waitUntil: ["domcontentloaded", "networkidle2"] })
|
|
|
+ // await this.page.goto(url, { timeout: 10_000, waitUntil: "load" })
|
|
|
+ await this.waitTillHTMLStable(this.page) // in case the page is loading more resources
|
|
|
} catch (err) {
|
|
|
if (!(err instanceof TimeoutError)) {
|
|
|
logs.push(`[Navigation Error] ${err.toString()}`)
|
|
|
@@ -143,4 +146,37 @@ export class UrlContentFetcher {
|
|
|
logs: logs.join("\n"),
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ // page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
|
|
|
+ // https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
|
|
|
+ private async waitTillHTMLStable(page: Page, timeout = 5_000) {
|
|
|
+ const checkDurationMsecs = 500 // 1000
|
|
|
+ const maxChecks = timeout / checkDurationMsecs
|
|
|
+ let lastHTMLSize = 0
|
|
|
+ let checkCounts = 1
|
|
|
+ let countStableSizeIterations = 0
|
|
|
+ const minStableSizeIterations = 3
|
|
|
+
|
|
|
+ while (checkCounts++ <= maxChecks) {
|
|
|
+ let html = await page.content()
|
|
|
+ let currentHTMLSize = html.length
|
|
|
+
|
|
|
+ // let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
|
|
|
+ console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)
|
|
|
+
|
|
|
+ if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
|
|
|
+ countStableSizeIterations++
|
|
|
+ } else {
|
|
|
+ countStableSizeIterations = 0 //reset the counter
|
|
|
+ }
|
|
|
+
|
|
|
+ if (countStableSizeIterations >= minStableSizeIterations) {
|
|
|
+ console.log("Page rendered fully...")
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ lastHTMLSize = currentHTMLSize
|
|
|
+ await delay(checkDurationMsecs)
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|