// UrlScraper.ts
  1. import * as vscode from "vscode"
  2. import * as fs from "fs/promises"
  3. import * as path from "path"
  4. import { Page } from "puppeteer-core"
  5. import * as cheerio from "cheerio"
  6. import TurndownService from "turndown"
  7. import delay from "delay"
  8. // @ts-ignore
  9. import PCR from "puppeteer-chromium-resolver"
  /**
   * Name of the folder, created under the extension's global storage path,
   * that is handed to the Chromium resolver as its download location.
   */
  const PUPPETEER_DIR = "puppeteer"
  /**
   * Loads a web page in a headless Chromium and converts its HTML to Markdown.
   *
   * Chromium is resolved through puppeteer-chromium-resolver, using a
   * `PUPPETEER_DIR` folder under the extension's global storage path as the
   * download location.
   */
  export class UrlScraper {
    private readonly context: vscode.ExtensionContext

    /**
     * @param context Extension context; its `globalStorageUri` decides where
     *   Chromium is stored.
     */
    constructor(context: vscode.ExtensionContext) {
      this.context = context
    }

    /**
     * Makes sure the puppeteer download directory exists and resolves Chromium
     * inside it.
     *
     * The resolver is invoked exactly once and its result is handed back to the
     * caller, so the storage path is validated in a single place and callers do
     * not need to resolve Chromium a second time. The existing code already
     * relied on the resolver both downloading a missing Chromium and returning
     * an already-downloaded one, so no separate "is it downloaded?" probe is
     * needed here.
     *
     * The return type is intentionally left inferred: the resolver is imported
     * under `@ts-ignore`, so its result type is whatever that import provides.
     *
     * @returns The resolver result; this class reads `puppeteer` and
     *   `executablePath` from it.
     * @throws Error when the extension context exposes no global storage path.
     */
    private async ensureChromiumExists() {
      const globalStoragePath = this.context?.globalStorageUri?.fsPath
      if (!globalStoragePath) {
        throw new Error("Global storage uri is invalid")
      }

      const puppeteerDir = path.join(globalStoragePath, PUPPETEER_DIR)
      if (!(await fileExists(puppeteerDir))) {
        await fs.mkdir(puppeteerDir, { recursive: true })
      }

      // Downloads Chromium into puppeteerDir when it is missing, otherwise
      // resolves the copy that is already there.
      return await PCR({
        downloadPath: puppeteerDir,
      })
    }

    /**
     * Opens `url` in a freshly launched browser, strips non-content elements
     * and returns the remaining HTML converted to Markdown.
     *
     * The browser is always closed, including when navigation or conversion
     * fails.
     *
     * @param url Address of the page to scrape.
     * @returns Markdown rendering of the cleaned page.
     * @throws Error when global storage is unavailable; errors raised while
     *   launching the browser or navigating (10 s navigation timeout) are
     *   propagated to the caller.
     */
    async urlToMarkdown(url: string): Promise<string> {
      const stats = await this.ensureChromiumExists()
      const browser = await stats.puppeteer.launch({
        args: [
          "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
        ],
        executablePath: stats.executablePath,
      })

      try {
        const page = await browser.newPage()

        /*
        - networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
        - domcontentloaded is when the basic DOM is loaded
        this should be sufficient for most doc sites, but we can use the more elaborate waitTillHTMLRendered if we find users are scraping more dynamic complex sites
        */
        await page.goto(url, { timeout: 10_000, waitUntil: ["domcontentloaded", "networkidle2"] })
        // await this.waitTillHTMLRendered(page)
        const content = await page.content()

        // Use Cheerio to parse and clean up the HTML
        const $ = cheerio.load(content)
        $("script, style, nav, footer").remove() // Remove unnecessary elements (todo: make this more robust)

        // Convert cleaned HTML to Markdown
        const turndownService = new TurndownService()
        return turndownService.turndown($.html())
      } finally {
        await browser.close()
      }
    }

    // page.goto { waitUntil: "networkidle0" } may not ever resolve, and not waiting could return page content too early before js has loaded
    // https://stackoverflow.com/questions/52497252/puppeteer-wait-until-page-is-completely-loaded/61304202#61304202
    /*
    private async waitTillHTMLRendered(page: Page, timeout = 10_000) {
      const checkDurationMsecs = 500 // 1000
      const maxChecks = timeout / checkDurationMsecs
      let lastHTMLSize = 0
      let checkCounts = 1
      let countStableSizeIterations = 0
      const minStableSizeIterations = 3

      while (checkCounts++ <= maxChecks) {
        let html = await page.content()
        let currentHTMLSize = html.length

        // let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length)
        console.log("last: ", lastHTMLSize, " <> curr: ", currentHTMLSize)

        if (lastHTMLSize !== 0 && currentHTMLSize === lastHTMLSize) {
          countStableSizeIterations++
        } else {
          countStableSizeIterations = 0 //reset the counter
        }

        if (countStableSizeIterations >= minStableSizeIterations) {
          console.log("Page rendered fully...")
          break
        }

        lastHTMLSize = currentHTMLSize
        await delay(checkDurationMsecs)
      }
    }
    */
  }
  /**
   * Reports whether `path` can be accessed on disk.
   *
   * Any rejection from the access check (missing entry, permission problem, ...)
   * is mapped to `false`; this function itself never rejects.
   *
   * @param path File or directory path to probe.
   * @returns `true` when the access check succeeds, `false` otherwise.
   */
  async function fileExists(path: string): Promise<boolean> {
    return fs.access(path).then(
      () => true,
      () => false,
    )
  }