#!/usr/bin/env bun
/**
 * Extracts user-facing URLs from the VS Code extension and CLI source code,
 * then writes them to a markdown file that the docs link-checker validates.
 *
 * Usage:
 *   bun run script/extract-source-links.ts          # Generate / update the committed file
 *   bun run script/extract-source-links.ts --check  # CI mode — exit 1 if the file is stale
 */
import { Glob } from "bun"
import path from "path"

// Repository root: this script lives one directory below it (script/).
const ROOT = path.resolve(import.meta.dir, "..")
// The committed, generated markdown file that the docs link-checker reads.
const OUTPUT = path.join(ROOT, "packages/kilo-docs/source-links.md")
// --check flag: verify the committed file is current instead of rewriting it.
const check = process.argv.includes("--check")

// Source trees that are scanned for URLs.
const DIRS = [
  path.join(ROOT, "packages/kilo-vscode/src"),
  path.join(ROOT, "packages/kilo-vscode/webview-ui"),
  path.join(ROOT, "packages/opencode/src"),
]

// File extensions included in the scan.
const EXTENSIONS = ["ts", "tsx", "js", "jsx"]

// Matches http:// and https:// URLs in string literals or comments
const URL_RE = /https?:\/\/[^\s"'`)\]},;*\\<>]+/g

// URLs to exclude — only genuinely non-checkable URLs (API endpoints, localhost,
// examples, dynamic templates, namespaces). Real external URLs should be extracted
// and validated by lychee; add lychee.toml exclusions for sites that block bots.
const EXCLUDE_PATTERNS = [ // Localhost and internal /^https?:\/\/(localhost|127\.0\.0\.1|0\.0\.0\.0)/, /^https?:\/\/kilo\.internal/, /^https?:\/\/dev\.kilo\.ai/, /^https?:\/\/tauri\.localhost/, // Example/placeholder URLs /^https?:\/\/example\.com/, /^https?:\/\/api\.myprovider\.com/, /^https?:\/\/synthetic\.new/, // API endpoints (not user-facing) /^https?:\/\/api\.kilo\.ai\/api\//, /^https?:\/\/ingest\.kilosessions\.ai/, /^https?:\/\/api\.openai\.com/, /^https?:\/\/api\.github\.com/, /^https?:\/\/api\.cloudflare\.com/, /^https?:\/\/api\.releases\.hashicorp\.com/, /^https?:\/\/auth\.openai\.com/, /^https?:\/\/chatgpt\.com\/backend-api/, /^https?:\/\/mcp\.exa\.ai/, /^https?:\/\/registry\.npmjs\.org/, /^https?:\/\/formulae\.brew\.sh\/api/, /^https?:\/\/community\.chocolatey\.org\/api/, /^https?:\/\/download-cdn\.jetbrains\.com/, /^https?:\/\/raw\.githubusercontent\.com/, // XML/SVG namespace URIs /^https?:\/\/www\.w3\.org\//, // URLs that are templates with interpolation (contain ${ after stripping) /\$\{/, // Truncated/placeholder URLs (e.g., https://…) or bare protocols /^https?:\/\/[\W]*$/, // GHE example domains /^https?:\/\/company\.ghe\.com/, // Example/placeholder GitHub URLs used in docs/comments /^https?:\/\/github\.com\/owner\//, /^https?:\/\/github\.com\/\.extraheader/, /^https?:\/\/github\.com\/user-attachments\/assets\/xxxx/, /^https?:\/\/github\.com\/user-attachments\/files\/\d+\/api\.json/, // Example/template session URLs with placeholders /\/s\/abc123$/, // Truncated URL paths (e.g., /s/ with no ID) /\/s\/$/, ] // Directories to skip entirely const SKIP_DIRS = ["node_modules", ".storybook", "stories", "test", "tests", "__tests__", "__mocks__"] // Subdirectories containing vendored/third-party code const SKIP_PATH_SEGMENTS = ["continuedev"] // Individual files to skip (data files full of non-user-facing URLs) const SKIP_FILES = ["models-snapshot.ts", "models-snapshot.js"] function shouldExclude(url: string): boolean { return 
EXCLUDE_PATTERNS.some((re) => re.test(url)) } function shouldSkipFile(filepath: string): boolean { const rel = path.relative(ROOT, filepath) const parts = rel.split(path.sep) if (parts.some((p) => SKIP_DIRS.includes(p))) return true if (SKIP_PATH_SEGMENTS.some((seg) => rel.includes(seg))) return true if (/\.test\.[jt]sx?$/.test(filepath)) return true if (/\.spec\.[jt]sx?$/.test(filepath)) return true if (/\.stories\.[jt]sx?$/.test(filepath)) return true if (/\/i18n\//.test(filepath) && !filepath.endsWith("en.ts")) return true const basename = path.basename(filepath) if (SKIP_FILES.includes(basename)) return true return false } function clean(url: string): string { return url.replace(/[.),:;]+$/, "").replace(/<\/?\w+>$/, "") } async function extract(): Promise>> { const links = new Map>() for (const dir of DIRS) { for (const ext of EXTENSIONS) { const glob = new Glob(`**/*.${ext}`) for await (const entry of glob.scan({ cwd: dir, absolute: true })) { if (shouldSkipFile(entry)) continue const content = await Bun.file(entry).text() for (const line of content.split("\n")) { for (const match of line.matchAll(URL_RE)) { const url = clean(match[0]) if (shouldExclude(url)) continue if (!links.has(url)) links.set(url, new Set()) links.get(url)!.add(path.relative(ROOT, entry)) } } } } } return links } function render(sorted: [string, Set][]): string { const parts = [ "# Source Code Links", "", "", ``, "", ] for (const [url, files] of sorted) { parts.push(`- <${url}>`) for (const file of [...files].sort()) { parts.push(` `) } } parts.push("") return parts.join("\n") } const links = await extract() const sorted = [...links.entries()].sort(([a], [b]) => a.localeCompare(b)) const output = render(sorted) if (check) { const committed = await Bun.file(OUTPUT) .text() .catch(() => "") if (committed === output) { console.log("packages/kilo-docs/source-links.md is up to date.") process.exit(0) } console.error( [ "ERROR: packages/kilo-docs/source-links.md is out of date.", "", "Run the 
following command locally and commit the result:", "", " bun run script/extract-source-links.ts", "", ].join("\n"), ) process.exit(1) } await Bun.write(OUTPUT, output) console.log(`Wrote ${sorted.length} unique URLs to packages/kilo-docs/source-links.md`)