ソースを参照

perf(opencode): batch snapshot diffFull blob reads (#20752)

Co-authored-by: Nate Williams <[email protected]>
Kit Langton 2 週間 前
コミット
288eb044cb

+ 178 - 23
packages/opencode/src/snapshot/index.ts

@@ -437,6 +437,146 @@ export namespace Snapshot {
           const diffFull = Effect.fnUntraced(function* (from: string, to: string) {
             return yield* locked(
               Effect.gen(function* () {
+                type Row = { // one changed file parsed from a numstat line
+                  file: string // third tab-separated field of the numstat line
+                  status: "added" | "deleted" | "modified" // "modified" when the status map has no entry
+                  binary: boolean // true when numstat reports "-" for both counts
+                  additions: number
+                  deletions: number
+                }
+
+                type Ref = { // one blob requested from `git cat-file --batch`
+                  file: string
+                  side: "before" | "after" // "before" is read from `from`, "after" from `to`
+                  ref: string // object name in `<revision>:<path>` form
+                }
+
+                // Per-file fallback: reads the before/after text of one row with
+                // `git show`. Binary rows yield two empty strings, an added row only
+                // reads from `to`, and a deleted row only reads from `from`.
+                const show = Effect.fnUntraced(function* (row: Row) {
+                  // Text of this row's file at the given revision.
+                  const read = (rev: string) =>
+                    git([...cfg, ...args(["show", `${rev}:${row.file}`])]).pipe(Effect.map((item) => item.text))
+
+                  if (row.binary) return ["", ""]
+                  switch (row.status) {
+                    case "added":
+                      return ["", yield* read(to)]
+                    case "deleted":
+                      return [yield* read(from), ""]
+                    default:
+                      // modified: both sides are read concurrently
+                      return yield* Effect.all([read(from), read(to)], { concurrency: 2 })
+                  }
+                })
+
+                // Reads the text of every non-binary row in `rows` through a single
+                // `git cat-file --batch` process instead of one `git show` per blob.
+                //
+                // Resolves to a map from file path to its `before`/`after` text. A side
+                // that was not requested, or that git reports as missing, stays "".
+                // When no row needs a blob the map is empty. Resolves to `undefined`
+                // when git exits non-zero, the output cannot be parsed, or the effect
+                // fails, so the caller can fall back to per-file reads.
+                const load = Effect.fnUntraced(
+                  function* (rows: Row[]) {
+                    // Added rows only need the `to` side, deleted rows only the `from`
+                    // side, modified rows need both, and binary rows need nothing.
+                    const refs = rows.flatMap((row) => {
+                      if (row.binary) return []
+                      if (row.status === "added")
+                        return [{ file: row.file, side: "after", ref: `${to}:${row.file}` } satisfies Ref]
+                      if (row.status === "deleted") {
+                        return [{ file: row.file, side: "before", ref: `${from}:${row.file}` } satisfies Ref]
+                      }
+                      return [
+                        { file: row.file, side: "before", ref: `${from}:${row.file}` } satisfies Ref,
+                        { file: row.file, side: "after", ref: `${to}:${row.file}` } satisfies Ref,
+                      ]
+                    })
+                    if (!refs.length) return new Map<string, { before: string; after: string }>()
+
+                    // Declared before the first failure path so every fallback logs the
+                    // same way and yields `undefined`.
+                    const fail = (msg: string, extra?: Record<string, string>) => {
+                      log.info(msg, { ...extra, refs: refs.length })
+                      return undefined
+                    }
+
+                    // One object name per line on stdin.
+                    const proc = ChildProcess.make("git", [...cfg, ...args(["cat-file", "--batch"])], {
+                      cwd: state.directory,
+                      extendEnv: true,
+                      stdin: Stream.make(new TextEncoder().encode(refs.map((item) => item.ref).join("\n") + "\n")),
+                    })
+                    const handle = yield* spawner.spawn(proc)
+                    // stdout stays as raw bytes because the parser below works with byte offsets.
+                    const [out, err] = yield* Effect.all(
+                      [Stream.mkUint8Array(handle.stdout), Stream.mkString(Stream.decodeText(handle.stderr))],
+                      { concurrency: 2 },
+                    )
+                    const code = yield* handle.exitCode
+                    if (code !== 0) {
+                      return fail("git cat-file --batch failed during snapshot diff, falling back to per-file git show", {
+                        stderr: err,
+                      })
+                    }
+
+                    const map = new Map<string, { before: string; after: string }>()
+                    const dec = new TextDecoder()
+                    let i = 0
+                    // Parse the default `git cat-file --batch` stream: one header line,
+                    // then exactly `size` bytes of blob content, then a trailing newline.
+                    // The loop pairs the n-th response with the n-th ref.
+                    for (const ref of refs) {
+                      // 10 is "\n"; -1 means the header line never terminates.
+                      const end = out.indexOf(10, i)
+                      if (end === -1) {
+                        return fail(
+                          "git cat-file --batch returned a truncated header during snapshot diff, falling back to per-file git show",
+                        )
+                      }
+
+                      // `subarray` is a view over `out`, so the bytes are not copied before decoding.
+                      const head = dec.decode(out.subarray(i, end))
+                      i = end + 1
+                      const hit = map.get(ref.file) ?? { before: "", after: "" }
+                      if (head.endsWith(" missing")) {
+                        // A missing object has a header only; keep this side empty.
+                        map.set(ref.file, hit)
+                        continue
+                      }
+
+                      const match = head.match(/^[0-9a-f]+ blob (\d+)$/)
+                      if (!match) {
+                        return fail(
+                          "git cat-file --batch returned an unexpected header during snapshot diff, falling back to per-file git show",
+                          { head },
+                        )
+                      }
+
+                      // The content must fit in the buffer and be followed by a newline.
+                      const size = Number(match[1])
+                      if (!Number.isInteger(size) || size < 0 || i + size >= out.length || out[i + size] !== 10) {
+                        return fail(
+                          "git cat-file --batch returned truncated content during snapshot diff, falling back to per-file git show",
+                          { head },
+                        )
+                      }
+
+                      const text = dec.decode(out.subarray(i, i + size))
+                      if (ref.side === "before") hit.before = text
+                      if (ref.side === "after") hit.after = text
+                      map.set(ref.file, hit)
+                      i += size + 1
+                    }
+
+                    // Leftover bytes mean the responses did not line up with `refs`.
+                    if (i !== out.length) {
+                      return fail(
+                        "git cat-file --batch returned trailing data during snapshot diff, falling back to per-file git show",
+                      )
+                    }
+
+                    return map
+                  },
+                  Effect.scoped,
+                  Effect.catch(() =>
+                    Effect.succeed<Map<string, { before: string; after: string }> | undefined>(undefined),
+                  ),
+                )
+
                 const result: Snapshot.FileDiff[] = []
                 const status = new Map<string, "added" | "deleted" | "modified">()
 
@@ -459,30 +599,45 @@ export namespace Snapshot {
                   },
                 )
 
-                for (const line of numstat.text.trim().split("\n")) {
-                  if (!line) continue
-                  const [adds, dels, file] = line.split("\t")
-                  if (!file) continue
-                  const binary = adds === "-" && dels === "-"
-                  const [before, after] = binary
-                    ? ["", ""]
-                    : yield* Effect.all(
-                        [
-                          git([...cfg, ...args(["show", `${from}:${file}`])]).pipe(Effect.map((item) => item.text)),
-                          git([...cfg, ...args(["show", `${to}:${file}`])]).pipe(Effect.map((item) => item.text)),
-                        ],
-                        { concurrency: 2 },
-                      )
-                  const additions = binary ? 0 : parseInt(adds)
-                  const deletions = binary ? 0 : parseInt(dels)
-                  result.push({
-                    file,
-                    before,
-                    after,
-                    additions: Number.isFinite(additions) ? additions : 0,
-                    deletions: Number.isFinite(deletions) ? deletions : 0,
-                    status: status.get(file) ?? "modified",
+                const rows = numstat.text // one Row per non-empty numstat line
+                  .trim()
+                  .split("\n")
+                  .filter(Boolean)
+                  .flatMap((line) => {
+                    const [adds, dels, file] = line.split("\t")
+                    if (!file) return [] // skip lines without a third tab-separated field
+                    const binary = adds === "-" && dels === "-"
+                    const additions = binary ? 0 : parseInt(adds)
+                    const deletions = binary ? 0 : parseInt(dels)
+                    return [
+                      {
+                        file,
+                        status: status.get(file) ?? "modified", // default when the status map has no entry
+                        binary,
+                        additions: Number.isFinite(additions) ? additions : 0, // NaN from parseInt becomes 0
+                        deletions: Number.isFinite(deletions) ? deletions : 0,
+                      } satisfies Row,
+                    ]
                   })
+                const step = 100 // rows passed to each `load` call
+
+                // Keep batches bounded so a large diff does not buffer every blob at once.
+                for (let i = 0; i < rows.length; i += step) {
+                  const run = rows.slice(i, i + step)
+                  const text = yield* load(run) // undefined when the batched read failed
+
+                  for (const row of run) {
+                    const hit = text?.get(row.file) ?? { before: "", after: "" } // binary rows have no map entry
+                    const [before, after] = row.binary ? ["", ""] : text ? [hit.before, hit.after] : yield* show(row)
+                    result.push({
+                      file: row.file,
+                      before,
+                      after,
+                      additions: row.additions,
+                      deletions: row.deletions,
+                      status: row.status,
+                    })
+                  }
                 }
 
                 return result

+ 92 - 0
packages/opencode/test/snapshot/snapshot.test.ts

@@ -982,6 +982,98 @@ test("diffFull with new file additions", async () => {
   })
 })
 
+test("diffFull with a large interleaved mixed diff", async () => {
+  await using tmp = await bootstrap()
+  await Instance.provide({
+    directory: tmp.path,
+    fn: async () => {
+      // 60 ids, each with a modified, a deleted, an added and a binary file.
+      const ids = Array.from({ length: 60 }, (_, n) => String(n).padStart(3, "0"))
+      const abs = (name: string) => fwd(tmp.path, "mix", name)
+      const rel = (name: string) => fwd("mix", name)
+
+      // Initial state: the files that will be modified or deleted, plus the binaries.
+      await $`mkdir -p ${tmp.path}/mix`.quiet()
+      await Promise.all(
+        ids.flatMap((id, n) => [
+          Filesystem.write(abs(`${id}-mod.txt`), `before-${id}-é\n🙂\nline`),
+          Filesystem.write(abs(`${id}-del.txt`), `gone-${id}\n你好`),
+          Filesystem.write(abs(`${id}-bin.bin`), new Uint8Array([0, n, 255, n % 251])),
+        ]),
+      )
+
+      const before = await Snapshot.track()
+      expect(before).toBeTruthy()
+
+      // Second state: rewrite text and binary files, create new ones, remove the rest.
+      await Promise.all(
+        ids.flatMap((id, n) => [
+          Filesystem.write(abs(`${id}-mod.txt`), `after-${id}-é\n🚀\nline`),
+          Filesystem.write(abs(`${id}-add.txt`), `new-${id}\nこんにちは`),
+          Filesystem.write(abs(`${id}-bin.bin`), new Uint8Array([9, n, 8, n % 251])),
+          fs.rm(abs(`${id}-del.txt`)),
+        ]),
+      )
+
+      const after = await Snapshot.track()
+      expect(after).toBeTruthy()
+
+      const diffs = await Snapshot.diffFull(before!, after!)
+      expect(diffs).toHaveLength(ids.length * 4)
+
+      const byFile = new Map(diffs.map((item) => [item.file, item]))
+      // Looks up one diff entry and checks its text and status.
+      const check = (name: string, want: { before: string; after: string; status: string }) => {
+        const got = byFile.get(rel(name))
+        expect(got).toBeDefined()
+        expect(got!.before).toBe(want.before)
+        expect(got!.after).toBe(want.after)
+        expect(got!.status).toBe(want.status)
+        return got!
+      }
+
+      for (const id of ids) {
+        check(`${id}-mod.txt`, {
+          before: `before-${id}-é\n🙂\nline`,
+          after: `after-${id}-é\n🚀\nline`,
+          status: "modified",
+        })
+        check(`${id}-del.txt`, { before: `gone-${id}\n你好`, after: "", status: "deleted" })
+        check(`${id}-add.txt`, { before: "", after: `new-${id}\nこんにちは`, status: "added" })
+
+        // Binary files carry no text and no line counts.
+        const bin = check(`${id}-bin.bin`, { before: "", after: "", status: "modified" })
+        expect(bin.additions).toBe(0)
+        expect(bin.deletions).toBe(0)
+      }
+    },
+  })
+})
+
+test("diffFull preserves git diff order across batch boundaries", async () => {
+  await using tmp = await bootstrap()
+  await Instance.provide({
+    directory: tmp.path,
+    fn: async () => {
+      // 140 zero-padded ids; every file is rewritten between the two snapshots.
+      const ids = Array.from({ length: 140 }, (_, n) => String(n).padStart(3, "0"))
+      // Writes `<label>-<id>` into every file under order/.
+      const fill = (label: string) =>
+        Promise.all(ids.map((id) => Filesystem.write(`${tmp.path}/order/${id}.txt`, `${label}-${id}`)))
+
+      await $`mkdir -p ${tmp.path}/order`.quiet()
+      await fill("before")
+
+      const before = await Snapshot.track()
+      expect(before).toBeTruthy()
+
+      await fill("after")
+
+      const after = await Snapshot.track()
+      expect(after).toBeTruthy()
+
+      // The reported files must come back in ascending id order.
+      const files = (await Snapshot.diffFull(before!, after!)).map((item) => item.file)
+      expect(files).toEqual(ids.map((id) => `order/${id}.txt`))
+    },
+  })
+})
+
 test("diffFull with file modifications", async () => {
   await using tmp = await bootstrap()
   await Instance.provide({