extract-text.ts 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. import * as path from "path"
  2. // @ts-ignore-next-line
  3. import pdf from "pdf-parse/lib/pdf-parse"
  4. import mammoth from "mammoth"
  5. import fs from "fs/promises"
  6. import { isBinaryFile } from "isbinaryfile"
  /**
   * Reads a file and returns its text with a line-number prefix on every line.
   *
   * The lower-cased file extension selects the extractor: `.pdf`, `.docx` and
   * `.ipynb` go to their dedicated extractors; anything else is read as UTF-8
   * text unless it is detected as binary.
   *
   * @param filePath Path of the file to read
   * @returns The extracted text, numbered by `addLineNumbers`
   * @throws Error `File not found: …` when the file cannot be accessed
   * @throws Error `Cannot read text for file type: …` when a file with any other
   * extension is detected as binary
   */
  export async function extractTextFromFile(filePath: string): Promise<string> {
    // Any access failure is reported with the same "File not found" message.
    try {
      await fs.access(filePath)
    } catch {
      throw new Error(`File not found: ${filePath}`)
    }

    const fileExtension = path.extname(filePath).toLowerCase()

    if (fileExtension === ".pdf") {
      return extractTextFromPDF(filePath)
    }
    if (fileExtension === ".docx") {
      return extractTextFromDOCX(filePath)
    }
    if (fileExtension === ".ipynb") {
      return extractTextFromIPYNB(filePath)
    }

    // If binary detection itself fails, fall back to treating the file as text.
    const looksBinary = await isBinaryFile(filePath).catch(() => false)
    if (looksBinary) {
      throw new Error(`Cannot read text for file type: ${fileExtension}`)
    }
    return addLineNumbers(await fs.readFile(filePath, "utf8"))
  }
  /**
   * Reads a PDF file, extracts its text with `pdf-parse`, and numbers the lines.
   *
   * @param filePath Path of the PDF file
   * @returns The extracted text with line numbers added
   */
  async function extractTextFromPDF(filePath: string): Promise<string> {
    const parsed = await pdf(await fs.readFile(filePath))
    return addLineNumbers(parsed.text)
  }
  /**
   * Extracts the raw text of a DOCX file with `mammoth` and numbers the lines.
   *
   * @param filePath Path of the DOCX file
   * @returns The extracted text with line numbers added
   */
  async function extractTextFromDOCX(filePath: string): Promise<string> {
    const { value: rawText } = await mammoth.extractRawText({ path: filePath })
    return addLineNumbers(rawText)
  }
  /**
   * Extracts the source of every markdown and code cell of a Jupyter notebook
   * and numbers the resulting lines.
   *
   * A cell's `source` may be stored either as a list of strings or as a single
   * string; both forms are accepted. Cells of any other type, cells without a
   * source, and malformed cell entries are skipped. A notebook whose `cells`
   * field is missing or not an array yields empty text instead of failing with
   * an opaque "not iterable" error.
   *
   * @param filePath Path of the `.ipynb` file
   * @returns The concatenated cell sources with line numbers added
   * @throws SyntaxError when the file is not valid JSON
   */
  async function extractTextFromIPYNB(filePath: string): Promise<string> {
    const raw = await fs.readFile(filePath, "utf8")
    const notebook: unknown = JSON.parse(raw)

    const cells: unknown =
      typeof notebook === "object" && notebook !== null ? (notebook as { cells?: unknown }).cells : undefined
    if (!Array.isArray(cells)) {
      return addLineNumbers("")
    }

    let extractedText = ""
    for (const cell of cells) {
      if (typeof cell !== "object" || cell === null) {
        continue
      }
      const { cell_type: cellType, source } = cell as { cell_type?: unknown; source?: unknown }
      if (cellType !== "markdown" && cellType !== "code") {
        continue
      }
      if (Array.isArray(source)) {
        extractedText += source.join("\n") + "\n"
      } else if (typeof source === "string" && source !== "") {
        // Single-string form of `source`: append as-is, terminated like the list form.
        extractedText += source + "\n"
      }
    }
    return addLineNumbers(extractedText)
  }
  /**
   * Prefixes every line of `content` with its line number followed by " | ".
   *
   * Numbers are right-aligned to the width of the largest number so the pipes
   * line up. The result always ends with a newline, except for the empty-file
   * case described below.
   *
   * @param content The text to number
   * @param startLine Number assigned to the first line (defaults to 1)
   * @returns The numbered text
   */
  export function addLineNumbers(content: string, startLine: number = 1): string {
    if (content.length === 0) {
      // An empty file gets no numbering at all. Empty content at a later offset
      // still yields one numbered, empty line: the file itself is known to be
      // non-empty, only the requested slice is.
      if (startLine === 1) {
        return ""
      }
      return `${startLine} | \n`
    }

    // A trailing newline terminates the last line; it does not open a new one.
    const body = content.endsWith("\n") ? content.slice(0, -1) : content
    const lines = body.split("\n")

    const width = String(startLine + lines.length - 1).length
    let numbered = ""
    for (let i = 0; i < lines.length; i++) {
      numbered += `${String(startLine + i).padStart(width, " ")} | ${lines[i]}\n`
    }
    return numbered
  }
  /**
   * Reports whether every line of `content` starts with a line-number prefix
   * such as "1 | content" or "123 | content".
   *
   * The number may be preceded by whitespace and must be followed by whitespace
   * and a single pipe; a double pipe ("||") does not count. Lines are split on
   * LF or CRLF, so an empty string or a trailing newline produces an empty line
   * that fails the check.
   *
   * @param content The text to inspect
   * @returns true only when every line carries a line-number prefix
   */
  export function everyLineHasLineNumbers(content: string): boolean {
    const numberedLine = /^\s*\d+\s+\|(?!\|)/
    for (const line of content.split(/\r?\n/)) {
      if (!numberedLine.test(line)) {
        return false
      }
    }
    return true
  }
  /**
   * Removes line-number prefixes from each line, leaving the rest of the line intact.
   * Lines that do not match the prefix pattern are returned unchanged.
   *
   * @param content The content to process
   * @param aggressive When false (default): only strips clear number patterns like "123 | content".
   * When true: uses a more lenient pattern that also matches lines starting with just a pipe,
   * which is useful when LLMs don't perfectly format the line numbers in diffs
   * @returns The content with line numbers removed. Lines are re-joined with CRLF if the
   * input contains any CRLF, otherwise with LF
   */
  export function stripLineNumbers(content: string, aggressive: boolean = false): string {
    // The capture group holds everything after the pipe (and one optional/required space).
    const prefixPattern = aggressive ? /^\s*(?:\d+\s)?\|\s(.*)$/ : /^\s*\d+\s+\|(?!\|)\s?(.*)$/
    const eol = content.includes("\r\n") ? "\r\n" : "\n"

    const stripped: string[] = []
    for (const line of content.split(/\r?\n/)) {
      const hit = prefixPattern.exec(line)
      stripped.push(hit === null ? line : hit[1])
    }
    return stripped.join(eol)
  }
  /**
   * Shortens multi-line output to at most `lineLimit` lines of original content,
   * keeping context from both ends: 20% of the budget (rounded down) comes from
   * the start and the remainder from the end, separated by a marker stating how
   * many lines were dropped.
   *
   * @param content The multi-line string to truncate
   * @param lineLimit Optional maximum number of lines to keep. If not provided or 0, returns the original content
   * @returns The truncated string with an omission marker, or the original content if it already fits
   *
   * @example
   * // With 10 line limit on 25 lines of content:
   * // - Keeps first 2 lines (20% of 10)
   * // - Keeps last 8 lines (80% of 10)
   * // - Adds "[...15 lines omitted...]" in between
   */
  export function truncateOutput(content: string, lineLimit?: number): string {
    if (!lineLimit) {
      return content
    }

    // Lines = newlines + 1; the text after the final newline counts as a line too.
    let newlineCount = 0
    for (let at = content.indexOf("\n"); at !== -1; at = content.indexOf("\n", at + 1)) {
      newlineCount++
    }
    const totalLines = newlineCount + 1
    if (totalLines <= lineLimit) {
      return content
    }

    const headLines = Math.floor(lineLimit * 0.2)
    const tailLines = lineLimit - headLines

    // Walk forward to the newline that closes the last kept head line.
    let headEnd = -1
    let cursor = 0
    for (let taken = 0; taken < headLines; taken++) {
      const hit = content.indexOf("\n", cursor)
      if (hit === -1) {
        break
      }
      headEnd = hit
      cursor = hit + 1
    }

    // Walk backward to the first character of the earliest kept tail line.
    let tailStart = content.length
    let scanFrom = content.length
    for (let taken = 0; taken < tailLines; taken++) {
      const hit = content.lastIndexOf("\n", scanFrom - 1)
      if (hit === -1) {
        break
      }
      tailStart = hit + 1
      scanFrom = hit
    }

    const omitted = totalLines - lineLimit
    const head = content.slice(0, headEnd + 1)
    const tail = content.slice(tailStart)
    return `${head}\n[...${omitted} lines omitted...]\n\n${tail}`
  }
  /**
   * Applies run-length encoding to compress consecutive repeated lines in text.
   *
   * A run of identical lines is replaced by the line once, followed by
   * `<previous line repeated N additional times>`, but only when that marker is
   * shorter than the N repeats it stands in for. The same rule applies to every
   * run, wherever it sits in the content, so the result is never longer than
   * the input.
   *
   * Lines are compared including their trailing newline, so a final line
   * without a newline never merges with the line before it.
   *
   * @param content The text content to compress
   * @returns The compressed text with run-length encoding applied
   */
  export function applyRunLengthEncoding(content: string): string {
    if (!content) {
      return content
    }

    let result = ""
    let pos = 0
    let repeatCount = 0
    let prevLine: string | null = null

    while (pos < content.length) {
      const nextNewlineIdx = content.indexOf("\n", pos)
      const currentLine = nextNewlineIdx === -1 ? content.slice(pos) : content.slice(pos, nextNewlineIdx + 1)

      if (currentLine === prevLine) {
        repeatCount++
      } else {
        // A different line closes the current run (if any) and starts a new one.
        if (prevLine !== null) {
          result += encodeRepeatedLineRun(prevLine, repeatCount)
        }
        prevLine = currentLine
        repeatCount = 0
      }

      pos = nextNewlineIdx === -1 ? content.length : nextNewlineIdx + 1
    }

    // Flush the run that was still open when the content ended.
    if (prevLine !== null) {
      result += encodeRepeatedLineRun(prevLine, repeatCount)
    }
    return result
  }

  /** Renders one run: the line plus a repeat marker when that is shorter, otherwise every copy verbatim. */
  function encodeRepeatedLineRun(line: string, additionalRepeats: number): string {
    if (additionalRepeats === 0) {
      return line
    }
    const compressionDesc = `<previous line repeated ${additionalRepeats} additional times>\n`
    // The marker replaces only the additional copies, so it must beat their combined length.
    if (compressionDesc.length < line.length * additionalRepeats) {
      return line + compressionDesc
    }
    return line.repeat(additionalRepeats + 1)
  }