Browse Source

Better string normalization for diffs (#2659)

Matt Rubens 9 months ago
parent
commit
51bcade4c5

+ 21 - 0
src/core/diff/strategies/__tests__/multi-search-replace.test.ts

@@ -1711,6 +1711,27 @@ function sum(a, b) {
 			}
 		})
 
+		it("should match content with smart quotes", async () => {
+			const originalContent =
+				"**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can’t wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!"
+			const diffContent = `test.ts
+<<<<<<< SEARCH
+**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can’t wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!
+=======
+**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can't wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!
+
+You're still here?
+>>>>>>> REPLACE`
+
+			const result = await strategy.applyDiff(originalContent, diffContent)
+			expect(result.success).toBe(true)
+			if (result.success) {
+				expect(result.content).toBe(
+					"**Enjoy Roo Code!** Whether you keep it on a short leash or let it roam autonomously, we can't wait to see what you build. If you have questions or feature ideas, drop by our [Reddit community](https://www.reddit.com/r/RooCode/) or [Discord](https://discord.gg/roocode). Happy coding!\n\nYou're still here?",
+				)
+			}
+		})
+
 		it("should not exact match empty lines", async () => {
 			const originalContent = "function sum(a, b) {\n\n    return a + b;\n}"
 			const diffContent = `test.ts

+ 4 - 5
src/core/diff/strategies/multi-search-replace.ts

@@ -3,6 +3,7 @@ import { addLineNumbers, everyLineHasLineNumbers, stripLineNumbers } from "../..
 import { distance } from "fastest-levenshtein"
 import { ToolProgressStatus } from "../../../shared/ExtensionMessage"
 import { ToolUse } from "../../assistant-message"
+import { normalizeString } from "../../../utils/text-normalization"
 
 const BUFFER_LINES = 40 // Number of extra context lines to show before and after matches
 
@@ -12,11 +13,9 @@ function getSimilarity(original: string, search: string): number {
 		return 0
 	}
 
-	// Normalize strings by removing extra whitespace but preserve case
-	const normalizeStr = (str: string) => str.replace(/\s+/g, " ").trim()
-
-	const normalizedOriginal = normalizeStr(original)
-	const normalizedSearch = normalizeStr(search)
+	// Normalize smart quotes, typographic characters, and whitespace (collapsed and trimmed; case is preserved) before comparing
+	const normalizedOriginal = normalizeString(original)
+	const normalizedSearch = normalizeString(search)
 
 	if (normalizedOriginal === normalizedSearch) {
 		return 1

+ 33 - 0
src/utils/__tests__/text-normalization.test.ts

@@ -0,0 +1,33 @@
+import { normalizeString } from "../text-normalization"
+
+describe("Text normalization utilities", () => {
+	describe("normalizeString", () => {
+		test("normalizes smart quotes by default", () => {
+			expect(normalizeString("These are \u201Csmart quotes\u201D and \u2018single quotes\u2019")).toBe(
+				"These are \"smart quotes\" and 'single quotes'",
+			)
+		})
+
+		test("normalizes typographic characters by default", () => {
+			expect(normalizeString("This has an em dash \u2014 and ellipsis\u2026")).toBe(
+				"This has an em dash - and ellipsis...",
+			)
+		})
+
+		test("normalizes whitespace by default", () => {
+			expect(normalizeString("Multiple   spaces and\t\ttabs")).toBe("Multiple spaces and tabs")
+		})
+
+		test("can be configured to skip certain normalizations", () => {
+			const input = "Keep \u201Csmart quotes\u201D but normalize   whitespace"
+			expect(normalizeString(input, { smartQuotes: false })).toBe(
+				"Keep \u201Csmart quotes\u201D but normalize whitespace",
+			)
+		})
+
+		test("real-world example with mixed characters", () => {
+			const input = "Let\u2019s test this\u2014with some \u201Cfancy\u201D punctuation\u2026 and   spaces"
+			expect(normalizeString(input)).toBe('Let\'s test this-with some "fancy" punctuation... and spaces')
+		})
+	})
+})

+ 77 - 0
src/utils/text-normalization.ts

@@ -0,0 +1,77 @@
+/**
+ * Common character mappings for normalization
+ */
+export const NORMALIZATION_MAPS = {
+	// Smart quotes to regular quotes
+	SMART_QUOTES: {
+		"\u201C": '"', // Left double quote (U+201C)
+		"\u201D": '"', // Right double quote (U+201D)
+		"\u2018": "'", // Left single quote (U+2018)
+		"\u2019": "'", // Right single quote (U+2019)
+	},
+	// Other typographic characters
+	TYPOGRAPHIC: {
+		"\u2026": "...", // Ellipsis
+		"\u2014": "-", // Em dash
+		"\u2013": "-", // En dash
+		"\u00A0": " ", // Non-breaking space
+	},
+}
+
+/**
+ * Options for string normalization
+ */
+export interface NormalizeOptions {
+	smartQuotes?: boolean // Replace smart quotes with straight quotes
+	typographicChars?: boolean // Replace typographic characters
+	extraWhitespace?: boolean // Collapse each run of whitespace (spaces, tabs, newlines) into a single space
+	trim?: boolean // Trim whitespace from start and end
+}
+
+/**
+ * Default options for normalization
+ */
+const DEFAULT_OPTIONS: NormalizeOptions = {
+	smartQuotes: true,
+	typographicChars: true,
+	extraWhitespace: true,
+	trim: true,
+}
+
+/**
+ * Normalizes a string based on the specified options; any option left unset falls back to its default (enabled)
+ *
+ * @param str The string to normalize
+ * @param options Normalization options
+ * @returns The normalized string
+ */
+export function normalizeString(str: string, options: NormalizeOptions = DEFAULT_OPTIONS): string {
+	const opts = { ...DEFAULT_OPTIONS, ...options }
+	let normalized = str
+
+	// Replace smart quotes
+	if (opts.smartQuotes) {
+		for (const [smart, regular] of Object.entries(NORMALIZATION_MAPS.SMART_QUOTES)) {
+			normalized = normalized.replace(new RegExp(smart, "g"), regular)
+		}
+	}
+
+	// Replace typographic characters
+	if (opts.typographicChars) {
+		for (const [typographic, regular] of Object.entries(NORMALIZATION_MAPS.TYPOGRAPHIC)) {
+			normalized = normalized.replace(new RegExp(typographic, "g"), regular)
+		}
+	}
+
+	// Normalize whitespace
+	if (opts.extraWhitespace) {
+		normalized = normalized.replace(/\s+/g, " ")
+	}
+
+	// Trim whitespace
+	if (opts.trim) {
+		normalized = normalized.trim()
+	}
+
+	return normalized
+}