// text-normalization.ts — string normalization and HTML-entity unescaping helpers
  /**
   * Curly ("smart") quotation marks mapped to their straight ASCII equivalents.
   */
  const SMART_QUOTE_REPLACEMENTS = {
    "\u201C": '"', // U+201C left double quotation mark
    "\u201D": '"', // U+201D right double quotation mark
    "\u2018": "'", // U+2018 left single quotation mark
    "\u2019": "'", // U+2019 right single quotation mark
  }

  /**
   * Other typographic characters mapped to plain ASCII stand-ins.
   */
  const TYPOGRAPHIC_REPLACEMENTS = {
    "\u2026": "...", // U+2026 horizontal ellipsis
    "\u2014": "-", // U+2014 em dash
    "\u2013": "-", // U+2013 en dash
    "\u00A0": " ", // U+00A0 non-breaking space
  }

  /**
   * Character replacement tables used for normalization, grouped by category.
   * Each table maps one source character to the text that replaces it.
   */
  export const NORMALIZATION_MAPS = {
    SMART_QUOTES: SMART_QUOTE_REPLACEMENTS,
    TYPOGRAPHIC: TYPOGRAPHIC_REPLACEMENTS,
  }
  /**
   * Flags selecting which normalization steps are applied to a string.
   * Every flag is optional.
   */
  export interface NormalizeOptions {
    /** Replace curly (smart) quotes with straight quotes. */
    smartQuotes?: boolean
    /** Replace typographic characters (ellipsis, dashes, non-breaking space) with plain equivalents. */
    typographicChars?: boolean
    /** Collapse each run of whitespace into a single space. */
    extraWhitespace?: boolean
    /** Remove whitespace from the start and end of the string. */
    trim?: boolean
  }
  /**
   * Default normalization options: every step is enabled.
   *
   * Typed as `Required<NormalizeOptions>` so the compiler rejects this table
   * if a flag is ever added to the options type without a default here.
   */
  const DEFAULT_OPTIONS: Required<NormalizeOptions> = {
    smartQuotes: true,
    typographicChars: true,
    extraWhitespace: true,
    trim: true,
  }
  /**
   * Normalizes a string according to the given options.
   *
   * Steps run in a fixed order: smart quotes, typographic characters,
   * whitespace collapsing, then trimming. A flag that is omitted or explicitly
   * `undefined` falls back to its default, so forwarding an unset value
   * (e.g. `{ trim: config.trim }`) does not silently disable a step.
   *
   * Whitespace collapsing uses the regex `\s` class, which also matches a
   * non-breaking space, so U+00A0 becomes a regular space there even when
   * `typographicChars` is off.
   *
   * @param str The string to normalize
   * @param options Normalization options; unspecified flags use the defaults
   * @returns The normalized string
   */
  export function normalizeString(str: string, options: NormalizeOptions = DEFAULT_OPTIONS): string {
    const merged: NormalizeOptions = { ...DEFAULT_OPTIONS, ...options }

    // The spread lets an explicit `undefined` overwrite a default, so each flag
    // is resolved against the defaults once more before it is tested.
    const isEnabled = (flag: keyof NormalizeOptions): boolean => Boolean(merged[flag] ?? DEFAULT_OPTIONS[flag])

    // Literal split/join replacement: map keys are never interpreted as regex
    // syntax, and no RegExp object is built per key on every call.
    const applyMap = (text: string, replacements: Readonly<Record<string, string>>): string => {
      let result = text
      for (const [from, to] of Object.entries(replacements)) {
        result = result.split(from).join(to)
      }
      return result
    }

    let normalized = str

    if (isEnabled("smartQuotes")) {
      normalized = applyMap(normalized, NORMALIZATION_MAPS.SMART_QUOTES)
    }

    if (isEnabled("typographicChars")) {
      normalized = applyMap(normalized, NORMALIZATION_MAPS.TYPOGRAPHIC)
    }

    if (isEnabled("extraWhitespace")) {
      normalized = normalized.replace(/\s+/g, " ")
    }

    if (isEnabled("trim")) {
      normalized = normalized.trim()
    }

    return normalized
  }
  /**
   * Converts a fixed set of HTML entities back to their literal characters:
   * `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;`.
   * Any other entity is left untouched.
   *
   * @param text The string that may contain HTML entities
   * @returns The string with the supported entities decoded; a falsy input is returned unchanged
   */
  export function unescapeHtmlEntities(text: string): string {
    if (!text) {
      return text
    }

    // Applied in order. `&amp;` must stay last so that a doubly escaped
    // sequence such as `&amp;lt;` decodes to `&lt;` rather than to `<`.
    const decodeSteps: ReadonlyArray<readonly [RegExp, string]> = [
      [/&lt;/g, "<"],
      [/&gt;/g, ">"],
      [/&quot;/g, '"'],
      [/&#39;/g, "'"],
      [/&apos;/g, "'"],
      [/&amp;/g, "&"],
    ]

    return decodeSteps.reduce((decoded, [entity, literal]) => decoded.replace(entity, literal), text)
  }