BitapAlgorithm.cs 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. namespace Masuit.Tools.TextDiff;
  2. /*
  3. * https://en.wikipedia.org/wiki/Bitap_algorithm
  4. */
  5. /// <summary>
  6. /// 实现Bitap算法,允许近似字符串匹配的文本搜索算法。提供了在文本字符串中定位给定模式的最佳实例的功能,考虑了潜在的不匹配和错误。该算法通过MatchSettings配置,其中包括匹配阈值和距离,确定搜索的灵敏度和灵活性。
  7. /// </summary>
  8. internal class BitapAlgorithm(MatchOption option)
  9. {
  10. // 匹配阈值 (0.0 = 最完美, 1.0 = 非常宽松).
  11. private readonly float _matchThreshold = option.MatchThreshold;
  12. // 搜索匹配的距离(0=精确位置,1000以上=广泛匹配)。与预期位置相差多少字符的匹配将分数增加1.0(0.0是完美匹配)。
  13. private readonly int _matchDistance = option.MatchDistance;
  14. /// <summary>
  15. /// 使用Bitap算法在“loc”附近的“text”中找到“pattern”的最佳实例。如果未找到匹配项,则返回-1。
  16. /// </summary>
  17. /// <param name="text">需要搜索的文本</param>
  18. /// <param name="pattern">被搜索片段</param>
  19. /// <param name="startIndex">搜索位置</param>
  20. /// <returns>最匹配的位置或 -1.</returns>
  21. public int Match(string text, string pattern, int startIndex)
  22. {
  23. double scoreThreshold = _matchThreshold;
  24. var bestMatchIndex = text.IndexOf(pattern, startIndex, StringComparison.Ordinal);
  25. if (bestMatchIndex != -1)
  26. {
  27. scoreThreshold = Math.Min(MatchBitapScore(0, bestMatchIndex, startIndex, pattern), scoreThreshold);
  28. bestMatchIndex = text.LastIndexOf(pattern, Math.Min(startIndex + pattern.Length, text.Length), StringComparison.Ordinal);
  29. if (bestMatchIndex != -1)
  30. {
  31. scoreThreshold = Math.Min(MatchBitapScore(0, bestMatchIndex, startIndex, pattern), scoreThreshold);
  32. }
  33. }
  34. var s = InitAlphabet(pattern);
  35. var matchmask = 1 << (pattern.Length - 1);
  36. bestMatchIndex = -1;
  37. var currentMaxRange = pattern.Length + text.Length;
  38. var lastComputedRow = Array.Empty<int>();
  39. for (var d = 0; d < pattern.Length; d++)
  40. {
  41. var currentMinRange = 0;
  42. var currentMidpoint = currentMaxRange;
  43. while (currentMinRange < currentMidpoint)
  44. {
  45. if (MatchBitapScore(d, startIndex + currentMidpoint, startIndex, pattern) <= scoreThreshold)
  46. currentMinRange = currentMidpoint;
  47. else
  48. currentMaxRange = currentMidpoint;
  49. currentMidpoint = (currentMaxRange - currentMinRange) / 2 + currentMinRange;
  50. }
  51. currentMaxRange = currentMidpoint;
  52. var start = Math.Max(1, startIndex - currentMidpoint + 1);
  53. var finish = Math.Min(startIndex + currentMidpoint, text.Length) + pattern.Length;
  54. var rd = new int[finish + 2];
  55. rd[finish + 1] = (1 << d) - 1;
  56. for (var j = finish; j >= start; j--)
  57. {
  58. int charMatch;
  59. if (text.Length <= j - 1 || !s.ContainsKey(text[j - 1]))
  60. {
  61. charMatch = 0;
  62. }
  63. else
  64. {
  65. charMatch = s[text[j - 1]];
  66. }
  67. if (d == 0)
  68. {
  69. rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
  70. }
  71. else
  72. {
  73. rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | ((lastComputedRow[j + 1] | lastComputedRow[j]) << 1) | 1 | lastComputedRow[j + 1];
  74. }
  75. if ((rd[j] & matchmask) != 0)
  76. {
  77. var score = MatchBitapScore(d, j - 1, startIndex, pattern);
  78. if (score <= scoreThreshold)
  79. {
  80. scoreThreshold = score;
  81. bestMatchIndex = j - 1;
  82. if (bestMatchIndex > startIndex)
  83. {
  84. start = Math.Max(1, 2 * startIndex - bestMatchIndex);
  85. }
  86. else
  87. {
  88. break;
  89. }
  90. }
  91. }
  92. }
  93. if (MatchBitapScore(d + 1, startIndex, startIndex, pattern) > scoreThreshold)
  94. {
  95. break;
  96. }
  97. lastComputedRow = rd;
  98. }
  99. return bestMatchIndex;
  100. }
  101. /// <summary>
  102. /// 初始化Bitap算法的字母表
  103. /// </summary>
  104. /// <param name="pattern"></param>
  105. /// <returns></returns>
  106. internal static Dictionary<char, int> InitAlphabet(string pattern) => pattern.Select((c, i) => (c, i)).Aggregate(new Dictionary<char, int>(), (d, x) =>
  107. {
  108. d.TryGetValue(x.c, out var value);
  109. d[x.c] = value | (1 << (pattern.Length - x.i - 1));
  110. return d;
  111. });
  112. /// <summary>
  113. /// 计算并匹配得分
  114. /// </summary>
  115. /// <param name="errors">匹配中的错误数</param>
  116. /// <param name="location">匹配位置.</param>
  117. /// <param name="expectedLocation">预期位置</param>
  118. /// <param name="pattern">查找片段</param>
  119. /// <returns>匹配得分 (0.0 = 最好, 1.0 = 最差).</returns>
  120. private double MatchBitapScore(int errors, int location, int expectedLocation, string pattern)
  121. {
  122. var accuracy = (float)errors / pattern.Length;
  123. var proximity = Math.Abs(expectedLocation - location);
  124. return (_matchDistance, proximity) switch
  125. {
  126. (0, 0) => accuracy,
  127. (0, _) => 1.0,
  128. _ => accuracy + proximity / (float)_matchDistance
  129. };
  130. }
  131. }