namespace Masuit.Tools.TextDiff;
/*
* https://en.wikipedia.org/wiki/Bitap_algorithm
*/
///
/// 实现Bitap算法,允许近似字符串匹配的文本搜索算法。提供了在文本字符串中定位给定模式的最佳实例的功能,考虑了潜在的不匹配和错误。该算法通过MatchSettings配置,其中包括匹配阈值和距离,确定搜索的灵敏度和灵活性。
///
internal class BitapAlgorithm(MatchOption option)
{
// 匹配阈值 (0.0 = 最完美, 1.0 = 非常宽松).
private readonly float _matchThreshold = option.MatchThreshold;
// 搜索匹配的距离(0=精确位置,1000以上=广泛匹配)。与预期位置相差多少字符的匹配将分数增加1.0(0.0是完美匹配)。
private readonly int _matchDistance = option.MatchDistance;
///
/// 使用Bitap算法在“loc”附近的“text”中找到“pattern”的最佳实例。如果未找到匹配项,则返回-1。
///
/// 需要搜索的文本
/// 被搜索片段
/// 搜索位置
/// 最匹配的位置或 -1.
public int Match(string text, string pattern, int startIndex)
{
double scoreThreshold = _matchThreshold;
var bestMatchIndex = text.IndexOf(pattern, startIndex, StringComparison.Ordinal);
if (bestMatchIndex != -1)
{
scoreThreshold = Math.Min(MatchBitapScore(0, bestMatchIndex, startIndex, pattern), scoreThreshold);
bestMatchIndex = text.LastIndexOf(pattern, Math.Min(startIndex + pattern.Length, text.Length), StringComparison.Ordinal);
if (bestMatchIndex != -1)
{
scoreThreshold = Math.Min(MatchBitapScore(0, bestMatchIndex, startIndex, pattern), scoreThreshold);
}
}
var s = InitAlphabet(pattern);
var matchmask = 1 << (pattern.Length - 1);
bestMatchIndex = -1;
var currentMaxRange = pattern.Length + text.Length;
var lastComputedRow = Array.Empty();
for (var d = 0; d < pattern.Length; d++)
{
var currentMinRange = 0;
var currentMidpoint = currentMaxRange;
while (currentMinRange < currentMidpoint)
{
if (MatchBitapScore(d, startIndex + currentMidpoint, startIndex, pattern) <= scoreThreshold)
currentMinRange = currentMidpoint;
else
currentMaxRange = currentMidpoint;
currentMidpoint = (currentMaxRange - currentMinRange) / 2 + currentMinRange;
}
currentMaxRange = currentMidpoint;
var start = Math.Max(1, startIndex - currentMidpoint + 1);
var finish = Math.Min(startIndex + currentMidpoint, text.Length) + pattern.Length;
var rd = new int[finish + 2];
rd[finish + 1] = (1 << d) - 1;
for (var j = finish; j >= start; j--)
{
int charMatch;
if (text.Length <= j - 1 || !s.ContainsKey(text[j - 1]))
{
charMatch = 0;
}
else
{
charMatch = s[text[j - 1]];
}
if (d == 0)
{
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
}
else
{
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | ((lastComputedRow[j + 1] | lastComputedRow[j]) << 1) | 1 | lastComputedRow[j + 1];
}
if ((rd[j] & matchmask) != 0)
{
var score = MatchBitapScore(d, j - 1, startIndex, pattern);
if (score <= scoreThreshold)
{
scoreThreshold = score;
bestMatchIndex = j - 1;
if (bestMatchIndex > startIndex)
{
start = Math.Max(1, 2 * startIndex - bestMatchIndex);
}
else
{
break;
}
}
}
}
if (MatchBitapScore(d + 1, startIndex, startIndex, pattern) > scoreThreshold)
{
break;
}
lastComputedRow = rd;
}
return bestMatchIndex;
}
///
/// 初始化Bitap算法的字母表
///
///
///
internal static Dictionary InitAlphabet(string pattern) => pattern.Select((c, i) => (c, i)).Aggregate(new Dictionary(), (d, x) =>
{
d.TryGetValue(x.c, out var value);
d[x.c] = value | (1 << (pattern.Length - x.i - 1));
return d;
});
///
/// 计算并匹配得分
///
/// 匹配中的错误数
/// 匹配位置.
/// 预期位置
/// 查找片段
/// 匹配得分 (0.0 = 最好, 1.0 = 最差).
private double MatchBitapScore(int errors, int location, int expectedLocation, string pattern)
{
var accuracy = (float)errors / pattern.Length;
var proximity = Math.Abs(expectedLocation - location);
return (_matchDistance, proximity) switch
{
(0, 0) => accuracy,
(0, _) => 1.0,
_ => accuracy + proximity / (float)_matchDistance
};
}
}