using System.Text;
using System.Text.RegularExpressions;
namespace Masuit.Tools.TextDiff;
internal static class TextUtil
{
private static readonly Regex HexCode = new("%[0-9A-F][0-9A-F]");
///
/// 求两个字符串的最长公共前子串长度
///
///
///
/// text1子字符串的起始索引
/// text2子字符串的起始索引
/// 每个字符串开头共有的字符数
internal static int CommonPrefix(ReadOnlySpan text1, ReadOnlySpan text2, int i1 = 0, int i2 = 0)
{
var l1 = text1.Length - i1;
var l2 = text2.Length - i2;
var n = Math.Min(l1, l2);
for (var i = 0; i < n; i++)
{
if (text1[i + i1] != text2[i + i2])
{
return i;
}
}
return n;
}
internal static int CommonPrefix(StringBuilder text1, StringBuilder text2)
{
var n = Math.Min(text1.Length, text2.Length);
for (var i = 0; i < n; i++)
{
if (text1[i] != text2[i])
{
return i;
}
}
return n;
}
///
/// 求两个字符串的最长公共后子串长度
///
///
///
/// text1的最大长度
/// text2的最大长度
/// 每个字符串末尾共有的字符数
internal static int CommonSuffix(ReadOnlySpan text1, ReadOnlySpan text2, int? l1 = null, int? l2 = null)
{
var text1Length = l1 ?? text1.Length;
var text2Length = l2 ?? text2.Length;
var n = Math.Min(text1Length, text2Length);
for (var i = 1; i <= n; i++)
{
if (text1[text1Length - i] != text2[text2Length - i])
{
return i - 1;
}
}
return n;
}
internal static int CommonSuffix(StringBuilder text1, StringBuilder text2)
{
var text1Length = text1.Length;
var text2Length = text2.Length;
var n = Math.Min(text1Length, text2Length);
for (var i = 1; i <= n; i++)
{
if (text1[text1Length - i] != text2[text2Length - i])
{
return i - 1;
}
}
return n;
}
///
/// 确定一个字符串的后缀是否是另一个字符串。返回第一个字符串末尾和第二个字符串开头共有的字符数。
///
///
///
///
internal static int CommonOverlap(ReadOnlySpan text1, ReadOnlySpan text2)
{
var text1Length = text1.Length;
var text2Length = text2.Length;
if (text1Length == 0 || text2Length == 0)
{
return 0;
}
if (text1Length > text2Length)
{
text1 = text1[(text1Length - text2Length)..];
}
else if (text1Length < text2Length)
{
text2 = text2[..text1Length];
}
var last = text1[^1];
for (var length = text2.Length; length > 0; length--)
{
if (text2[length - 1] == last && text1.EndsWith(text2[..length]))
{
return length;
}
}
return 0;
}
///
/// 长文本中是否存在短文本的子字符串,使得子字符串至少是长文本长度的一半
///
///
///
/// 在长文本内开始四分之一长度的子字符串索引位置
///
private static HalfMatchResult HalfMatchI(ReadOnlySpan longtext, ReadOnlySpan shorttext, int i)
{
var seed = longtext.Slice(i, longtext.Length / 4);
var j = -1;
var bestCommon = string.Empty;
string bestLongtextA = string.Empty, bestLongtextB = string.Empty;
string bestShorttextA = string.Empty, bestShorttextB = string.Empty;
int n = j;
while (n < shorttext.Length && (j = shorttext[(j + 1)..].IndexOf(seed, StringComparison.Ordinal)) != -1)
{
j = n = j + n + 1;
var prefixLength = CommonPrefix(longtext, shorttext, i, j);
var suffixLength = CommonSuffix(longtext, shorttext, i, j);
if (bestCommon.Length < suffixLength + prefixLength)
{
bestCommon = shorttext.Slice(j - suffixLength, suffixLength).ToString() + shorttext.Slice(j, prefixLength).ToString();
bestLongtextA = longtext[..(i - suffixLength)].ToString();
bestLongtextB = longtext[(i + prefixLength)..].ToString();
bestShorttextA = shorttext[..(j - suffixLength)].ToString();
bestShorttextB = shorttext[(j + prefixLength)..].ToString();
}
}
return bestCommon.Length * 2 >= longtext.Length ? new(bestLongtextA, bestLongtextB, bestShorttextA, bestShorttextB, bestCommon) : HalfMatchResult.Empty;
}
///
/// 这两个文本是否共享一个子字符串,子字符串的长度至少是较长文本的一半?
/// 这种加速会产生非最小的差异。
///
///
///
///
internal static HalfMatchResult HalfMatch(ReadOnlySpan text1, ReadOnlySpan text2)
{
var longtext = text1.Length > text2.Length ? text1 : text2;
var shorttext = text1.Length > text2.Length ? text2 : text1;
if (longtext.Length < 4 || shorttext.Length * 2 < longtext.Length)
{
return HalfMatchResult.Empty;
}
var hm1 = HalfMatchI(longtext, shorttext, (longtext.Length + 3) / 4);
var hm2 = HalfMatchI(longtext, shorttext, (longtext.Length + 1) / 2);
var hm = (hm1, hm2) switch
{
{ hm1.IsEmpty: true } and { hm2.IsEmpty: true } => hm1,
{ hm2.IsEmpty: true } => hm1,
{ hm1.IsEmpty: true } => hm2,
_ when hm1 > hm2 => hm1,
_ => hm2
};
return text1.Length > text2.Length ? hm : -hm;
}
internal static string UrlEncoded(this string str)
{
const int maxLength = 0xFFEF;
StringBuilder sb = new();
var index = 0;
while (index + maxLength < str.Length)
{
sb.Append(Uri.EscapeDataString(str.Substring(index, maxLength)));
index += maxLength;
}
sb.Append(Uri.EscapeDataString(str[index..]));
sb = sb.Replace('+', ' ').Replace("%20", " ").Replace("%21", "!").Replace("%2A", "*").Replace("%27", "'").Replace("%28", "(").Replace("%29", ")").Replace("%3B", ";").Replace("%2F", "/").Replace("%3F", "?").Replace("%3A", ":").Replace("%40", "@").Replace("%26", "&").Replace("%3D", "=").Replace("%2B", "+").Replace("%24", "$").Replace("%2C", ",").Replace("%23", "#");
return HexCode.Replace(sb.ToString(), s => s.Value.ToLower());
}
internal static string UrlDecoded(this string str) => Uri.UnescapeDataString(str);
///
/// 查找最匹配的索引位置
/// 返回 -1 则未匹配到
///
///
///
///
///
///
internal static int FindBestMatchIndex(this string text, string pattern, int loc, MatchOption option)
{
loc = Math.Max(0, Math.Min(loc, text.Length));
if (text == pattern)
{
return 0;
}
if (text.Length == 0)
{
return -1;
}
if (loc + pattern.Length <= text.Length && text.AsSpan(loc, pattern.Length).SequenceEqual(pattern))
{
return loc;
}
var bitap = new BitapAlgorithm(option);
return bitap.Match(text, pattern, loc);
}
}