TextUtil.cs 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. using System.Text;
  2. using System.Text.RegularExpressions;
  3. namespace Masuit.Tools.TextDiff;
  4. internal static class TextUtil
  5. {
  6. private static readonly Regex HexCode = new("%[0-9A-F][0-9A-F]");
  /// <summary>
  /// Counts how many leading characters two texts have in common, comparing from the given start offsets.
  /// </summary>
  /// <param name="text1">First text.</param>
  /// <param name="text2">Second text.</param>
  /// <param name="i1">Index in <paramref name="text1"/> at which the comparison starts.</param>
  /// <param name="i2">Index in <paramref name="text2"/> at which the comparison starts.</param>
  /// <returns>The number of characters shared by both texts, counted from their start offsets.</returns>
  internal static int CommonPrefix(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, int i1 = 0, int i2 = 0)
  {
    // At most the shorter of the two remaining tails can match.
    var limit = Math.Min(text1.Length - i1, text2.Length - i2);
    var matched = 0;
    while (matched < limit)
    {
      if (text1[i1 + matched] != text2[i2 + matched])
      {
        return matched;
      }
      matched++;
    }
    return limit;
  }
  /// <summary>
  /// Counts how many leading characters two string builders have in common.
  /// </summary>
  /// <param name="text1">First builder.</param>
  /// <param name="text2">Second builder.</param>
  /// <returns>The number of characters shared at the start of both builders.</returns>
  internal static int CommonPrefix(StringBuilder text1, StringBuilder text2)
  {
    var limit = Math.Min(text1.Length, text2.Length);
    var matched = 0;
    while (matched < limit)
    {
      if (text1[matched] != text2[matched])
      {
        return matched;
      }
      matched++;
    }
    return limit;
  }
  /// <summary>
  /// Counts how many trailing characters two texts have in common.
  /// </summary>
  /// <param name="text1">First text.</param>
  /// <param name="text2">Second text.</param>
  /// <param name="l1">Logical length of <paramref name="text1"/> to consider; defaults to its full length.</param>
  /// <param name="l2">Logical length of <paramref name="text2"/> to consider; defaults to its full length.</param>
  /// <returns>The number of characters shared at the (logical) end of both texts.</returns>
  internal static int CommonSuffix(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, int? l1 = null, int? l2 = null)
  {
    var end1 = l1 ?? text1.Length;
    var end2 = l2 ?? text2.Length;
    var limit = Math.Min(end1, end2);
    var matched = 0;
    while (matched < limit)
    {
      // Walk backwards from the logical end of each text.
      if (text1[end1 - 1 - matched] != text2[end2 - 1 - matched])
      {
        return matched;
      }
      matched++;
    }
    return limit;
  }
  /// <summary>
  /// Counts how many trailing characters two string builders have in common.
  /// </summary>
  /// <param name="text1">First builder.</param>
  /// <param name="text2">Second builder.</param>
  /// <returns>The number of characters shared at the end of both builders.</returns>
  internal static int CommonSuffix(StringBuilder text1, StringBuilder text2)
  {
    var end1 = text1.Length;
    var end2 = text2.Length;
    var limit = Math.Min(end1, end2);
    var matched = 0;
    while (matched < limit)
    {
      if (text1[end1 - 1 - matched] != text2[end2 - 1 - matched])
      {
        return matched;
      }
      matched++;
    }
    return limit;
  }
  /// <summary>
  /// Determines whether the end of one text overlaps the start of another.
  /// </summary>
  /// <param name="text1">Text whose suffix is examined.</param>
  /// <param name="text2">Text whose prefix is examined.</param>
  /// <returns>
  /// The length of the longest suffix of <paramref name="text1"/> that is also a prefix of
  /// <paramref name="text2"/>, or 0 when there is none.
  /// </returns>
  internal static int CommonOverlap(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2)
  {
    if (text1.IsEmpty || text2.IsEmpty)
    {
      return 0;
    }

    // An overlap can never be longer than the shorter text, so trim both sides to that window.
    var window = Math.Min(text1.Length, text2.Length);
    var tail = text1[^window..];
    var head = text2[..window];
    var lastChar = tail[^1];

    // Try the longest candidate first; the single-character comparison is a cheap filter before EndsWith.
    for (var candidate = window; candidate > 0; candidate--)
    {
      var prefix = head[..candidate];
      if (prefix[^1] == lastChar && tail.EndsWith(prefix))
      {
        return candidate;
      }
    }
    return 0;
  }
  /// <summary>
  /// Looks for a substring shared by both texts that is at least half as long as <paramref name="longtext"/>,
  /// seeded from a quarter-length window of <paramref name="longtext"/>.
  /// </summary>
  /// <param name="longtext">The longer text.</param>
  /// <param name="shorttext">The shorter text.</param>
  /// <param name="i">Start index in <paramref name="longtext"/> of the quarter-length seed window.</param>
  /// <returns>
  /// The best split found (long-text prefix/suffix, short-text prefix/suffix, common middle) when the common
  /// part is at least half of <paramref name="longtext"/>; otherwise <c>HalfMatchResult.Empty</c>.
  /// </returns>
  private static HalfMatchResult HalfMatchI(ReadOnlySpan<char> longtext, ReadOnlySpan<char> shorttext, int i)
  {
    var seed = longtext.Slice(i, longtext.Length / 4);
    var bestCommon = string.Empty;
    var bestLongtextA = string.Empty;
    var bestLongtextB = string.Empty;
    var bestShorttextA = string.Empty;
    var bestShorttextB = string.Empty;

    // searchFrom is the absolute index in shorttext at which the next seed search begins.
    for (var searchFrom = 0; searchFrom <= shorttext.Length;)
    {
      var relative = shorttext[searchFrom..].IndexOf(seed, StringComparison.Ordinal);
      if (relative == -1)
      {
        break;
      }

      // Convert the hit to an absolute position and resume just past it next time.
      var j = searchFrom + relative;
      searchFrom = j + 1;

      // Grow the match forwards from (i, j) and backwards from the text before (i, j).
      var prefixLength = CommonPrefix(longtext, shorttext, i, j);
      var suffixLength = CommonSuffix(longtext, shorttext, i, j);
      if (bestCommon.Length >= suffixLength + prefixLength)
      {
        continue;
      }

      // The backward and forward parts are adjacent in shorttext, so one slice covers both.
      var commonStart = j - suffixLength;
      bestCommon = shorttext.Slice(commonStart, suffixLength + prefixLength).ToString();
      bestLongtextA = longtext[..(i - suffixLength)].ToString();
      bestLongtextB = longtext[(i + prefixLength)..].ToString();
      bestShorttextA = shorttext[..commonStart].ToString();
      bestShorttextB = shorttext[(j + prefixLength)..].ToString();
    }

    if (bestCommon.Length * 2 < longtext.Length)
    {
      return HalfMatchResult.Empty;
    }
    return new(bestLongtextA, bestLongtextB, bestShorttextA, bestShorttextB, bestCommon);
  }
  /// <summary>
  /// Checks whether the two texts share a substring that is at least half the length of the longer text.
  /// This speed-up can produce a non-minimal diff.
  /// </summary>
  /// <param name="text1">First text.</param>
  /// <param name="text2">Second text.</param>
  /// <returns>
  /// The better of the two candidate half matches, or <c>HalfMatchResult.Empty</c> when the longer text has
  /// fewer than 4 characters or the shorter text is less than half its length.
  /// </returns>
  internal static HalfMatchResult HalfMatch(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2)
  {
    var firstIsLonger = text1.Length > text2.Length;
    var longtext = firstIsLonger ? text1 : text2;
    var shorttext = firstIsLonger ? text2 : text1;
    if (longtext.Length < 4 || shorttext.Length * 2 < longtext.Length)
    {
      return HalfMatchResult.Empty;
    }

    // Probe with seeds starting at the second quarter and at the third quarter of the long text.
    var hm1 = HalfMatchI(longtext, shorttext, (longtext.Length + 3) / 4);
    var hm2 = HalfMatchI(longtext, shorttext, (longtext.Length + 1) / 2);

    HalfMatchResult hm;
    if (hm2.IsEmpty)
    {
      // Also covers the case where both candidates are empty.
      hm = hm1;
    }
    else if (hm1.IsEmpty)
    {
      hm = hm2;
    }
    else
    {
      hm = hm1 > hm2 ? hm1 : hm2;
    }

    // NOTE(review): unary minus presumably swaps the text1/text2 sides of the result so they line up with
    // the caller's argument order when text2 was used as the long text; confirm against HalfMatchResult.
    return firstIsLonger ? hm : -hm;
  }
  /// <summary>
  /// Percent-encodes <paramref name="str"/> with <see cref="Uri.EscapeDataString(string)"/>, then restores a
  /// fixed set of characters (space and <c>! * ' ( ) ; / ? : @ &amp; = + $ , #</c>) to their literal form and
  /// lower-cases the hex digits of the escapes that remain.
  /// </summary>
  /// <param name="str">The text to encode.</param>
  /// <returns>The encoded text.</returns>
  internal static string UrlEncoded(this string str)
  {
    // The input is escaped in chunks of at most this many chars; presumably this keeps each call under the
    // input-length limit of Uri.EscapeDataString on older runtimes (confirm before changing).
    const int maxLength = 0xFFEF;
    StringBuilder sb = new();
    var index = 0;
    while (index + maxLength < str.Length)
    {
      var length = maxLength;

      // Never cut between the two halves of a surrogate pair: escaping each half separately would corrupt
      // the character (or throw on runtimes that reject lone surrogates). str[index + length] is in range
      // because the loop condition guarantees index + maxLength < str.Length.
      if (char.IsHighSurrogate(str[index + length - 1]) && char.IsLowSurrogate(str[index + length]))
      {
        length--;
      }

      sb.Append(Uri.EscapeDataString(str.Substring(index, length)));
      index += length;
    }
    sb.Append(Uri.EscapeDataString(str[index..]));

    // Undo the escaping for the characters that are kept literal in the output.
    sb = sb.Replace('+', ' ')
      .Replace("%20", " ")
      .Replace("%21", "!")
      .Replace("%2A", "*")
      .Replace("%27", "'")
      .Replace("%28", "(")
      .Replace("%29", ")")
      .Replace("%3B", ";")
      .Replace("%2F", "/")
      .Replace("%3F", "?")
      .Replace("%3A", ":")
      .Replace("%40", "@")
      .Replace("%26", "&")
      .Replace("%3D", "=")
      .Replace("%2B", "+")
      .Replace("%24", "$")
      .Replace("%2C", ",")
      .Replace("%23", "#");

    // Remaining escapes are emitted with lower-case hex digits (e.g. %E4 -> %e4).
    return HexCode.Replace(sb.ToString(), s => s.Value.ToLowerInvariant());
  }
  /// <summary>
  /// Converts the percent-escaped sequences in <paramref name="str"/> back to their unescaped form
  /// using <see cref="Uri.UnescapeDataString(string)"/>.
  /// </summary>
  /// <param name="str">The escaped text.</param>
  /// <returns>The unescaped text.</returns>
  internal static string UrlDecoded(this string str)
  {
    return Uri.UnescapeDataString(str);
  }
  /// <summary>
  /// Finds the index in <paramref name="text"/> that best matches <paramref name="pattern"/> near
  /// <paramref name="loc"/>.
  /// </summary>
  /// <param name="text">The text to search.</param>
  /// <param name="pattern">The pattern to look for.</param>
  /// <param name="loc">The expected location of the match; clamped to the range [0, text.Length].</param>
  /// <param name="option">Options passed to the <c>BitapAlgorithm</c> used for the fuzzy search.</param>
  /// <returns>
  /// 0 when <paramref name="text"/> equals <paramref name="pattern"/>; -1 when <paramref name="text"/> is
  /// empty; <paramref name="loc"/> when the pattern occurs exactly there; otherwise whatever
  /// <c>BitapAlgorithm.Match</c> returns (presumably -1 when nothing matches; confirm against that type).
  /// </returns>
  internal static int FindBestMatchIndex(this string text, string pattern, int loc, MatchOption option)
  {
    loc = Math.Max(0, Math.Min(loc, text.Length));

    // Shortcut: identical strings match at the very start regardless of loc.
    if (text == pattern)
    {
      return 0;
    }

    // Nothing to search in.
    if (text.Length == 0)
    {
      return -1;
    }

    // Exact match at the expected location. An explicit AsSpan() on the pattern compiles on every target
    // framework, so no conditional compilation is needed here.
    if (loc + pattern.Length <= text.Length && text.AsSpan(loc, pattern.Length).SequenceEqual(pattern.AsSpan()))
    {
      return loc;
    }

    // Fall back to the fuzzy search.
    var bitap = new BitapAlgorithm(option);
    return bitap.Match(text, pattern, loc);
  }
  213. }