DiffAlgorithm.cs 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. using System.Text;
  2. using static Masuit.Tools.TextDiff.DiffOperation;
  3. namespace Masuit.Tools.TextDiff;
  4. internal static class DiffAlgorithm
  5. {
  6. /// <summary>
  7. /// 找出两段文本之间的差异。通过在区分之前剥离文本中的常见前缀或后缀来简化
  8. /// </summary>
  9. /// <param name="text1"></param>
  10. /// <param name="text2"></param>
  11. /// <param name="checklines">加速标记。如果为false,则不要先运行行级差异来识别更改的区域。如果为true,则运行一个速度稍快但不太理想的差异</param>
  12. /// <param name="optimized">启用优化</param>
  13. /// <param name="token"></param>
  14. /// <returns></returns>
  15. internal static IEnumerable<TextDiffer> Compute(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool checklines, bool optimized, CancellationToken token)
  16. {
  17. if (text1.Length == text2.Length && text1.Length == 0)
  18. {
  19. return [];
  20. }
  21. var commonlength = TextUtil.CommonPrefix(text1, text2);
  22. if (commonlength == text1.Length && commonlength == text2.Length)
  23. {
  24. return new List<TextDiffer>()
  25. {
  26. TextDiffer.Equal(text1)
  27. };
  28. }
  29. var commonprefix = text1.Slice(0, commonlength);
  30. text1 = text1[commonlength..];
  31. text2 = text2[commonlength..];
  32. commonlength = TextUtil.CommonSuffix(text1, text2);
  33. var commonsuffix = text1[^commonlength..];
  34. text1 = text1.Slice(0, text1.Length - commonlength);
  35. text2 = text2.Slice(0, text2.Length - commonlength);
  36. List<TextDiffer> diffs = new();
  37. if (commonprefix.Length != 0)
  38. {
  39. diffs.Insert(0, TextDiffer.Equal(commonprefix));
  40. }
  41. diffs.AddRange(ComputeCore(text1, text2, checklines, optimized, token));
  42. if (commonsuffix.Length != 0)
  43. {
  44. diffs.Add(TextDiffer.Equal(commonsuffix));
  45. }
  46. return diffs.CleanupMerge();
  47. }
  48. private static IEnumerable<TextDiffer> ComputeCore(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool checklines, bool optimized, CancellationToken token)
  49. {
  50. if (text1.Length == 0)
  51. {
  52. return TextDiffer.Insert(text2).ItemAsEnumerable();
  53. }
  54. if (text2.Length == 0)
  55. {
  56. return TextDiffer.Delete(text1).ItemAsEnumerable();
  57. }
  58. var longtext = text1.Length > text2.Length ? text1 : text2;
  59. var shorttext = text1.Length > text2.Length ? text2 : text1;
  60. var i = longtext.IndexOf(shorttext, StringComparison.Ordinal);
  61. if (i != -1)
  62. {
  63. if (text1.Length > text2.Length)
  64. {
  65. return new[]
  66. {
  67. TextDiffer.Delete(longtext.Slice(0, i)),
  68. TextDiffer.Equal(shorttext),
  69. TextDiffer.Delete(longtext[(i + shorttext.Length)..])
  70. };
  71. }
  72. return new[]
  73. {
  74. TextDiffer.Insert(longtext.Slice(0, i)),
  75. TextDiffer.Equal(shorttext),
  76. TextDiffer.Insert(longtext[(i + shorttext.Length)..])
  77. };
  78. }
  79. if (shorttext.Length == 1)
  80. {
  81. return new[]
  82. {
  83. TextDiffer.Delete(text1),
  84. TextDiffer.Insert(text2)
  85. };
  86. }
  87. if (optimized)
  88. {
  89. var result = TextUtil.HalfMatch(text1, text2);
  90. if (!result.IsEmpty)
  91. {
  92. #if NETSTANDARD2_1_OR_GREATER
  93. var diffsA = Compute(result.Prefix1, result.Prefix2, checklines, true, token);
  94. var diffsB = Compute(result.Suffix1, result.Suffix2, checklines, true, token);
  95. return diffsA.Concat(TextDiffer.Equal(result.CommonMiddle)).Concat(diffsB);
  96. #else
  97. var diffsA = Compute(result.Prefix1.AsSpan(), result.Prefix2.AsSpan(), checklines, true, token);
  98. var diffsB = Compute(result.Suffix1.AsSpan(), result.Suffix2.AsSpan(), checklines, true, token);
  99. return diffsA.Concat(TextDiffer.Equal(result.CommonMiddle.AsSpan())).Concat(diffsB);
  100. #endif
  101. }
  102. }
  103. if (checklines && text1.Length > 100 && text2.Length > 100)
  104. {
  105. return DiffLines(text1, text2, optimized, token);
  106. }
  107. return MyersDiffBisect(text1, text2, optimized, token);
  108. }
  109. /// <summary>
  110. /// 对两个字符串进行快速的行级差分,然后重新划分部分以获得更高的精度。这种加速会产生非最小的差异。
  111. /// </summary>
  112. /// <param name="text1"></param>
  113. /// <param name="text2"></param>
  114. /// <param name="optimized"></param>
  115. /// <param name="token"></param>
  116. /// <returns></returns>
  117. private static List<TextDiffer> DiffLines(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool optimized, CancellationToken token)
  118. {
  119. var compressor = new StringCompressor();
  120. #if NETSTANDARD2_1_OR_GREATER
  121. text1 = compressor.Compress(text1, char.MaxValue * 2 / 3);
  122. text2 = compressor.Compress(text2);
  123. var diffs = Compute(text1, text2, false, optimized, token).Select(diff => diff.Replace(compressor.Decompress(diff.Text))).ToList().CleanupSemantic();
  124. #else
  125. var diffs = Compute(compressor.Compress(text1, char.MaxValue * 2 / 3).AsSpan(), compressor.Compress(text2).AsSpan(), false, optimized, token).Select(diff => diff.Replace(compressor.Decompress(diff.Text))).ToList().CleanupSemantic();
  126. #endif
  127. return RediffAfterDiffLines(diffs, optimized, token).ToList();
  128. }
  129. /// <summary>
  130. /// 逐个字符清除所有替换块
  131. /// </summary>
  132. /// <param name="diffs"></param>
  133. /// <param name="optimized"></param>
  134. /// <param name="token"></param>
  135. /// <returns></returns>
  136. private static IEnumerable<TextDiffer> RediffAfterDiffLines(IEnumerable<TextDiffer> diffs, bool optimized, CancellationToken token)
  137. {
  138. var ins = new StringBuilder();
  139. var del = new StringBuilder();
  140. foreach (var diff in diffs.Concat(TextDiffer.Empty))
  141. {
  142. (ins, del) = diff.Operation switch
  143. {
  144. Insert => (ins.Append(diff.Text), del),
  145. Delete => (ins, del.Append(diff.Text)),
  146. _ => (ins, del)
  147. };
  148. if (diff.Operation != Equal)
  149. {
  150. continue;
  151. }
  152. var consolidatedDiffsBeforeEqual = diff.Operation switch
  153. {
  154. #if NETSTANDARD2_1_OR_GREATER
  155. Equal when ins.Length > 0 && del.Length > 0 => Compute(del.ToString(), ins.ToString(), false, optimized, token),
  156. Equal when del.Length > 0 => TextDiffer.Delete(del.ToString()).ItemAsEnumerable(),
  157. Equal when ins.Length > 0 => TextDiffer.Insert(ins.ToString()).ItemAsEnumerable(),
  158. #else
  159. Equal when ins.Length > 0 && del.Length > 0 => Compute(del.ToString().AsSpan(), ins.ToString().AsSpan(), false, optimized, token),
  160. Equal when del.Length > 0 => TextDiffer.Delete(del.ToString().AsSpan()).ItemAsEnumerable(),
  161. Equal when ins.Length > 0 => TextDiffer.Insert(ins.ToString().AsSpan()).ItemAsEnumerable(),
  162. #endif
  163. _ => []
  164. };
  165. foreach (var d in consolidatedDiffsBeforeEqual)
  166. {
  167. yield return d;
  168. }
  169. if (!diff.IsEmpty)
  170. yield return diff;
  171. ins.Clear();
  172. del.Clear();
  173. }
  174. }
  175. /// <summary>
  176. /// 找到diff的“中间值”,将问题一分为二,并返回递归构造的diff。
  177. /// </summary>
  178. /// <param name="text1"></param>
  179. /// <param name="text2"></param>
  180. /// <param name="optimizeForSpeed"></param>
  181. /// <param name="token"></param>
  182. /// <returns></returns>
  183. internal static IEnumerable<TextDiffer> MyersDiffBisect(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool optimizeForSpeed, CancellationToken token)
  184. {
  185. var text1Length = text1.Length;
  186. var text2Length = text2.Length;
  187. var maxD = (text1Length + text2Length + 1) / 2;
  188. var vOffset = maxD;
  189. var vLength = 2 * maxD;
  190. var v1 = new int[vLength];
  191. var v2 = new int[vLength];
  192. for (var x = 0; x < vLength; x++)
  193. {
  194. v1[x] = -1;
  195. }
  196. for (var x = 0; x < vLength; x++)
  197. {
  198. v2[x] = -1;
  199. }
  200. v1[vOffset + 1] = 0;
  201. v2[vOffset + 1] = 0;
  202. var delta = text1Length - text2Length;
  203. var front = delta % 2 != 0;
  204. var k1Start = 0;
  205. var k1End = 0;
  206. var k2Start = 0;
  207. var k2End = 0;
  208. for (var d = 0; d < maxD; d++)
  209. {
  210. if (token.IsCancellationRequested)
  211. {
  212. break;
  213. }
  214. for (var k1 = -d + k1Start; k1 <= d - k1End; k1 += 2)
  215. {
  216. var k1Offset = vOffset + k1;
  217. int x1;
  218. if (k1 == -d || k1 != d && v1[k1Offset - 1] < v1[k1Offset + 1])
  219. {
  220. x1 = v1[k1Offset + 1];
  221. }
  222. else
  223. {
  224. x1 = v1[k1Offset - 1] + 1;
  225. }
  226. var y1 = x1 - k1;
  227. while (x1 < text1Length && y1 < text2Length && text1[x1] == text2[y1])
  228. {
  229. x1++;
  230. y1++;
  231. }
  232. v1[k1Offset] = x1;
  233. if (x1 > text1Length)
  234. {
  235. k1End += 2;
  236. }
  237. else if (y1 > text2Length)
  238. {
  239. k1Start += 2;
  240. }
  241. else if (front)
  242. {
  243. var k2Offset = vOffset + delta - k1;
  244. if (k2Offset >= 0 && k2Offset < vLength && v2[k2Offset] != -1)
  245. {
  246. var x2 = text1Length - v2[k2Offset];
  247. if (x1 >= x2)
  248. {
  249. return BisectSplit(text1, text2, x1, y1, optimizeForSpeed, token);
  250. }
  251. }
  252. }
  253. }
  254. for (var k2 = -d + k2Start; k2 <= d - k2End; k2 += 2)
  255. {
  256. var k2Offset = vOffset + k2;
  257. int x2;
  258. if (k2 == -d || k2 != d && v2[k2Offset - 1] < v2[k2Offset + 1])
  259. {
  260. x2 = v2[k2Offset + 1];
  261. }
  262. else
  263. {
  264. x2 = v2[k2Offset - 1] + 1;
  265. }
  266. var y2 = x2 - k2;
  267. while (x2 < text1Length && y2 < text2Length && text1[text1Length - x2 - 1] == text2[text2Length - y2 - 1])
  268. {
  269. x2++;
  270. y2++;
  271. }
  272. v2[k2Offset] = x2;
  273. if (x2 > text1Length)
  274. {
  275. k2End += 2;
  276. }
  277. else if (y2 > text2Length)
  278. {
  279. k2Start += 2;
  280. }
  281. else if (!front)
  282. {
  283. var k1Offset = vOffset + delta - k2;
  284. if (k1Offset >= 0 && k1Offset < vLength && v1[k1Offset] != -1)
  285. {
  286. var x1 = v1[k1Offset];
  287. var y1 = vOffset + x1 - k1Offset;
  288. x2 = text1Length - v2[k2Offset];
  289. if (x1 >= x2)
  290. {
  291. return BisectSplit(text1, text2, x1, y1, optimizeForSpeed, token);
  292. }
  293. }
  294. }
  295. }
  296. }
  297. return new[]
  298. {
  299. TextDiffer.Delete(text1),
  300. TextDiffer.Insert(text2)
  301. };
  302. }
  303. /// <summary>
  304. /// 给定“中值”的位置,将diff分成两部分并递归。
  305. /// </summary>
  306. /// <param name="text1"></param>
  307. /// <param name="text2"></param>
  308. /// <param name="x">文本1的中值位置.</param>
  309. /// <param name="y">文本2的中值位置.</param>
  310. /// <param name="optimized"></param>
  311. /// <param name="token"></param>
  312. /// <returns></returns>
  313. private static IEnumerable<TextDiffer> BisectSplit(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, int x, int y, bool optimized, CancellationToken token)
  314. {
  315. var text1A = text1[..x];
  316. var text2A = text2[..y];
  317. var text1B = text1[x..];
  318. var text2B = text2[y..];
  319. var diffsa = Compute(text1A, text2A, false, optimized, token);
  320. var diffsb = Compute(text1B, text2B, false, optimized, token);
  321. return diffsa.Concat(diffsb);
  322. }
  323. }