DiffAlgorithm.cs 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. using System.Text;
  2. using static Masuit.Tools.TextDiff.DiffOperation;
  3. namespace Masuit.Tools.TextDiff;
  4. internal static class DiffAlgorithm
  5. {
  6. /// <summary>
  7. /// 找出两段文本之间的差异。通过在区分之前剥离文本中的常见前缀或后缀来简化
  8. /// </summary>
  9. /// <param name="text1"></param>
  10. /// <param name="text2"></param>
  11. /// <param name="checklines">加速标记。如果为false,则不要先运行行级差异来识别更改的区域。如果为true,则运行一个速度稍快但不太理想的差异</param>
  12. /// <param name="optimized">启用优化</param>
  13. /// <param name="token"></param>
  14. /// <returns></returns>
  15. internal static IEnumerable<TextDiffer> Compute(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool checklines, bool optimized, CancellationToken token)
  16. {
  17. if (text1.Length == text2.Length && text1.Length == 0)
  18. {
  19. return [];
  20. }
  21. var commonlength = TextUtil.CommonPrefix(text1, text2);
  22. if (commonlength == text1.Length && commonlength == text2.Length)
  23. {
  24. return new List<TextDiffer>()
  25. {
  26. TextDiffer.Equal(text1)
  27. };
  28. }
  29. var commonprefix = text1.Slice(0, commonlength);
  30. text1 = text1[commonlength..];
  31. text2 = text2[commonlength..];
  32. commonlength = TextUtil.CommonSuffix(text1, text2);
  33. var commonsuffix = text1[^commonlength..];
  34. text1 = text1.Slice(0, text1.Length - commonlength);
  35. text2 = text2.Slice(0, text2.Length - commonlength);
  36. List<TextDiffer> diffs = new();
  37. if (commonprefix.Length != 0)
  38. {
  39. diffs.Insert(0, TextDiffer.Equal(commonprefix));
  40. }
  41. diffs.AddRange(ComputeCore(text1, text2, checklines, optimized, token));
  42. if (commonsuffix.Length != 0)
  43. {
  44. diffs.Add(TextDiffer.Equal(commonsuffix));
  45. }
  46. return diffs.CleanupMerge();
  47. }
  48. private static IEnumerable<TextDiffer> ComputeCore(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool checklines, bool optimized, CancellationToken token)
  49. {
  50. if (text1.Length == 0)
  51. {
  52. return TextDiffer.Insert(text2).ItemAsEnumerable();
  53. }
  54. if (text2.Length == 0)
  55. {
  56. return TextDiffer.Delete(text1).ItemAsEnumerable();
  57. }
  58. var longtext = text1.Length > text2.Length ? text1 : text2;
  59. var shorttext = text1.Length > text2.Length ? text2 : text1;
  60. var i = longtext.IndexOf(shorttext, StringComparison.Ordinal);
  61. if (i != -1)
  62. {
  63. if (text1.Length > text2.Length)
  64. {
  65. return new[]
  66. {
  67. TextDiffer.Delete(longtext.Slice(0, i)),
  68. TextDiffer.Equal(shorttext),
  69. TextDiffer.Delete(longtext[(i + shorttext.Length)..])
  70. };
  71. }
  72. return new[]
  73. {
  74. TextDiffer.Insert(longtext.Slice(0, i)),
  75. TextDiffer.Equal(shorttext),
  76. TextDiffer.Insert(longtext[(i + shorttext.Length)..])
  77. };
  78. }
  79. if (shorttext.Length == 1)
  80. {
  81. return new[]
  82. {
  83. TextDiffer.Delete(text1),
  84. TextDiffer.Insert(text2)
  85. };
  86. }
  87. if (optimized)
  88. {
  89. var result = TextUtil.HalfMatch(text1, text2);
  90. if (!result.IsEmpty)
  91. {
  92. var diffsA = Compute(result.Prefix1, result.Prefix2, checklines, true, token);
  93. var diffsB = Compute(result.Suffix1, result.Suffix2, checklines, true, token);
  94. return diffsA.Concat(TextDiffer.Equal(result.CommonMiddle)).Concat(diffsB);
  95. }
  96. }
  97. if (checklines && text1.Length > 100 && text2.Length > 100)
  98. {
  99. return DiffLines(text1, text2, optimized, token);
  100. }
  101. return MyersDiffBisect(text1, text2, optimized, token);
  102. }
  103. /// <summary>
  104. /// 对两个字符串进行快速的行级差分,然后重新划分部分以获得更高的精度。这种加速会产生非最小的差异。
  105. /// </summary>
  106. /// <param name="text1"></param>
  107. /// <param name="text2"></param>
  108. /// <param name="optimized"></param>
  109. /// <param name="token"></param>
  110. /// <returns></returns>
  111. private static List<TextDiffer> DiffLines(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool optimized, CancellationToken token)
  112. {
  113. var compressor = new StringCompressor();
  114. text1 = compressor.Compress(text1, char.MaxValue * 2 / 3);
  115. text2 = compressor.Compress(text2);
  116. var diffs = Compute(text1, text2, false, optimized, token).Select(diff => diff.Replace(compressor.Decompress(diff.Text))).ToList().CleanupSemantic();
  117. return RediffAfterDiffLines(diffs, optimized, token).ToList();
  118. }
  119. /// <summary>
  120. /// 逐个字符清除所有替换块
  121. /// </summary>
  122. /// <param name="diffs"></param>
  123. /// <param name="optimized"></param>
  124. /// <param name="token"></param>
  125. /// <returns></returns>
  126. private static IEnumerable<TextDiffer> RediffAfterDiffLines(IEnumerable<TextDiffer> diffs, bool optimized, CancellationToken token)
  127. {
  128. var ins = new StringBuilder();
  129. var del = new StringBuilder();
  130. foreach (var diff in diffs.Concat(TextDiffer.Empty))
  131. {
  132. (ins, del) = diff.Operation switch
  133. {
  134. Insert => (ins.Append(diff.Text), del),
  135. Delete => (ins, del.Append(diff.Text)),
  136. _ => (ins, del)
  137. };
  138. if (diff.Operation != Equal)
  139. {
  140. continue;
  141. }
  142. var consolidatedDiffsBeforeEqual = diff.Operation switch
  143. {
  144. Equal when ins.Length > 0 && del.Length > 0 => Compute(del.ToString(), ins.ToString(), false, optimized, token),
  145. Equal when del.Length > 0 => TextDiffer.Delete(del.ToString()).ItemAsEnumerable(),
  146. Equal when ins.Length > 0 => TextDiffer.Insert(ins.ToString()).ItemAsEnumerable(),
  147. _ => []
  148. };
  149. foreach (var d in consolidatedDiffsBeforeEqual)
  150. {
  151. yield return d;
  152. }
  153. if (!diff.IsEmpty)
  154. yield return diff;
  155. ins.Clear();
  156. del.Clear();
  157. }
  158. }
  159. /// <summary>
  160. /// 找到diff的“中间值”,将问题一分为二,并返回递归构造的diff。
  161. /// </summary>
  162. /// <param name="text1"></param>
  163. /// <param name="text2"></param>
  164. /// <param name="optimizeForSpeed"></param>
  165. /// <param name="token"></param>
  166. /// <returns></returns>
  167. internal static IEnumerable<TextDiffer> MyersDiffBisect(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, bool optimizeForSpeed, CancellationToken token)
  168. {
  169. var text1Length = text1.Length;
  170. var text2Length = text2.Length;
  171. var maxD = (text1Length + text2Length + 1) / 2;
  172. var vOffset = maxD;
  173. var vLength = 2 * maxD;
  174. var v1 = new int[vLength];
  175. var v2 = new int[vLength];
  176. for (var x = 0; x < vLength; x++)
  177. {
  178. v1[x] = -1;
  179. }
  180. for (var x = 0; x < vLength; x++)
  181. {
  182. v2[x] = -1;
  183. }
  184. v1[vOffset + 1] = 0;
  185. v2[vOffset + 1] = 0;
  186. var delta = text1Length - text2Length;
  187. var front = delta % 2 != 0;
  188. var k1Start = 0;
  189. var k1End = 0;
  190. var k2Start = 0;
  191. var k2End = 0;
  192. for (var d = 0; d < maxD; d++)
  193. {
  194. if (token.IsCancellationRequested)
  195. {
  196. break;
  197. }
  198. for (var k1 = -d + k1Start; k1 <= d - k1End; k1 += 2)
  199. {
  200. var k1Offset = vOffset + k1;
  201. int x1;
  202. if (k1 == -d || k1 != d && v1[k1Offset - 1] < v1[k1Offset + 1])
  203. {
  204. x1 = v1[k1Offset + 1];
  205. }
  206. else
  207. {
  208. x1 = v1[k1Offset - 1] + 1;
  209. }
  210. var y1 = x1 - k1;
  211. while (x1 < text1Length && y1 < text2Length && text1[x1] == text2[y1])
  212. {
  213. x1++;
  214. y1++;
  215. }
  216. v1[k1Offset] = x1;
  217. if (x1 > text1Length)
  218. {
  219. k1End += 2;
  220. }
  221. else if (y1 > text2Length)
  222. {
  223. k1Start += 2;
  224. }
  225. else if (front)
  226. {
  227. var k2Offset = vOffset + delta - k1;
  228. if (k2Offset >= 0 && k2Offset < vLength && v2[k2Offset] != -1)
  229. {
  230. var x2 = text1Length - v2[k2Offset];
  231. if (x1 >= x2)
  232. {
  233. return BisectSplit(text1, text2, x1, y1, optimizeForSpeed, token);
  234. }
  235. }
  236. }
  237. }
  238. for (var k2 = -d + k2Start; k2 <= d - k2End; k2 += 2)
  239. {
  240. var k2Offset = vOffset + k2;
  241. int x2;
  242. if (k2 == -d || k2 != d && v2[k2Offset - 1] < v2[k2Offset + 1])
  243. {
  244. x2 = v2[k2Offset + 1];
  245. }
  246. else
  247. {
  248. x2 = v2[k2Offset - 1] + 1;
  249. }
  250. var y2 = x2 - k2;
  251. while (x2 < text1Length && y2 < text2Length && text1[text1Length - x2 - 1] == text2[text2Length - y2 - 1])
  252. {
  253. x2++;
  254. y2++;
  255. }
  256. v2[k2Offset] = x2;
  257. if (x2 > text1Length)
  258. {
  259. k2End += 2;
  260. }
  261. else if (y2 > text2Length)
  262. {
  263. k2Start += 2;
  264. }
  265. else if (!front)
  266. {
  267. var k1Offset = vOffset + delta - k2;
  268. if (k1Offset >= 0 && k1Offset < vLength && v1[k1Offset] != -1)
  269. {
  270. var x1 = v1[k1Offset];
  271. var y1 = vOffset + x1 - k1Offset;
  272. x2 = text1Length - v2[k2Offset];
  273. if (x1 >= x2)
  274. {
  275. return BisectSplit(text1, text2, x1, y1, optimizeForSpeed, token);
  276. }
  277. }
  278. }
  279. }
  280. }
  281. return new[]
  282. {
  283. TextDiffer.Delete(text1),
  284. TextDiffer.Insert(text2)
  285. };
  286. }
  287. /// <summary>
  288. /// 给定“中值”的位置,将diff分成两部分并递归。
  289. /// </summary>
  290. /// <param name="text1"></param>
  291. /// <param name="text2"></param>
  292. /// <param name="x">文本1的中值位置.</param>
  293. /// <param name="y">文本2的中值位置.</param>
  294. /// <param name="optimized"></param>
  295. /// <param name="token"></param>
  296. /// <returns></returns>
  297. private static IEnumerable<TextDiffer> BisectSplit(ReadOnlySpan<char> text1, ReadOnlySpan<char> text2, int x, int y, bool optimized, CancellationToken token)
  298. {
  299. var text1A = text1[..x];
  300. var text2A = text2[..y];
  301. var text1B = text1[x..];
  302. var text2B = text2[y..];
  303. var diffsa = Compute(text1A, text2A, false, optimized, token);
  304. var diffsb = Compute(text1B, text2B, false, optimized, token);
  305. return diffsa.Concat(diffsb);
  306. }
  307. }