using System.Collections.Immutable;
using System.Text;
using System.Text.RegularExpressions;
using static Masuit.Tools.TextDiff.DiffOperation;
namespace Masuit.Tools.TextDiff;
public static class DiffExtension
{
///
/// 还原文本1
///
///
///
public static string Text1(this IEnumerable diffs) => diffs.Where(d => d.Operation != Insert).Aggregate(new StringBuilder(), (sb, diff) => sb.Append(diff.Text)).ToString();
///
/// 还原文本2
///
///
///
public static string Text2(this IEnumerable diffs) => diffs.Where(d => d.Operation != Delete).Aggregate(new StringBuilder(), (sb, diff) => sb.Append(diff.Text)).ToString();
private readonly record struct LevenshteinState(int Insertions, int Deletions, int Levenshtein)
{
public LevenshteinState Consolidate() => new(0, 0, Levenshtein + Math.Max(Insertions, Deletions));
}
///
/// 计算Levenshtein距离;插入、删除或替换的字符数
///
///
///
internal static int Levenshtein(this IEnumerable diffs)
{
var state = new LevenshteinState(0, 0, 0);
state = diffs.Aggregate(state, (current, aDiff) => aDiff.Operation switch
{
Insert => current with
{
Insertions = current.Insertions + aDiff.Text.Length
},
Delete => current with
{
Deletions = current.Deletions + aDiff.Text.Length
},
Equal => current.Consolidate(),
_ => throw new IndexOutOfRangeException()
});
return state.Consolidate().Levenshtein;
}
private static char ToDelta(this DiffOperation o) => o switch
{
Delete => '-',
Insert => '+',
Equal => '=',
_ => throw new ArgumentException($"Unknown Operation: {o}")
};
///
/// 将diff压缩成一个编码字符串,该字符串描述了将text1转换为text2所需的操作。例如,=3\t-2\t+ing->:保留3个字符,删除2个字符,插入“ing”。操作以制表符分隔。插入的文本使用%xx符号转义。
///
///
///
public static string ToDelta(this IEnumerable diffs)
{
var s = from aDiff in diffs
let sign = aDiff.Operation.ToDelta()
let textToAppend = aDiff.Operation == Insert ? aDiff.Text.UrlEncoded() : aDiff.Text.Length.ToString()
select string.Concat(sign, textToAppend);
return s.Join("\t");
}
private static DiffOperation FromDelta(char c) => c switch
{
'-' => Delete,
'+' => Insert,
'=' => Equal,
_ => throw new ArgumentException($"Invalid Delta Token: {c}")
};
///
/// 从差异字符串中恢复diff对象
///
///
///
///
public static IEnumerable FromDelta(this string text1, string delta)
{
var pointer = 0;
foreach (var token in delta.SplitBy('\t'))
{
if (token.Length == 0)
{
continue;
}
var param = token[1..];
var operation = FromDelta(token[0]);
int n = 0;
if (operation != Insert)
{
if (!int.TryParse(param, out n))
{
throw new ArgumentException($"Invalid number in Diff.FromDelta: {param}");
}
if (pointer > text1.Length - n)
{
throw new ArgumentException($"Delta length ({pointer}) larger than source text length ({text1.Length}).");
}
}
(var text, pointer) = operation switch
{
Insert => (param.Replace("+", "%2b").UrlDecoded(), pointer),
Equal => (text1.Substring(pointer, n), pointer + n),
Delete => (text1.Substring(pointer, n), pointer + n),
_ => throw new ArgumentException($"Unknown Operation: {operation}")
};
yield return TextDiffer.Create(operation, text);
}
if (pointer != text1.Length)
{
throw new ArgumentException($"Delta length ({pointer}) smaller than source text length ({text1.Length}).");
}
}
internal static IEnumerable CleanupMergePass1(this IEnumerable diffs)
{
var sbDelete = new StringBuilder();
var sbInsert = new StringBuilder();
var lastEquality = TextDiffer.Empty;
using var enumerator = diffs.Concat(TextDiffer.Empty).GetEnumerator();
while (enumerator.MoveNext())
{
var diff = enumerator.Current;
(sbInsert, sbDelete) = diff.Operation switch
{
Insert => (sbInsert.Append(diff.Text), sbDelete),
Delete => (sbInsert, sbDelete.Append(diff.Text)),
_ => (sbInsert, sbDelete)
};
if (diff.Operation == Equal)
{
if (sbInsert.Length > 0 || sbDelete.Length > 0)
{
var prefixLength = TextUtil.CommonPrefix(sbInsert, sbDelete);
if (prefixLength > 0)
{
var commonprefix = sbInsert.ToString(0, prefixLength);
sbInsert.Remove(0, prefixLength);
sbDelete.Remove(0, prefixLength);
lastEquality = lastEquality.Append(commonprefix);
}
var suffixLength = TextUtil.CommonSuffix(sbInsert, sbDelete);
if (suffixLength > 0)
{
var commonsuffix = sbInsert.ToString(sbInsert.Length - suffixLength, suffixLength);
sbInsert.Remove(sbInsert.Length - suffixLength, suffixLength);
sbDelete.Remove(sbDelete.Length - suffixLength, suffixLength);
diff = diff.Prepend(commonsuffix);
}
if (!lastEquality.IsEmpty)
{
yield return lastEquality;
}
if (sbDelete.Length > 0) yield return TextDiffer.Delete(sbDelete.ToString());
if (sbInsert.Length > 0) yield return TextDiffer.Insert(sbInsert.ToString());
lastEquality = diff;
sbDelete.Clear();
sbInsert.Clear();
}
else
{
lastEquality = lastEquality.Append(diff.Text);
}
}
}
if (!lastEquality.IsEmpty)
{
yield return lastEquality;
}
}
internal static IEnumerable CleanupMergePass2(this IEnumerable input, out bool haschanges)
{
haschanges = false;
var diffs = input.ToList();
for (var i = 1; i < diffs.Count - 1; i++)
{
var previous = diffs[i - 1];
var current = diffs[i];
var next = diffs[i + 1];
if (previous.Operation == Equal && next.Operation == Equal)
{
var currentSpan = current.Text.AsSpan();
var previousSpan = previous.Text.AsSpan();
var nextSpan = next.Text.AsSpan();
if (currentSpan.Length >= previousSpan.Length && currentSpan[^previousSpan.Length..].SequenceEqual(previousSpan))
{
var text = previous.Text + current.Text[..^previous.Text.Length];
diffs[i] = current.Replace(text);
diffs[i + 1] = next.Replace(previous.Text + next.Text);
diffs.Splice(i - 1, 1);
haschanges = true;
}
else if (currentSpan.Length >= nextSpan.Length && currentSpan[..nextSpan.Length].SequenceEqual(nextSpan))
{
diffs[i - 1] = previous.Replace(previous.Text + next.Text);
diffs[i] = current.Replace(current.Text[next.Text.Length..] + next.Text);
diffs.Splice(i + 1, 1);
haschanges = true;
}
}
}
return diffs;
}
internal static IEnumerable CleanupMerge(this IEnumerable diffs)
{
bool changes;
do
{
diffs = diffs.CleanupMergePass1().CleanupMergePass2(out changes).ToList(); // required to detect if anything changed
} while (changes);
return diffs;
}
readonly record struct EditBetweenEqualities(string Equality1, string Edit, string Equality2)
{
public int Score => DiffCleanupSemanticScore(Equality1, Edit) + DiffCleanupSemanticScore(Edit, Equality2);
private readonly record struct ScoreHelper(string Str, Index I, Regex Regex)
{
char C => Str[I];
public bool IsEmpty => Str.Length == 0;
public bool NonAlphaNumeric => !char.IsLetterOrDigit(C);
public bool IsWhitespace => char.IsWhiteSpace(C);
public bool IsLineBreak => C == '\n' || C == '\r';
public bool IsBlankLine => IsLineBreak && Regex.IsMatch(Str);
}
/// 给定两个字符串,计算一个分数,表示内部边界是否落在逻辑边界上
private static int DiffCleanupSemanticScore(string one, string two) => (h1: new ScoreHelper(one, ^1, BlankLineEnd), h2: new ScoreHelper(two, 0, BlankLineStart)) switch
{
{ h1.IsEmpty: true } or { h2.IsEmpty: true } => 6,
{ h1.IsBlankLine: true } or { h2.IsBlankLine: true } => 5,
{ h1.IsLineBreak: true } or { h2.IsLineBreak: true } => 4,
{ h1.NonAlphaNumeric: true } and { h1.IsWhitespace: false } and { h2.IsWhitespace: true } => 3,
{ h1.IsWhitespace: true } or { h2.IsWhitespace: true } => 2,
{ h1.NonAlphaNumeric: true } or { h2.NonAlphaNumeric: true } => 1,
_ => 0
};
// 将编辑尽可能向左移动
public EditBetweenEqualities ShiftLeft()
{
var commonOffset = TextUtil.CommonSuffix(Equality1, Edit);
if (commonOffset > 0)
{
var commonString = Edit[^commonOffset..];
var equality1 = Equality1[..^commonOffset];
var edit = commonString + Edit[..^commonOffset];
var equality2 = commonString + Equality2;
return new EditBetweenEqualities(Equality1: equality1, Edit: edit, Equality2: equality2);
}
return this;
}
private EditBetweenEqualities ShiftRight() => new(Equality1: Equality1 + Edit[0], Edit: Edit[1..] + Equality2[0], Equality2: Equality2[1..]);
public IEnumerable TraverseRight()
{
var item = this;
while (item.Edit.Length != 0 && item.Equality2.Length != 0 && item.Edit[0] == item.Equality2[0])
{
yield return item = item.ShiftRight();
}
}
public IEnumerable ToDiffs(DiffOperation edit)
{
yield return TextDiffer.Equal(Equality1);
yield return TextDiffer.Create(edit, Edit);
yield return TextDiffer.Equal(Equality2);
}
}
///
/// 寻找两侧被等式包围的单个编辑,等式可以侧向移动,将编辑与单词边界对齐。
/// e.g: The cat came. -> The cat came.
///
///
internal static IEnumerable CleanupSemanticLossless(this IEnumerable diffs)
{
using var enumerator = diffs.GetEnumerator();
if (!enumerator.MoveNext())
{
yield break;
}
var previous = enumerator.Current;
if (!enumerator.MoveNext())
{
yield return previous;
yield break;
}
var current = enumerator.Current;
while (true)
{
if (!enumerator.MoveNext())
{
yield return previous;
yield return current;
yield break;
}
var next = enumerator.Current;
if (previous.Operation == Equal && next.Operation == Equal)
{
var item = new EditBetweenEqualities(previous.Text, current.Text, next.Text).ShiftLeft();
var best = item.TraverseRight().Aggregate(item, (best, x) => best.Score > x.Score ? best : x);
if (previous.Text != best.Equality1)
{
foreach (var d in best.ToDiffs(current.Operation).Where(d => !d.IsEmpty))
{
yield return d;
}
if (!enumerator.MoveNext())
{
yield break;
}
previous = current;
current = next;
next = enumerator.Current;
}
else
{
yield return previous;
}
}
else
{
yield return previous;
}
previous = current;
current = next;
}
}
private static readonly Regex BlankLineEnd = new("\\n\\r?\\n\\Z", RegexOptions.Compiled);
private static readonly Regex BlankLineStart = new("\\A\\r?\\n\\r?\\n", RegexOptions.Compiled);
///
/// 从效率上清理一些无用的差异对象
///
///
///
internal static IEnumerable CleanupEfficiency(this IEnumerable input, short diffEditCost = 4)
{
var diffs = input.ToList();
var changes = false;
var equalities = new Stack();
var lastEquality = string.Empty;
var insertionBeforeLastEquality = false;
var deletionBeforeLastEquality = false;
var insertionAfterLastEquality = false;
var deletionAfterLastEquality = false;
for (var i = 0; i < diffs.Count; i++)
{
var diff = diffs[i];
if (diff.Operation == Equal)
{
if (diff.Text.Length < diffEditCost && (insertionAfterLastEquality || deletionAfterLastEquality))
{
equalities.Push(i);
(insertionBeforeLastEquality, deletionBeforeLastEquality) = (insertionAfterLastEquality, deletionAfterLastEquality);
lastEquality = diff.Text;
}
else
{
equalities.Clear();
lastEquality = string.Empty;
}
insertionAfterLastEquality = deletionAfterLastEquality = false;
}
else
{
if (diff.Operation == Delete)
{
deletionAfterLastEquality = true;
}
else
{
insertionAfterLastEquality = true;
}
/*
* 分割以下几种情况:
* ABXYCD
* AXCD
* ABXC
* AXCD
* ABXC
*/
if ((lastEquality.Length != 0) && ((insertionBeforeLastEquality && deletionBeforeLastEquality && insertionAfterLastEquality && deletionAfterLastEquality) || ((lastEquality.Length < diffEditCost / 2) && (insertionBeforeLastEquality ? 1 : 0) + (deletionBeforeLastEquality ? 1 : 0) + (insertionAfterLastEquality ? 1 : 0) + (deletionAfterLastEquality ? 1 : 0) == 3)))
{
diffs.Splice(equalities.Peek(), 1, TextDiffer.Delete(lastEquality), TextDiffer.Insert(lastEquality));
equalities.Pop();
lastEquality = string.Empty;
if (insertionBeforeLastEquality && deletionBeforeLastEquality)
{
insertionAfterLastEquality = deletionAfterLastEquality = true;
equalities.Clear();
}
else
{
if (equalities.Count > 0)
{
equalities.Pop();
}
i = equalities.Count > 0 ? equalities.Peek() : -1;
insertionAfterLastEquality = deletionAfterLastEquality = false;
}
changes = true;
}
}
}
if (changes)
{
return diffs.CleanupMerge();
}
return input;
}
///
/// 两个不相关文本的差异可以用巧合的匹配来填充。例如,“mouse”和“sofas”的区别是
/// `[(-1, "m"), (1, "s"), (0, "o"), (-1, "u"), (1, "fa"), (0, "s"), (-1, "e")]`.
/// 虽然这是最佳差异,但人类很难理解。语义清理重写了差异,将其扩展为更易于理解的格式。上述示例将变为: `[(-1, "mouse"), (1, "sofas")]`.
///
///
///
public static IImmutableList MakeHumanReadable(this IEnumerable input) => input.CleanupSemantic().ToImmutableList();
///
/// 此函数类似于“OptimizeForReadability”,除了它不是将差异优化为人类可读,而是优化差异以提高机器处理的效率。两种清理类型的结果通常是相同的。CleanupEfficiency就是基于这样的,即由大量小差异编辑组成的差异可能需要更长的时间来处理(在下游应用程序中),或者需要比少量较大差异更多的存储或传输容量。
///
///
/// 处理新编辑的成本,即处理现有编辑中的额外字符。默认值为4,这意味着如果将差异的长度扩展三个字符可以消除一次编辑,那么这种优化将降低总成本
///
public static IImmutableList OptimizeForMachineProcessing(this IEnumerable input, short diffEditCost = 4) => input.CleanupEfficiency(diffEditCost).ToImmutableList();
///
/// 从语法上清理一些无用的差异对象
///
///
internal static List CleanupSemantic(this IEnumerable input)
{
var diffs = input.ToList();
var equalities = new Stack();
string lastEquality = null;
var pointer = 0;
var lengthInsertions1 = 0;
var lengthDeletions1 = 0;
var lengthInsertions2 = 0;
var lengthDeletions2 = 0;
while (pointer < diffs.Count)
{
if (diffs[pointer].Operation == Equal)
{
equalities.Push(pointer);
lengthInsertions1 = lengthInsertions2;
lengthDeletions1 = lengthDeletions2;
lengthInsertions2 = 0;
lengthDeletions2 = 0;
lastEquality = diffs[pointer].Text;
}
else
{
if (diffs[pointer].Operation == Insert)
{
lengthInsertions2 += diffs[pointer].Text.Length;
}
else
{
lengthDeletions2 += diffs[pointer].Text.Length;
}
if (lastEquality != null && (lastEquality.Length <= Math.Max(lengthInsertions1, lengthDeletions1)) && (lastEquality.Length <= Math.Max(lengthInsertions2, lengthDeletions2)))
{
diffs.Splice(equalities.Peek(), 1, TextDiffer.Delete(lastEquality), TextDiffer.Insert(lastEquality));
equalities.Pop();
if (equalities.Count > 0)
{
equalities.Pop();
}
pointer = equalities.Count > 0 ? equalities.Peek() : -1;
lengthInsertions1 = 0; // Reset the counters.
lengthDeletions1 = 0;
lengthInsertions2 = 0;
lengthDeletions2 = 0;
lastEquality = null;
}
}
pointer++;
}
diffs = diffs.CleanupMerge().CleanupSemanticLossless().ToList();
// 查找删除和插入之间的重叠.
// e.g: abcxxxxxxdef
// -> abcxxxdef
// e.g: xxxabcdefxxx
// -> defxxxabc
// 只有当重叠与前面或后面的编辑一样大时,才提取重叠
pointer = 1;
while (pointer < diffs.Count)
{
if (diffs[pointer - 1].Operation == Delete && diffs[pointer].Operation == Insert)
{
var deletion = diffs[pointer - 1].Text.AsSpan();
var insertion = diffs[pointer].Text.AsSpan();
var overlapLength1 = TextUtil.CommonOverlap(deletion, insertion);
var overlapLength2 = TextUtil.CommonOverlap(insertion, deletion);
var minLength = Math.Min(deletion.Length, insertion.Length);
TextDiffer[] newdiffs = null;
if ((overlapLength1 >= overlapLength2) && (overlapLength1 >= minLength / 2.0))
{
newdiffs = new[]
{
TextDiffer.Delete(deletion.Slice(0, deletion.Length - overlapLength1).ToArray()),
TextDiffer.Equal(insertion.Slice(0, overlapLength1).ToArray()),
TextDiffer.Insert(insertion[overlapLength1..].ToArray())
};
}
else if ((overlapLength2 >= overlapLength1) && overlapLength2 >= minLength / 2.0)
{
newdiffs = new[]
{
TextDiffer.Insert(insertion.Slice(0, insertion.Length - overlapLength2)),
TextDiffer.Equal(deletion.Slice(0, overlapLength2)),
TextDiffer.Delete(deletion[overlapLength2..])
};
}
if (newdiffs != null)
{
diffs.Splice(pointer - 1, 2, newdiffs);
pointer++;
}
pointer++;
}
pointer++;
}
return diffs;
}
///
/// 计算并返回目标文本中的等效位置
///
///
///
///
internal static int FindEquivalentLocation2(this IEnumerable diffs, int location1)
{
var chars1 = 0;
var chars2 = 0;
var lastChars1 = 0;
var lastChars2 = 0;
var lastDiff = TextDiffer.Empty;
foreach (var aDiff in diffs)
{
if (aDiff.Operation != Insert)
{
chars1 += aDiff.Text.Length;
}
if (aDiff.Operation != Delete)
{
chars2 += aDiff.Text.Length;
}
if (chars1 > location1)
{
lastDiff = aDiff;
break;
}
lastChars1 = chars1;
lastChars2 = chars2;
}
if (lastDiff.Operation == Delete)
{
return lastChars2;
}
return lastChars2 + (location1 - lastChars1);
}
}