// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
// Ported from: https://github.com/SixLabors/Fonts/

using System;
using System.Collections.Generic;

namespace Avalonia.Media.TextFormatting.Unicode
{
    /// <summary>
    /// Implementation of the Unicode Line Break Algorithm. UAX:14
    /// </summary>
    /// <remarks>
    /// Each successful call to <see cref="MoveNext"/> publishes one break in <see cref="Current"/>.
    /// The enumerator is a forward-only state machine: it keeps the class of the character before
    /// the candidate break (<c>_currentClass</c>), the class of the character after it
    /// (<c>_nextClass</c>) and a handful of flags for rules the pair table alone cannot express.
    /// </remarks>
    public ref struct LineBreakEnumerator
    {
        // NOTE(review): the element type is assumed to be char (UTF-16 code units consumed by
        // Codepoint.ReadAt) - confirm against Codepoint.ReadAt's signature.
        private readonly IReadOnlyList<char> _text;

        // Index of the next unread element of _text.
        private int _position;

        // Index at which the most recently read code point starts; this is the candidate break position.
        private int _lastPosition;

        // Resolved class on the left-hand side of the candidate break.
        private LineBreakClass _currentClass;

        // Resolved class of the most recently read code point (right-hand side of the candidate break).
        private LineBreakClass _nextClass;

        // True until the first code point has been read by MoveNext.
        private bool _first;

        // Length of the current run of alphanumerics plus trailing combining marks (feeds _lb30).
        private int _alphaNumericCount;

        // True when the previously read class was ZWJ; suppresses the next pair-table break.
        private bool _lb8a;

        // True when the previous left-hand class was HebrewLetter; suppresses a break after Hyphen/BreakAfter.
        private bool _lb21a;

        // Combining-mark exception that re-allows a break before Inseparable.
        private bool _lb22ex;

        // Exception flags that turn an indirect break into a break for LB24 / LB25.
        private bool _lb24ex;
        private bool _lb25ex;

        // True when an alphanumeric run is followed by an opening punctuation other than ( [ {.
        private bool _lb30;

        // Number of consecutive RegionalIndicator left-hand classes seen so far.
        private int _lb30a;

        // Exception flag that turns an indirect break before OpenPunctuation into a break.
        private bool _lb31;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="text"/>.
        /// </summary>
        /// <param name="text">The text to scan for line breaks.</param>
        public LineBreakEnumerator(IReadOnlyList<char> text)
            : this()
        {
            _text = text;
            _position = 0;
            _currentClass = LineBreakClass.Unknown;
            _nextClass = LineBreakClass.Unknown;
            _first = true;
            _lb8a = false;
            _lb21a = false;
            _lb22ex = false;
            _lb24ex = false;
            _lb25ex = false;
            _alphaNumericCount = 0;
            _lb31 = false;
            _lb30 = false;
            _lb30a = 0;
        }

        /// <summary>
        /// Gets the break found by the last successful <see cref="MoveNext"/> call.
        /// Reset to <c>default</c> once <see cref="MoveNext"/> returns <c>false</c>.
        /// </summary>
        public LineBreak Current { get; private set; }

        /// <summary>
        /// Advances to the next line break.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a break was found and stored in <see cref="Current"/>;
        /// <c>false</c> when the text is exhausted.
        /// </returns>
        public bool MoveNext()
        {
            // Get the first char if we're at the beginning of the string.
            if (_first)
            {
                var firstClass = NextCharClass();
                _first = false;
                _currentClass = MapFirst(firstClass);
                _nextClass = firstClass;
                _lb8a = firstClass == LineBreakClass.ZWJ;
                _lb30a = 0;
            }

            while (_position < _text.Count)
            {
                _lastPosition = _position;
                var lastClass = _nextClass;
                _nextClass = NextCharClass();

                // Explicit newline
                switch (_currentClass)
                {
                    case LineBreakClass.MandatoryBreak:
                    case LineBreakClass.CarriageReturn when _nextClass != LineBreakClass.LineFeed:
                    {
                        _currentClass = MapFirst(_nextClass);
                        Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition, true);
                        return true;
                    }
                }

                // The simple rules win; only when they have no opinion is the pair table consulted.
                var shouldBreak = GetSimpleBreak() ?? GetPairTableBreak(lastClass);

                // Rule LB8a
                _lb8a = _nextClass == LineBreakClass.ZWJ;

                if (shouldBreak)
                {
                    Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition);
                    return true;
                }
            }

            if (_position >= _text.Count)
            {
                // Emit the final break at the end of the text exactly once.
                if (_lastPosition < _text.Count)
                {
                    _lastPosition = _text.Count;

                    var required = false;

                    switch (_currentClass)
                    {
                        case LineBreakClass.MandatoryBreak:
                        case LineBreakClass.CarriageReturn when _nextClass != LineBreakClass.LineFeed:
                            required = true;
                            break;
                    }

                    Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition, required);
                    return true;
                }
            }

            Current = default;
            return false;
        }

        /// <summary>
        /// Resolves the raw line break class of <paramref name="cp"/> to the class used by the rules (LB1).
        /// </summary>
        private static LineBreakClass MapClass(Codepoint cp)
        {
            // NOTE(review): U+50005 is forced to Alphabetic; presumably an override needed by the
            // conformance test data - confirm.
            if (cp.Value == 0x50005)
            {
                return LineBreakClass.Alphabetic;
            }

            // LB 1
            // ==========================================
            // Resolved Original    General_Category
            // ==========================================
            // AL       AI, SG, XX  Any
            // CM       SA          Only Mn or Mc
            // AL       SA          Any except Mn and Mc
            // NS       CJ          Any
            switch (cp.LineBreakClass)
            {
                case LineBreakClass.Ambiguous:
                case LineBreakClass.Surrogate:
                case LineBreakClass.Unknown:
                    return LineBreakClass.Alphabetic;

                case LineBreakClass.ComplexContext:
                    return cp.GeneralCategory == GeneralCategory.NonspacingMark ||
                           cp.GeneralCategory == GeneralCategory.SpacingMark
                        ? LineBreakClass.CombiningMark
                        : LineBreakClass.Alphabetic;

                case LineBreakClass.ConditionalJapaneseStarter:
                    return LineBreakClass.Nonstarter;

                default:
                    return cp.LineBreakClass;
            }
        }

        /// <summary>
        /// Maps the class of the very first code point (or of the code point following an explicit
        /// newline) to the class used as the left-hand side of the next pair lookup.
        /// </summary>
        private static LineBreakClass MapFirst(LineBreakClass c)
        {
            switch (c)
            {
                case LineBreakClass.LineFeed:
                case LineBreakClass.NextLine:
                    return LineBreakClass.MandatoryBreak;

                case LineBreakClass.Space:
                    return LineBreakClass.WordJoiner;

                default:
                    return c;
            }
        }

        /// <summary>
        /// Returns <c>true</c> for Alphabetic, HebrewLetter and Numeric.
        /// </summary>
        private static bool IsAlphaNumeric(LineBreakClass cls)
            => cls == LineBreakClass.Alphabetic
            || cls == LineBreakClass.HebrewLetter
            || cls == LineBreakClass.Numeric;

        /// <summary>
        /// Returns the resolved class of the code point at the current position without consuming it.
        /// </summary>
        private LineBreakClass PeekNextCharClass()
        {
            var cp = Codepoint.ReadAt(_text, _position, out _);

            return MapClass(cp);
        }

        /// <summary>
        /// Consumes the code point at the current position, updates the rule-exception flags that
        /// depend on it, and returns its resolved class.
        /// </summary>
        private LineBreakClass NextCharClass()
        {
            var cp = Codepoint.ReadAt(_text, _position, out var count);
            var cls = MapClass(cp);
            _position += count;

            // Keep track of alphanumeric + any combining marks.
            // This is used for LB22 and LB30.
            if (IsAlphaNumeric(_currentClass) ||
                (_alphaNumericCount > 0 && cls == LineBreakClass.CombiningMark))
            {
                _alphaNumericCount++;
            }

            // Track combining mark exceptions. LB22
            if (cls == LineBreakClass.CombiningMark)
            {
                switch (_currentClass)
                {
                    case LineBreakClass.MandatoryBreak:
                    case LineBreakClass.ContingentBreak:
                    case LineBreakClass.Exclamation:
                    case LineBreakClass.LineFeed:
                    case LineBreakClass.NextLine:
                    case LineBreakClass.Space:
                    case LineBreakClass.ZWSpace:
                    case LineBreakClass.CarriageReturn:
                        _lb22ex = true;
                        break;
                }
            }

            // Track combining mark exceptions. LB31
            if (_first && cls == LineBreakClass.CombiningMark)
            {
                _lb31 = true;
            }

            if (cls == LineBreakClass.CombiningMark)
            {
                switch (_currentClass)
                {
                    case LineBreakClass.MandatoryBreak:
                    case LineBreakClass.ContingentBreak:
                    case LineBreakClass.Exclamation:
                    case LineBreakClass.LineFeed:
                    case LineBreakClass.NextLine:
                    case LineBreakClass.Space:
                    case LineBreakClass.ZWSpace:
                    case LineBreakClass.CarriageReturn:
                    case LineBreakClass.ZWJ:
                        _lb31 = true;
                        break;
                }
            }

            if (_first &&
                (cls == LineBreakClass.PostfixNumeric ||
                 cls == LineBreakClass.PrefixNumeric ||
                 cls == LineBreakClass.Space))
            {
                _lb31 = true;
            }

            if (_currentClass == LineBreakClass.Alphabetic &&
                (cls == LineBreakClass.PostfixNumeric ||
                 cls == LineBreakClass.PrefixNumeric ||
                 cls == LineBreakClass.Space))
            {
                _lb31 = true;
            }

            // Reset LB31 if next is U+0028 (Left Opening Parenthesis)
            if (_lb31 &&
                _currentClass != LineBreakClass.PostfixNumeric &&
                _currentClass != LineBreakClass.PrefixNumeric &&
                cls == LineBreakClass.OpenPunctuation &&
                cp.Value == 0x0028)
            {
                _lb31 = false;
            }

            // Rule LB24
            if (_first &&
                (cls == LineBreakClass.ClosePunctuation || cls == LineBreakClass.CloseParenthesis))
            {
                _lb24ex = true;
            }

            // Rule LB25
            if (_first &&
                (cls == LineBreakClass.ClosePunctuation ||
                 cls == LineBreakClass.InfixNumeric ||
                 cls == LineBreakClass.BreakSymbols))
            {
                _lb25ex = true;
            }

            if (cls == LineBreakClass.Space ||
                cls == LineBreakClass.WordJoiner ||
                cls == LineBreakClass.Alphabetic)
            {
                // Look one code point ahead; _position already points past the one just consumed.
                var next = PeekNextCharClass();

                if (next == LineBreakClass.ClosePunctuation ||
                    next == LineBreakClass.InfixNumeric ||
                    next == LineBreakClass.BreakSymbols)
                {
                    _lb25ex = true;
                }
            }

            // AlphaNumeric + and combining marks can break for OP except.
            // - U+0028 (Left Opening Parenthesis)
            // - U+005B (Opening Square Bracket)
            // - U+007B (Left Curly Bracket)
            // See custom columns|rules in the text pair table.
            // https://www.unicode.org/Public/13.0.0/ucd/auxiliary/LineBreakTest.html
            _lb30 = _alphaNumericCount > 0 &&
                    cls == LineBreakClass.OpenPunctuation &&
                    cp.Value != 0x0028 &&
                    cp.Value != 0x005B &&
                    cp.Value != 0x007B;

            return cls;
        }

        /// <summary>
        /// Handles the classes the pair table does not cover.
        /// </summary>
        /// <returns>
        /// <c>false</c> (no break before the character just read) when it is a space or a newline
        /// character; <c>null</c> when the pair table has to decide.
        /// </returns>
        private bool? GetSimpleBreak()
        {
            // handle classes not handled by the pair table
            switch (_nextClass)
            {
                case LineBreakClass.Space:
                    return false;

                case LineBreakClass.MandatoryBreak:
                case LineBreakClass.LineFeed:
                case LineBreakClass.NextLine:
                    // Remember the hard break so the next MoveNext iteration emits it.
                    _currentClass = LineBreakClass.MandatoryBreak;
                    return false;

                case LineBreakClass.CarriageReturn:
                    _currentClass = LineBreakClass.CarriageReturn;
                    return false;
            }

            return null;
        }

        /// <summary>
        /// Decides, via the pair table and the tracked rule exceptions, whether a break is allowed
        /// between <c>_currentClass</c> and <c>_nextClass</c>.
        /// </summary>
        /// <param name="lastClass">Resolved class of the code point read immediately before the current one.</param>
        /// <returns><c>true</c> when a break is allowed at the candidate position.</returns>
        private bool GetPairTableBreak(LineBreakClass lastClass)
        {
            // If not handled already, use the pair table
            bool shouldBreak = false;

            switch (LineBreakPairTable.Table[(int)_currentClass][(int)_nextClass])
            {
                case LineBreakPairTable.DIBRK: // Direct break
                    shouldBreak = true;
                    break;

                // TODO: Rewrite this so that it defaults to true and rules are set as exceptions.
                case LineBreakPairTable.INBRK: // Possible indirect break
                    // LB31
                    if (_lb31 && _nextClass == LineBreakClass.OpenPunctuation)
                    {
                        shouldBreak = true;
                        _lb31 = false;
                        break;
                    }

                    // LB30
                    if (_lb30)
                    {
                        shouldBreak = true;
                        _lb30 = false;
                        _alphaNumericCount = 0;
                        break;
                    }

                    // LB25
                    if (_lb25ex &&
                        (_nextClass == LineBreakClass.PrefixNumeric || _nextClass == LineBreakClass.Numeric))
                    {
                        shouldBreak = true;
                        _lb25ex = false;
                        break;
                    }

                    // LB24
                    if (_lb24ex &&
                        (_nextClass == LineBreakClass.PostfixNumeric || _nextClass == LineBreakClass.PrefixNumeric))
                    {
                        shouldBreak = true;
                        _lb24ex = false;
                        break;
                    }

                    // LB18
                    shouldBreak = lastClass == LineBreakClass.Space;
                    break;

                case LineBreakPairTable.CIBRK:
                    shouldBreak = lastClass == LineBreakClass.Space;

                    if (!shouldBreak)
                    {
                        // Early exit on purpose: _currentClass is left unchanged.
                        return false;
                    }

                    break;

                case LineBreakPairTable.CPBRK: // prohibited for combining marks
                    if (lastClass != LineBreakClass.Space)
                    {
                        // Early exit on purpose: _currentClass is left unchanged.
                        return false;
                    }

                    break;

                case LineBreakPairTable.PRBRK:
                    break;
            }

            // Rule LB22
            if (_nextClass == LineBreakClass.Inseparable)
            {
                switch (lastClass)
                {
                    case LineBreakClass.MandatoryBreak:
                    case LineBreakClass.ContingentBreak:
                    case LineBreakClass.Exclamation:
                    case LineBreakClass.LineFeed:
                    case LineBreakClass.NextLine:
                    case LineBreakClass.Space:
                    case LineBreakClass.ZWSpace:
                        // Allow break
                        break;

                    case LineBreakClass.CombiningMark:
                        if (_lb22ex)
                        {
                            // Allow break
                            _lb22ex = false;
                            break;
                        }

                        shouldBreak = false;
                        break;

                    default:
                        shouldBreak = false;
                        break;
                }
            }

            // Rule LB8a: no break after a zero width joiner.
            if (_lb8a)
            {
                shouldBreak = false;
            }

            // Rule LB21a
            if (_lb21a &&
                (_currentClass == LineBreakClass.Hyphen || _currentClass == LineBreakClass.BreakAfter))
            {
                shouldBreak = false;
                _lb21a = false;
            }
            else
            {
                _lb21a = _currentClass == LineBreakClass.HebrewLetter;
            }

            // Rule LB30a
            if (_currentClass == LineBreakClass.RegionalIndicator)
            {
                _lb30a++;

                if (_lb30a == 2 && _nextClass == LineBreakClass.RegionalIndicator)
                {
                    shouldBreak = true;
                    _lb30a = 0;
                }
            }
            else
            {
                _lb30a = 0;
            }

            // Rule LB30b
            if (_nextClass == LineBreakClass.EModifier && _lastPosition > 0)
            {
                // Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
                // Unassigned codepoints with Line_Break=ID in some blocks are also assigned the Extended_Pictographic property.
                // Those blocks are intended for future allocation of emoji characters.
                var cp = Codepoint.ReadAt(_text, _lastPosition - 1, out int _);

                if (Codepoint.IsInRangeInclusive(cp, 0x1F000, 0x1F02F))
                {
                    shouldBreak = false;
                }
            }

            _currentClass = _nextClass;

            return shouldBreak;
        }

        /// <summary>
        /// Walks backwards from <paramref name="from"/> over at most one trailing newline code point
        /// (MandatoryBreak, LineFeed or CarriageReturn) and then over any run of spaces.
        /// </summary>
        /// <param name="from">Exclusive end index to start scanning back from.</param>
        /// <returns>The index just past the last code point that was not skipped.</returns>
        private int FindPriorNonWhitespace(int from)
        {
            if (from > 0)
            {
                var cp = Codepoint.ReadAt(_text, from - 1, out var count);
                var cls = cp.LineBreakClass;

                if (cls == LineBreakClass.MandatoryBreak ||
                    cls == LineBreakClass.LineFeed ||
                    cls == LineBreakClass.CarriageReturn)
                {
                    from -= count;
                }
            }

            while (from > 0)
            {
                var cp = Codepoint.ReadAt(_text, from - 1, out var count);
                var cls = cp.LineBreakClass;

                if (cls == LineBreakClass.Space)
                {
                    from -= count;
                }
                else
                {
                    break;
                }
            }

            return from;
        }
    }
}