| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263 |
- // This source file is adapted from the .NET cross-platform runtime project.
- // (https://github.com/dotnet/runtime/)
- //
- // Licensed to The Avalonia Project under MIT License, courtesy of The .NET Foundation.
- using System.Runtime.InteropServices;
- using Avalonia.Utility;
- namespace Avalonia.Media.TextFormatting.Unicode
- {
- public ref struct GraphemeEnumerator
- {
- private ReadOnlySlice<char> _text;
- /// <summary>
- /// Creates an enumerator over the grapheme clusters of <paramref name="text"/>.
- /// </summary>
- /// <param name="text">The text to split; it is consumed from the front as the enumerator advances.</param>
- public GraphemeEnumerator(ReadOnlySlice<char> text)
- {
- // Nothing has been enumerated yet, so there is no current grapheme.
- Current = default;
- _text = text;
- }
- /// <summary>
- /// Gets the current <see cref="Grapheme"/>. This is <c>default</c> until <see cref="MoveNext"/> has returned <c>true</c> at least once.
- /// </summary>
- public Grapheme Current { get; private set; }
- /// <summary>
- /// Moves to the next <see cref="Grapheme"/>: measures one grapheme cluster at the front of the remaining text, stores it in <see cref="Current"/> and removes it from the remaining text.
- /// </summary>
- /// <returns><c>false</c> if no text remains (in which case <see cref="Current"/> is not modified); otherwise <c>true</c>.</returns>
- public bool MoveNext()
- {
- if (_text.IsEmpty)
- {
- return false; // nothing left to enumerate
- }
- // Algorithm given at https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.
- var processor = new Processor(_text);
- processor.MoveNext(); // positions the processor on the first scalar of the remaining text
- var firstCodepoint = processor.CurrentCodepoint; // captured before the processor moves on; passed to the Grapheme constructor at Return
- // First, consume as many Prepend scalars as we can (rule GB9b).
- while (processor.CurrentType == GraphemeBreakClass.Prepend)
- {
- processor.MoveNext();
- }
- // Next, make sure we're not about to violate control character restrictions.
- // Essentially, if we saw Prepend data, we can't have Control | CR | LF data afterward (rule GB5).
- if (processor.CurrentCodeUnitOffset > 0) // a non-zero offset means at least one Prepend scalar was consumed above
- {
- if (processor.CurrentType == GraphemeBreakClass.Control
- || processor.CurrentType == GraphemeBreakClass.CR
- || processor.CurrentType == GraphemeBreakClass.LF)
- {
- goto Return; // the cluster ends after the Prepend run, before the control character
- }
- }
- // Now begin the main state machine.
- var previousClusterBreakType = processor.CurrentType; // class of the scalar the switch below dispatches on
- processor.MoveNext(); // from here on, CurrentType is the class of the scalar that follows it
- switch (previousClusterBreakType)
- {
- case GraphemeBreakClass.CR:
- if (processor.CurrentType != GraphemeBreakClass.LF)
- {
- goto Return; // rules GB3 & GB4 (only <LF> can follow <CR>)
- }
- processor.MoveNext(); // consume the LF so that CR LF ends up in one cluster
- goto case GraphemeBreakClass.LF;
- case GraphemeBreakClass.Control:
- case GraphemeBreakClass.LF:
- goto Return; // rule GB4 (no data after Control | LF)
- case GraphemeBreakClass.L:
- if (processor.CurrentType == GraphemeBreakClass.L)
- {
- processor.MoveNext(); // rule GB6 (L x L)
- goto case GraphemeBreakClass.L;
- }
- else if (processor.CurrentType == GraphemeBreakClass.V)
- {
- processor.MoveNext(); // rule GB6 (L x V)
- goto case GraphemeBreakClass.V;
- }
- else if (processor.CurrentType == GraphemeBreakClass.LV)
- {
- processor.MoveNext(); // rule GB6 (L x LV)
- goto case GraphemeBreakClass.LV;
- }
- else if (processor.CurrentType == GraphemeBreakClass.LVT)
- {
- processor.MoveNext(); // rule GB6 (L x LVT)
- goto case GraphemeBreakClass.LVT;
- }
- else
- {
- break; // no L | V | LV | LVT follows; continue with the trailer loop after the switch
- }
- case GraphemeBreakClass.LV:
- case GraphemeBreakClass.V:
- if (processor.CurrentType == GraphemeBreakClass.V)
- {
- processor.MoveNext(); // rule GB7 (LV | V x V)
- goto case GraphemeBreakClass.V;
- }
- else if (processor.CurrentType == GraphemeBreakClass.T)
- {
- processor.MoveNext(); // rule GB7 (LV | V x T)
- goto case GraphemeBreakClass.T;
- }
- else
- {
- break; // no V | T follows; continue with the trailer loop after the switch
- }
- case GraphemeBreakClass.LVT:
- case GraphemeBreakClass.T:
- if (processor.CurrentType == GraphemeBreakClass.T)
- {
- processor.MoveNext(); // rule GB8 (LVT | T x T)
- goto case GraphemeBreakClass.T;
- }
- else
- {
- break; // no T follows; continue with the trailer loop after the switch
- }
- case GraphemeBreakClass.ExtendedPictographic:
- // Attempt processing extended pictographic (rules GB11, GB9).
- // First, drain any Extend scalars that might exist
- while (processor.CurrentType == GraphemeBreakClass.Extend)
- {
- processor.MoveNext();
- }
- // Now see if there's a ZWJ + extended pictograph again.
- if (processor.CurrentType != GraphemeBreakClass.ZWJ)
- {
- break;
- }
- processor.MoveNext();
- if (processor.CurrentType != GraphemeBreakClass.ExtendedPictographic)
- {
- break; // the ZWJ was already consumed, so it stays in this cluster; the trailer loop below takes over
- }
- processor.MoveNext();
- goto case GraphemeBreakClass.ExtendedPictographic;
- case GraphemeBreakClass.RegionalIndicator:
- // We've consumed a single RI scalar. Try to consume another (to make it a pair).
- if (processor.CurrentType == GraphemeBreakClass.RegionalIndicator)
- {
- processor.MoveNext();
- }
- // Standalone RI scalars (or a single pair of RI scalars) can only be followed by trailers.
- break; // nothing but trailers after the final RI
- default:
- break;
- }
- // rules GB9, GB9a
- while (processor.CurrentType == GraphemeBreakClass.Extend
- || processor.CurrentType == GraphemeBreakClass.ZWJ
- || processor.CurrentType == GraphemeBreakClass.SpacingMark)
- {
- processor.MoveNext();
- }
- Return:
- var text = _text.Take(processor.CurrentCodeUnitOffset); // the offset is how far the processor advanced, i.e. the cluster length in code units
- Current = new Grapheme(firstCodepoint, text);
- _text = _text.Skip(processor.CurrentCodeUnitOffset); // presumably drops the consumed cluster so the next call resumes right after it
- return true; // rules GB2, GB999
- }
- /// <summary>
- /// Forward-only cursor over a character buffer. Each <see cref="MoveNext"/> call steps to the next
- /// scalar and exposes its <see cref="Codepoint"/>, its <see cref="GraphemeBreakClass"/> and its offset.
- /// </summary>
- [StructLayout(LayoutKind.Auto)]
- private ref struct Processor
- {
- private readonly ReadOnlySlice<char> _source;
- // Length, in code units, of the scalar most recently read; zero until the first read.
- private int _scalarLength;
- /// <summary>
- /// Creates a processor positioned before the first scalar of <paramref name="buffer"/>.
- /// Call <see cref="MoveNext"/> once to load the first scalar.
- /// </summary>
- internal Processor(ReadOnlySlice<char> buffer)
- {
- _source = buffer;
- _scalarLength = 0;
- CurrentCodeUnitOffset = 0;
- CurrentCodepoint = Codepoint.ReplacementCodepoint;
- CurrentType = GraphemeBreakClass.Other;
- }
- /// <summary>
- /// Gets the offset, in code units from the start of the buffer, at which the current scalar was read.
- /// </summary>
- public int CurrentCodeUnitOffset { get; private set; }
- /// <summary>
- /// Gets the break class of the current scalar.
- /// Will be <see cref="GraphemeBreakClass.Other"/> if invalid data or EOF reached.
- /// Caller shouldn't need to special-case this since the normal rules will halt on this condition.
- /// </summary>
- public GraphemeBreakClass CurrentType { get; private set; }
- /// <summary>
- /// Gets the <see cref="Codepoint"/> the processor is currently positioned on.
- /// </summary>
- public Codepoint CurrentCodepoint { get; private set; }
- /// <summary>
- /// Steps over the current scalar and reads the one that follows it. Once the end of the buffer
- /// has been reached the offset no longer changes and the replacement codepoint is reported.
- /// </summary>
- public void MoveNext()
- {
- // Ill-formed subsequences (such as unpaired UTF-16 surrogates) are left to the decoder, which
- // is relied upon to treat them as U+FFFD REPLACEMENT CHARACTER. That code point has a boundary
- // property of Other (XX), in line with the modifications made to UAX#29, Rev. 35.
- // See: https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
- // The UCD files reflect this as well: Unicode 11.0's
- // https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
- // contains "D800..DFFF ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
- // a line that was removed starting with Unicode 12.0.
- //
- // Should a later version of the Unicode Standard change this guidance again, mirror it here.
- var exhausted = CurrentCodeUnitOffset == _source.Length;
- if (!exhausted)
- {
- // Step past the scalar we were positioned on, then check again for the end of the buffer.
- CurrentCodeUnitOffset += _scalarLength;
- exhausted = CurrentCodeUnitOffset >= _source.Length;
- }
- if (exhausted)
- {
- CurrentCodepoint = Codepoint.ReplacementCodepoint;
- }
- else
- {
- CurrentCodepoint = Codepoint.ReadAt(_source, CurrentCodeUnitOffset, out _scalarLength);
- }
- CurrentType = CurrentCodepoint.GraphemeBreakClass;
- }
- }
- }
- }
|