GraphemeEnumerator.cs 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. // This source file is adapted from the .NET cross-platform runtime project.
  2. // (https://github.com/dotnet/runtime/)
  3. //
  4. // Licensed to The Avalonia Project under MIT License, courtesy of The .NET Foundation.
  5. using System.Runtime.InteropServices;
  6. using Avalonia.Utility;
  7. namespace Avalonia.Media.TextFormatting.Unicode
  8. {
  9. public ref struct GraphemeEnumerator
  10. {
  // Text that has not been enumerated yet; MoveNext cuts each returned grapheme off its front.
  private ReadOnlySlice<char> _text;

  /// <summary>
  /// Creates an enumerator over the graphemes of <paramref name="text"/>.
  /// </summary>
  /// <param name="text">The text to split into graphemes.</param>
  public GraphemeEnumerator(ReadOnlySlice<char> text)
  {
      _text = text;

      // Nothing has been read yet, so Current starts out as the default value.
      Current = default(Grapheme);
  }

  /// <summary>
  /// Gets the <see cref="Grapheme"/> stored by the most recent successful <see cref="MoveNext"/> call.
  /// </summary>
  public Grapheme Current { get; private set; }
  /// <summary>
  /// Advances to the next <see cref="Grapheme"/> in the remaining text.
  /// </summary>
  /// <returns>
  /// <c>true</c> when a grapheme was read and stored in <see cref="Current"/>;
  /// <c>false</c> when no text is left.
  /// </returns>
  public bool MoveNext()
  {
      if (_text.IsEmpty)
      {
          return false;
      }

      // Algorithm given at https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.
      var scanner = new Processor(_text);

      scanner.MoveNext();

      var leadingCodepoint = scanner.CurrentCodepoint;

      if (ConsumeClusterBase(ref scanner))
      {
          // rules GB9, GB9a
          while (IsTrailer(scanner.CurrentType))
          {
              scanner.MoveNext();
          }
      }

      // The scanner now sits on the first code unit that is not part of this cluster.
      var clusterLength = scanner.CurrentCodeUnitOffset;

      Current = new Grapheme(leadingCodepoint, _text.Take(clusterLength));
      _text = _text.Skip(clusterLength);

      return true; // rules GB2, GB999
  }

  /// <summary>
  /// Consumes everything of the cluster at the scanner position except trailing
  /// Extend / ZWJ / SpacingMark scalars.
  /// </summary>
  /// <param name="scanner">The scanner, already positioned on the first scalar of the cluster.</param>
  /// <returns>
  /// <c>false</c> when the cluster ends right at the scanner position (Control, CR or LF involved);
  /// <c>true</c> when trailing scalars may still be appended by the caller.
  /// </returns>
  private static bool ConsumeClusterBase(ref Processor scanner)
  {
      // First, consume as many Prepend scalars as we can (rule GB9b).
      while (scanner.CurrentType == GraphemeBreakClass.Prepend)
      {
          scanner.MoveNext();
      }

      // A non-zero offset means Prepend data was consumed; Control | CR | LF must not be
      // attached to it (rule GB5), so the cluster ends here.
      if (scanner.CurrentCodeUnitOffset > 0 && IsControlOrLineBreak(scanner.CurrentType))
      {
          return false;
      }

      // Main state machine: "state" is the class of the scalar consumed last,
      // "upcoming" is the class of the scalar the scanner is looking at.
      var state = scanner.CurrentType;

      scanner.MoveNext();

      while (true)
      {
          var upcoming = scanner.CurrentType;

          switch (state)
          {
              case GraphemeBreakClass.CR:
                  // rules GB3 & GB4 (only <LF> can follow <CR>, and nothing follows that <LF>)
                  if (upcoming == GraphemeBreakClass.LF)
                  {
                      scanner.MoveNext();
                  }

                  return false;

              case GraphemeBreakClass.Control:
              case GraphemeBreakClass.LF:
                  return false; // rule GB4 (no data after Control | LF)

              case GraphemeBreakClass.L:
                  // rule GB6 (L x (L | V | LV | LVT))
                  if (upcoming == GraphemeBreakClass.L
                      || upcoming == GraphemeBreakClass.V
                      || upcoming == GraphemeBreakClass.LV
                      || upcoming == GraphemeBreakClass.LVT)
                  {
                      scanner.MoveNext();
                      state = upcoming;
                      continue;
                  }

                  return true;

              case GraphemeBreakClass.LV:
              case GraphemeBreakClass.V:
                  // rule GB7 ((LV | V) x (V | T))
                  if (upcoming == GraphemeBreakClass.V || upcoming == GraphemeBreakClass.T)
                  {
                      scanner.MoveNext();
                      state = upcoming;
                      continue;
                  }

                  return true;

              case GraphemeBreakClass.LVT:
              case GraphemeBreakClass.T:
                  // rule GB8 ((LVT | T) x T)
                  if (upcoming == GraphemeBreakClass.T)
                  {
                      scanner.MoveNext();
                      state = GraphemeBreakClass.T;
                      continue;
                  }

                  return true;

              case GraphemeBreakClass.ExtendedPictographic:
                  // Attempt processing extended pictographic (rules GB11, GB9).
                  // First, drain any Extend scalars that might exist.
                  while (scanner.CurrentType == GraphemeBreakClass.Extend)
                  {
                      scanner.MoveNext();
                  }

                  // Now see if there's a ZWJ + extended pictograph again.
                  if (scanner.CurrentType != GraphemeBreakClass.ZWJ)
                  {
                      return true;
                  }

                  scanner.MoveNext();

                  if (scanner.CurrentType != GraphemeBreakClass.ExtendedPictographic)
                  {
                      return true;
                  }

                  scanner.MoveNext();

                  // Another pictograph was consumed: run this state again.
                  continue;

              case GraphemeBreakClass.RegionalIndicator:
                  // We've consumed a single RI scalar. Try to consume another (to make it a pair).
                  if (upcoming == GraphemeBreakClass.RegionalIndicator)
                  {
                      scanner.MoveNext();
                  }

                  // Standalone RI scalars (or a single pair of RI scalars) can only be followed by trailers.
                  return true;

              default:
                  return true;
          }
      }
  }

  // True for the classes that end a cluster immediately: Control, CR and LF.
  private static bool IsControlOrLineBreak(GraphemeBreakClass type)
  {
      return type == GraphemeBreakClass.Control
             || type == GraphemeBreakClass.CR
             || type == GraphemeBreakClass.LF;
  }

  // True for the classes that attach to the end of a cluster: Extend, ZWJ and SpacingMark.
  private static bool IsTrailer(GraphemeBreakClass type)
  {
      return type == GraphemeBreakClass.Extend
             || type == GraphemeBreakClass.ZWJ
             || type == GraphemeBreakClass.SpacingMark;
  }
  /// <summary>
  /// Forward-only cursor over a buffer of UTF-16 code units that exposes the code point
  /// at the cursor together with its grapheme break class.
  /// </summary>
  [StructLayout(LayoutKind.Auto)]
  private ref struct Processor
  {
      private readonly ReadOnlySlice<char> _buffer;

      // Number of code units reported by the last Codepoint.ReadAt call; the next
      // MoveNext advances the offset by this amount.
      private int _codeUnitLengthOfCurrentScalar;

      internal Processor(ReadOnlySlice<char> buffer)
      {
          _buffer = buffer;
          CurrentCodeUnitOffset = 0;
          _codeUnitLengthOfCurrentScalar = 0;
          CurrentType = GraphemeBreakClass.Other;
          CurrentCodepoint = Codepoint.ReplacementCodepoint;
      }

      /// <summary>
      /// Gets the offset into the buffer, in code units, of the current code point.
      /// </summary>
      public int CurrentCodeUnitOffset { get; private set; }

      /// <summary>
      /// Will be <see cref="GraphemeBreakClass.Other"/> if invalid data or EOF reached.
      /// Caller shouldn't need to special-case this since the normal rules will halt on this condition.
      /// </summary>
      public GraphemeBreakClass CurrentType { get; private set; }

      /// <summary>
      /// Gets the <see cref="Codepoint"/> at the current offset.
      /// </summary>
      public Codepoint CurrentCodepoint { get; private set; }

      /// <summary>
      /// Steps past the current code point and reads the one that follows it.
      /// Once the end of the buffer is reached the offset stays there and
      /// <see cref="Codepoint.ReplacementCodepoint"/> is reported.
      /// </summary>
      public void MoveNext()
      {
          // For ill-formed subsequences (like unpaired UTF-16 surrogate code points), we rely on
          // the decoder's default behavior of interpreting these ill-formed subsequences as
          // equivalent to U+FFFD REPLACEMENT CHARACTER. This code point has a boundary property
          // of Other (XX), which matches the modifications made to UAX#29, Rev. 35.
          // See: https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
          // This change is also reflected in the UCD files. For example, Unicode 11.0's UCD file
          // https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
          // has the line "D800..DFFF ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
          // but starting with Unicode 12.0 that line has been removed.
          //
          // If a later version of the Unicode Standard further modifies this guidance we should reflect
          // that here.

          // Only advance while the cursor has not yet reached the end of the buffer.
          if (CurrentCodeUnitOffset != _buffer.Length)
          {
              CurrentCodeUnitOffset += _codeUnitLengthOfCurrentScalar;
          }

          Codepoint codepoint;

          if (CurrentCodeUnitOffset < _buffer.Length)
          {
              codepoint = Codepoint.ReadAt(_buffer, CurrentCodeUnitOffset,
                  out _codeUnitLengthOfCurrentScalar);
          }
          else
          {
              // Nothing left to read.
              codepoint = Codepoint.ReplacementCodepoint;
          }

          CurrentCodepoint = codepoint;
          CurrentType = codepoint.GraphemeBreakClass;
      }
  }
  218. }
  219. }