LineBreakEnumerator.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. // Copyright (c) Six Labors.
  2. // Licensed under the Apache License, Version 2.0.
  3. // Ported from: https://github.com/SixLabors/Fonts/
  4. using System;
  5. namespace Avalonia.Media.TextFormatting.Unicode
  6. {
  7. /// <summary>
  8. /// Implementation of the Unicode Line Break Algorithm. UAX:14
  9. /// <see href="https://www.unicode.org/reports/tr14/tr14-37.html"/>
  10. /// </summary>
  11. public ref struct LineBreakEnumerator
  12. {
  // --- Input and cursor -------------------------------------------------

  // The text being scanned.
  private readonly ReadOnlySpan<char> _text;

  // Index into _text of the next code unit to read; advanced by NextCharClass.
  private int _position;

  // Value of _position before the most recent read; reported as the break position.
  private int _lastPosition;

  // --- Class pair under examination --------------------------------------

  // Class on the left-hand side of the boundary being evaluated.
  private LineBreakClass _currentClass;

  // Class on the right-hand side of the boundary being evaluated.
  private LineBreakClass _nextClass;

  // True until the first call to MoveNext has primed the state.
  private bool _first;

  // --- Rule bookkeeping --------------------------------------------------

  // Counter maintained by NextCharClass; feeds the LB30 decision.
  private int _alphaNumericCount;

  // LB8a: set when the most recently read class was ZWJ; suppresses the next break.
  private bool _lb8a;

  // LB21a: set when the previous current class was HebrewLetter.
  private bool _lb21a;

  // LB22: combining-mark exception that permits a break before Inseparable.
  private bool _lb22ex;

  // LB24: exception that forces a break on an indirect-break pair.
  private bool _lb24ex;

  // LB25: exception that forces a break on an indirect-break pair.
  private bool _lb25ex;

  // LB30: set when a break before the opening punctuation just read is allowed.
  private bool _lb30;

  // LB30a: counter used to pair up regional indicators.
  private int _lb30a;

  // LB31: exception that forces a break before opening punctuation.
  private bool _lb31;
  /// <summary>
  /// Initializes a new instance of the <see cref="LineBreakEnumerator"/> struct.
  /// </summary>
  /// <param name="text">The text to scan for line break opportunities.</param>
  public LineBreakEnumerator(ReadOnlySpan<char> text)
      : this()
  {
      // The chained this() call has already zero-initialised every field, so all
      // counters start at 0 and all rule flags start cleared. Only the state that
      // differs from those defaults is assigned below.
      _text = text;
      _first = true;
      _currentClass = LineBreakClass.Unknown;
      _nextClass = LineBreakClass.Unknown;
  }

  /// <summary>
  /// Gets the line break found by the most recent successful call to <see cref="MoveNext"/>.
  /// Reset to its default value once <see cref="MoveNext"/> returns <c>false</c>.
  /// </summary>
  public LineBreak Current { get; private set; }
  /// <summary>
  /// Advances the enumerator to the next line break.
  /// </summary>
  /// <returns>
  /// <c>true</c> when a break was found and stored in <see cref="Current"/>;
  /// <c>false</c> once the text has been exhausted.
  /// </returns>
  public bool MoveNext()
  {
      // Prime the state with the first character on the very first call.
      if (_first)
      {
          var initialClass = NextCharClass();

          _first = false;
          _currentClass = MapFirst(initialClass);
          _nextClass = initialClass;
          _lb8a = initialClass == LineBreakClass.ZWJ;
          _lb30a = 0;
      }

      while (_position < _text.Length)
      {
          _lastPosition = _position;

          var previousClass = _nextClass;
          _nextClass = NextCharClass();

          // Explicit newline: a mandatory break, or a carriage return that is not
          // the first half of a CR LF pair.
          var isHardBreak = _currentClass == LineBreakClass.MandatoryBreak
              || (_currentClass == LineBreakClass.CarriageReturn && _nextClass != LineBreakClass.LineFeed);

          if (isHardBreak)
          {
              _currentClass = MapFirst(_nextClass);
              Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition, true);

              return true;
          }

          // The simple rules must run first because they may rewrite _currentClass;
          // the pair table is consulted only when they give no answer.
          var simpleResult = GetSimpleBreak();
          bool shouldBreak;

          if (simpleResult.HasValue)
          {
              shouldBreak = simpleResult.Value;
          }
          else
          {
              shouldBreak = GetPairTableBreak(previousClass);
          }

          // Rule LB8a
          _lb8a = _nextClass == LineBreakClass.ZWJ;

          if (!shouldBreak)
          {
              continue;
          }

          Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition);

          return true;
      }

      // The loop only falls through once _position has reached the end of the text.
      // Emit one final break at the end of the text unless it was already reported.
      if (_lastPosition < _text.Length)
      {
          _lastPosition = _text.Length;

          var required = _currentClass == LineBreakClass.MandatoryBreak
              || (_currentClass == LineBreakClass.CarriageReturn && _nextClass != LineBreakClass.LineFeed);

          Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition, required);

          return true;
      }

      Current = default;

      return false;
  }
  /// <summary>
  /// Resolves the line break class of a code point according to rule LB1.
  /// </summary>
  /// <param name="cp">The code point to classify.</param>
  /// <returns>The resolved line break class.</returns>
  private static LineBreakClass MapClass(Codepoint cp)
  {
      // Special-cased code point U+50005 (decimal 327685): always alphabetic.
      // NOTE(review): presumably an unassigned code point exercised by the
      // conformance test data - confirm.
      if (cp.Value == 0x50005)
      {
          return LineBreakClass.Alphabetic;
      }

      // LB 1
      // ==========================================
      // Resolved  Original     General_Category
      // ==========================================
      // AL        AI, SG, XX   Any
      // CM        SA           Only Mn or Mc
      // AL        SA           Any except Mn and Mc
      // NS        CJ           Any
      var original = cp.LineBreakClass;

      if (original == LineBreakClass.Ambiguous
          || original == LineBreakClass.Surrogate
          || original == LineBreakClass.Unknown)
      {
          return LineBreakClass.Alphabetic;
      }

      if (original == LineBreakClass.ComplexContext)
      {
          var isMark = cp.GeneralCategory == GeneralCategory.NonspacingMark
              || cp.GeneralCategory == GeneralCategory.SpacingMark;

          if (isMark)
          {
              return LineBreakClass.CombiningMark;
          }

          return LineBreakClass.Alphabetic;
      }

      if (original == LineBreakClass.ConditionalJapaneseStarter)
      {
          return LineBreakClass.Nonstarter;
      }

      return original;
  }
  /// <summary>
  /// Maps a class to the value used as the current class when it starts a new segment.
  /// </summary>
  /// <param name="c">The class of the character that starts the segment.</param>
  /// <returns>
  /// <see cref="LineBreakClass.MandatoryBreak"/> for LineFeed and NextLine,
  /// <see cref="LineBreakClass.WordJoiner"/> for Space, otherwise <paramref name="c"/> unchanged.
  /// </returns>
  private static LineBreakClass MapFirst(LineBreakClass c)
  {
      if (c == LineBreakClass.LineFeed || c == LineBreakClass.NextLine)
      {
          return LineBreakClass.MandatoryBreak;
      }

      if (c == LineBreakClass.Space)
      {
          return LineBreakClass.WordJoiner;
      }

      return c;
  }
  /// <summary>
  /// Determines whether a class is Alphabetic, HebrewLetter or Numeric.
  /// </summary>
  /// <param name="cls">The class to test.</param>
  /// <returns><c>true</c> for one of the three classes above; otherwise <c>false</c>.</returns>
  private static bool IsAlphaNumeric(LineBreakClass cls)
  {
      switch (cls)
      {
          case LineBreakClass.Alphabetic:
          case LineBreakClass.HebrewLetter:
          case LineBreakClass.Numeric:
              return true;
          default:
              return false;
      }
  }
  /// <summary>
  /// Resolves the class of the code point at the current position without consuming it.
  /// </summary>
  /// <returns>The resolved line break class of the upcoming code point.</returns>
  private LineBreakClass PeekNextCharClass()
      => MapClass(Codepoint.ReadAt(_text, _position, out _));
  /// <summary>
  /// Consumes the code point at the current position and updates the rule-exception
  /// flags that depend on the class that was just read.
  /// </summary>
  /// <returns>The resolved line break class of the consumed code point.</returns>
  private LineBreakClass NextCharClass()
  {
      var cp = Codepoint.ReadAt(_text, _position, out var length);
      var cls = MapClass(cp);

      _position += length;

      var isCombiningMark = cls == LineBreakClass.CombiningMark;

      var isNumericAffixOrSpace = cls == LineBreakClass.PostfixNumeric
          || cls == LineBreakClass.PrefixNumeric
          || cls == LineBreakClass.Space;

      // Keep track of alphanumeric + any combining marks.
      // This is used for LB22 and LB30.
      if (IsAlphaNumeric(_currentClass) || (_alphaNumericCount > 0 && isCombiningMark))
      {
          _alphaNumericCount++;
      }

      // Track combining mark exceptions for LB22 and LB31. Both rules share the
      // same set of preceding classes; LB31 additionally accepts ZWJ and the very
      // first character of the text.
      if (isCombiningMark)
      {
          switch (_currentClass)
          {
              case LineBreakClass.MandatoryBreak:
              case LineBreakClass.ContingentBreak:
              case LineBreakClass.Exclamation:
              case LineBreakClass.LineFeed:
              case LineBreakClass.NextLine:
              case LineBreakClass.Space:
              case LineBreakClass.ZWSpace:
              case LineBreakClass.CarriageReturn:
                  _lb22ex = true;
                  _lb31 = true;
                  break;
              case LineBreakClass.ZWJ:
                  _lb31 = true;
                  break;
          }

          if (_first)
          {
              _lb31 = true;
          }
      }

      // LB31 is also armed by a numeric prefix/postfix or a space that either
      // starts the text or follows an alphabetic character.
      if (isNumericAffixOrSpace && (_first || _currentClass == LineBreakClass.Alphabetic))
      {
          _lb31 = true;
      }

      // Reset LB31 if next is U+0028 (Left Opening Parenthesis)
      var followsNumericAffix = _currentClass == LineBreakClass.PostfixNumeric
          || _currentClass == LineBreakClass.PrefixNumeric;

      if (_lb31
          && !followsNumericAffix
          && cls == LineBreakClass.OpenPunctuation
          && cp.Value == 0x0028)
      {
          _lb31 = false;
      }

      if (_first)
      {
          // Rule LB24
          if (cls == LineBreakClass.ClosePunctuation || cls == LineBreakClass.CloseParenthesis)
          {
              _lb24ex = true;
          }

          // Rule LB25
          if (cls == LineBreakClass.ClosePunctuation
              || cls == LineBreakClass.InfixNumeric
              || cls == LineBreakClass.BreakSymbols)
          {
              _lb25ex = true;
          }
      }

      // Rule LB25 look-ahead: only peek when the class just read is one of these.
      if (cls == LineBreakClass.Space
          || cls == LineBreakClass.WordJoiner
          || cls == LineBreakClass.Alphabetic)
      {
          var upcoming = PeekNextCharClass();

          if (upcoming == LineBreakClass.ClosePunctuation
              || upcoming == LineBreakClass.InfixNumeric
              || upcoming == LineBreakClass.BreakSymbols)
          {
              _lb25ex = true;
          }
      }

      // AlphaNumeric + and combining marks can break for OP except.
      // - U+0028 (Left Opening Parenthesis)
      // - U+005B (Opening Square Bracket)
      // - U+007B (Left Curly Bracket)
      // See custom columns|rules in the text pair table.
      // https://www.unicode.org/Public/13.0.0/ucd/auxiliary/LineBreakTest.html
      var isExemptBracket = cp.Value == 0x0028
          || cp.Value == 0x005B
          || cp.Value == 0x007B;

      _lb30 = _alphaNumericCount > 0
          && cls == LineBreakClass.OpenPunctuation
          && !isExemptBracket;

      return cls;
  }
  /// <summary>
  /// Handles the next-character classes that are not covered by the pair table.
  /// </summary>
  /// <returns>
  /// <c>false</c> when the next class is Space, MandatoryBreak, LineFeed, NextLine or
  /// CarriageReturn (no break before it); <c>null</c> when the pair table must decide.
  /// </returns>
  private bool? GetSimpleBreak()
  {
      if (_nextClass == LineBreakClass.Space)
      {
          return false;
      }

      if (_nextClass == LineBreakClass.MandatoryBreak
          || _nextClass == LineBreakClass.LineFeed
          || _nextClass == LineBreakClass.NextLine)
      {
          // Remember the hard break so the following iteration reports it.
          _currentClass = LineBreakClass.MandatoryBreak;

          return false;
      }

      if (_nextClass == LineBreakClass.CarriageReturn)
      {
          _currentClass = LineBreakClass.CarriageReturn;

          return false;
      }

      return null;
  }
  /// <summary>
  /// Decides whether a break is allowed between the current and the next class by
  /// consulting the pair table and then applying the rule-specific adjustments.
  /// </summary>
  /// <param name="lastClass">The class of the character read before the next one.</param>
  /// <returns><c>true</c> when a break is allowed before the next character.</returns>
  /// <remarks>
  /// On the two early-return paths (CIBRK/CPBRK without a preceding space) the current
  /// class and the LB21a/LB30a state are intentionally left untouched.
  /// </remarks>
  private bool GetPairTableBreak(LineBreakClass lastClass)
  {
      var followsSpace = lastClass == LineBreakClass.Space;
      var action = LineBreakPairTable.Table[(int)_currentClass][(int)_nextClass];
      var shouldBreak = false;

      if (action == LineBreakPairTable.DIBRK)
      {
          // Direct break
          shouldBreak = true;
      }
      else if (action == LineBreakPairTable.INBRK)
      {
          // Possible indirect break
          // TODO: Rewrite this so that it defaults to true and rules are set as exceptions.
          if (_lb31 && _nextClass == LineBreakClass.OpenPunctuation)
          {
              // LB31
              shouldBreak = true;
              _lb31 = false;
          }
          else if (_lb30)
          {
              // LB30
              shouldBreak = true;
              _lb30 = false;
              _alphaNumericCount = 0;
          }
          else if (_lb25ex
              && (_nextClass == LineBreakClass.PrefixNumeric || _nextClass == LineBreakClass.Numeric))
          {
              // LB25
              shouldBreak = true;
              _lb25ex = false;
          }
          else if (_lb24ex
              && (_nextClass == LineBreakClass.PostfixNumeric || _nextClass == LineBreakClass.PrefixNumeric))
          {
              // LB24
              shouldBreak = true;
              _lb24ex = false;
          }
          else
          {
              // LB18
              shouldBreak = followsSpace;
          }
      }
      else if (action == LineBreakPairTable.CIBRK)
      {
          if (!followsSpace)
          {
              return false;
          }

          shouldBreak = true;
      }
      else if (action == LineBreakPairTable.CPBRK)
      {
          // prohibited for combining marks
          if (!followsSpace)
          {
              return false;
          }
      }

      // LineBreakPairTable.PRBRK and any other value leave shouldBreak as false.

      // Rule LB22
      if (_nextClass == LineBreakClass.Inseparable)
      {
          bool keepBreak;

          switch (lastClass)
          {
              case LineBreakClass.MandatoryBreak:
              case LineBreakClass.ContingentBreak:
              case LineBreakClass.Exclamation:
              case LineBreakClass.LineFeed:
              case LineBreakClass.NextLine:
              case LineBreakClass.Space:
              case LineBreakClass.ZWSpace:
                  keepBreak = true;
                  break;
              case LineBreakClass.CombiningMark:
                  // The exception is consumed when it applies; otherwise it is already clear.
                  keepBreak = _lb22ex;
                  _lb22ex = false;
                  break;
              default:
                  keepBreak = false;
                  break;
          }

          if (!keepBreak)
          {
              shouldBreak = false;
          }
      }

      // Rule LB8a
      if (_lb8a)
      {
          shouldBreak = false;
      }

      // Rule LB21a
      var hyphenAfterHebrew = _lb21a
          && (_currentClass == LineBreakClass.Hyphen || _currentClass == LineBreakClass.BreakAfter);

      if (hyphenAfterHebrew)
      {
          shouldBreak = false;
      }

      _lb21a = !hyphenAfterHebrew && _currentClass == LineBreakClass.HebrewLetter;

      // Rule LB30a
      if (_currentClass != LineBreakClass.RegionalIndicator)
      {
          _lb30a = 0;
      }
      else if (++_lb30a == 2 && _nextClass == LineBreakClass.RegionalIndicator)
      {
          shouldBreak = true;
          _lb30a = 0;
      }

      // Rule LB30b
      if (_nextClass == LineBreakClass.EModifier && _lastPosition > 0)
      {
          // Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
          // Unassigned codepoints with Line_Break=ID in some blocks are also assigned the Extended_Pictographic property.
          // Those blocks are intended for future allocation of emoji characters.
          var previous = Codepoint.ReadAt(_text, _lastPosition - 1, out _);

          if (Codepoint.IsInRangeInclusive(previous, 0x1F000, 0x1F02F))
          {
              shouldBreak = false;
          }
      }

      _currentClass = _nextClass;

      return shouldBreak;
  }
  /// <summary>
  /// Walks backwards from <paramref name="from"/> past at most one trailing line
  /// terminator and then past any run of spaces.
  /// </summary>
  /// <param name="from">The position to start searching backwards from.</param>
  /// <returns>The position just after the last character that was not skipped.</returns>
  private int FindPriorNonWhitespace(int from)
  {
      // Step over a single trailing MandatoryBreak, LineFeed or CarriageReturn.
      // NOTE(review): NextLine is not part of this check - confirm that is intended.
      if (from > 0)
      {
          var terminator = Codepoint.ReadAt(_text, from - 1, out var length);

          switch (terminator.LineBreakClass)
          {
              case LineBreakClass.MandatoryBreak:
              case LineBreakClass.LineFeed:
              case LineBreakClass.CarriageReturn:
                  from -= length;
                  break;
          }
      }

      // Then step over every preceding space.
      while (from > 0)
      {
          var candidate = Codepoint.ReadAt(_text, from - 1, out var length);

          if (candidate.LineBreakClass != LineBreakClass.Space)
          {
              break;
          }

          from -= length;
      }

      return from;
  }
  432. }
  433. }