LineBreakEnumerator.cs 17 KB


  1. // Copyright (c) Six Labors.
  2. // Licensed under the Apache License, Version 2.0.
  3. // Ported from: https://github.com/SixLabors/Fonts/
  4. using System;
  5. using System.Collections.Generic;
  6. namespace Avalonia.Media.TextFormatting.Unicode
  7. {
  8. /// <summary>
  9. /// Implementation of the Unicode Line Break Algorithm. UAX:14
  10. /// <see href="https://www.unicode.org/reports/tr14/tr14-37.html"/>
  11. /// </summary>
  12. public ref struct LineBreakEnumerator
  13. {
  // The UTF-16 text being scanned for break opportunities.
  private readonly IReadOnlyList<char> _text;

  // Index of the next UTF-16 unit to read; advanced by NextCharClass.
  private int _position;

  // Value of _position before the most recent read in MoveNext; used as the wrap position of a break.
  private int _lastPosition;

  // Class on the "before" side of the pair currently being evaluated.
  private LineBreakClass _currentClass;

  // Class of the most recently read code point (the "after" side of the pair).
  private LineBreakClass _nextClass;

  // True until the first code point has been consumed by MoveNext.
  private bool _first;

  // Counter incremented by NextCharClass while tracking alphanumerics and their combining marks (LB22 / LB30).
  private int _alphaNumericCount;

  // Rule-tracking flags; each one is named after the UAX #14 rule whose handling it supports.
  private bool _lb8a;
  private bool _lb21a;
  private bool _lb22ex;
  private bool _lb24ex;
  private bool _lb25ex;
  private bool _lb30;

  // Counter used by the LB30a (regional indicator) handling in GetPairTableBreak.
  private int _lb30a;
  private bool _lb31;

  /// <summary>
  /// Initializes a new instance of the <see cref="LineBreakEnumerator"/> struct.
  /// </summary>
  /// <param name="text">The text to find line breaks in.</param>
  /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
  public LineBreakEnumerator(IReadOnlyList<char> text)
      : this()
  {
      // Fail at the call site instead of with a NullReferenceException on the first MoveNext.
      _text = text ?? throw new ArgumentNullException(nameof(text));

      // this() has already zero-initialised every position, counter and flag, so only
      // the fields whose starting value may differ from the zero value are assigned here.
      _currentClass = LineBreakClass.Unknown;
      _nextClass = LineBreakClass.Unknown;
      _first = true;
  }

  /// <summary>
  /// Gets the line break found by the most recent call to <see cref="MoveNext"/>.
  /// Holds the default value once <see cref="MoveNext"/> has returned <c>false</c>.
  /// </summary>
  public LineBreak Current { get; private set; }
  /// <summary>
  /// Advances to the next line break in the text.
  /// </summary>
  /// <returns>
  /// <c>true</c> when a break was found and stored in <see cref="Current"/>;
  /// <c>false</c> when no further break exists, in which case <see cref="Current"/> is reset to its default.
  /// </returns>
  public bool MoveNext()
  {
      // Prime the state with the class of the very first code point.
      if (_first)
      {
          var initialClass = NextCharClass();

          _first = false;
          _currentClass = MapFirst(initialClass);
          _nextClass = initialClass;
          _lb8a = initialClass == LineBreakClass.ZWJ;
          _lb30a = 0;
      }

      while (_position < _text.Count)
      {
          _lastPosition = _position;

          var previousClass = _nextClass;

          _nextClass = NextCharClass();

          // Explicit newline: a mandatory break, or a CR that is not the first half of CR LF.
          var isHardBreak = _currentClass == LineBreakClass.MandatoryBreak
              || (_currentClass == LineBreakClass.CarriageReturn && _nextClass != LineBreakClass.LineFeed);

          if (isHardBreak)
          {
              _currentClass = MapFirst(_nextClass);
              Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition, true);
              return true;
          }

          // The pair table is only consulted when the simple rules have no answer.
          var simpleResult = GetSimpleBreak();
          var breakHere = simpleResult.HasValue ? simpleResult.Value : GetPairTableBreak(previousClass);

          // Rule LB8a
          _lb8a = _nextClass == LineBreakClass.ZWJ;

          if (breakHere)
          {
              Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition);
              return true;
          }
      }

      // Nothing left to report unless the end of the text has not been emitted as a break yet.
      if (_position < _text.Count || _lastPosition >= _text.Count)
      {
          Current = default;
          return false;
      }

      _lastPosition = _text.Count;

      var required = _currentClass == LineBreakClass.MandatoryBreak
          || (_currentClass == LineBreakClass.CarriageReturn && _nextClass != LineBreakClass.LineFeed);

      Current = new LineBreak(FindPriorNonWhitespace(_lastPosition), _lastPosition, required);
      return true;
  }
  /// <summary>
  /// Resolves the line break class of a code point to the class used by the break rules (rule LB1).
  /// </summary>
  /// <param name="cp">The code point to classify.</param>
  /// <returns>The resolved line break class.</returns>
  private static LineBreakClass MapClass(Codepoint cp)
  {
      // U+50005 (decimal 327685) is always treated as alphabetic, ahead of the LB1 mapping.
      // NOTE(review): the reason for this special case is not visible in this file.
      if (cp.Value == 0x50005)
      {
          return LineBreakClass.Alphabetic;
      }

      // LB 1
      // ==========================================
      // Resolved   Original     General_Category
      // ==========================================
      // AL         AI, SG, XX   Any
      // CM         SA           Only Mn or Mc
      // AL         SA           Any except Mn and Mc
      // NS         CJ           Any
      var rawClass = cp.LineBreakClass;

      if (rawClass == LineBreakClass.Ambiguous
          || rawClass == LineBreakClass.Surrogate
          || rawClass == LineBreakClass.Unknown)
      {
          return LineBreakClass.Alphabetic;
      }

      if (rawClass == LineBreakClass.ComplexContext)
      {
          var category = cp.GeneralCategory;
          var isMark = category == GeneralCategory.NonspacingMark || category == GeneralCategory.SpacingMark;

          return isMark ? LineBreakClass.CombiningMark : LineBreakClass.Alphabetic;
      }

      if (rawClass == LineBreakClass.ConditionalJapaneseStarter)
      {
          return LineBreakClass.Nonstarter;
      }

      return rawClass;
  }
  /// <summary>
  /// Maps a class to the class it acts as when it becomes the "before" side of a pair:
  /// line feed and next line become a mandatory break, and space becomes a word joiner.
  /// </summary>
  /// <param name="c">The class to map.</param>
  /// <returns>The mapped class, or <paramref name="c"/> unchanged for every other class.</returns>
  private static LineBreakClass MapFirst(LineBreakClass c)
  {
      if (c == LineBreakClass.LineFeed || c == LineBreakClass.NextLine)
      {
          return LineBreakClass.MandatoryBreak;
      }

      if (c == LineBreakClass.Space)
      {
          return LineBreakClass.WordJoiner;
      }

      return c;
  }
  /// <summary>
  /// Determines whether a class is alphabetic, a Hebrew letter or numeric.
  /// </summary>
  /// <param name="cls">The class to test.</param>
  /// <returns><c>true</c> for those three classes; otherwise <c>false</c>.</returns>
  private static bool IsAlphaNumeric(LineBreakClass cls)
  {
      switch (cls)
      {
          case LineBreakClass.Alphabetic:
          case LineBreakClass.HebrewLetter:
          case LineBreakClass.Numeric:
              return true;

          default:
              return false;
      }
  }
  /// <summary>
  /// Returns the resolved class of the code point at the current position without advancing
  /// the position or touching any rule-tracking state.
  /// </summary>
  private LineBreakClass PeekNextCharClass()
      => MapClass(Codepoint.ReadAt(_text, _position, out _));
  /// <summary>
  /// Reads the code point at the current position, advances past it and updates the
  /// rule-tracking state that depends on the class just read.
  /// </summary>
  /// <returns>The resolved class of the code point that was consumed.</returns>
  private LineBreakClass NextCharClass()
  {
      var cp = Codepoint.ReadAt(_text, _position, out var length);
      var cls = MapClass(cp);

      _position += length;

      var isCombiningMark = cls == LineBreakClass.CombiningMark;
      var isNumericAffixOrSpace = cls == LineBreakClass.PostfixNumeric
          || cls == LineBreakClass.PrefixNumeric
          || cls == LineBreakClass.Space;

      // Keep track of alphanumeric + any combining marks.
      // This is used for LB22 and LB30.
      if (IsAlphaNumeric(_currentClass) || (_alphaNumericCount > 0 && isCombiningMark))
      {
          _alphaNumericCount++;
      }

      if (isCombiningMark)
      {
          // Classes after which a combining mark raises both the LB22 and the LB31 exception.
          var followsBreakOrSpace = false;

          switch (_currentClass)
          {
              case LineBreakClass.MandatoryBreak:
              case LineBreakClass.ContingentBreak:
              case LineBreakClass.Exclamation:
              case LineBreakClass.LineFeed:
              case LineBreakClass.NextLine:
              case LineBreakClass.Space:
              case LineBreakClass.ZWSpace:
              case LineBreakClass.CarriageReturn:
                  followsBreakOrSpace = true;
                  break;
          }

          // Track combining mark exceptions. LB22
          if (followsBreakOrSpace)
          {
              _lb22ex = true;
          }

          // Track combining mark exceptions. LB31 (also raised at the start of the text and after ZWJ).
          if (_first || followsBreakOrSpace || _currentClass == LineBreakClass.ZWJ)
          {
              _lb31 = true;
          }
      }

      // LB31 is also raised by PO / PR / SP at the start of the text or after an alphabetic.
      if (isNumericAffixOrSpace && (_first || _currentClass == LineBreakClass.Alphabetic))
      {
          _lb31 = true;
      }

      // Reset LB31 if next is U+0028 (Left Opening Parenthesis)
      if (_lb31
          && cls == LineBreakClass.OpenPunctuation
          && cp.Value == 0x0028
          && _currentClass != LineBreakClass.PostfixNumeric
          && _currentClass != LineBreakClass.PrefixNumeric)
      {
          _lb31 = false;
      }

      if (_first)
      {
          // Rule LB24
          if (cls == LineBreakClass.ClosePunctuation || cls == LineBreakClass.CloseParenthesis)
          {
              _lb24ex = true;
          }

          // Rule LB25
          if (cls == LineBreakClass.ClosePunctuation
              || cls == LineBreakClass.InfixNumeric
              || cls == LineBreakClass.BreakSymbols)
          {
              _lb25ex = true;
          }
      }

      // Rule LB25: look one code point ahead after a space, word joiner or alphabetic.
      if (cls == LineBreakClass.Space || cls == LineBreakClass.WordJoiner || cls == LineBreakClass.Alphabetic)
      {
          switch (PeekNextCharClass())
          {
              case LineBreakClass.ClosePunctuation:
              case LineBreakClass.InfixNumeric:
              case LineBreakClass.BreakSymbols:
                  _lb25ex = true;
                  break;
          }
      }

      // AlphaNumeric + and combining marks can break for OP except.
      // - U+0028 (Left Opening Parenthesis)
      // - U+005B (Opening Square Bracket)
      // - U+007B (Left Curly Bracket)
      // See custom columns|rules in the text pair table.
      // https://www.unicode.org/Public/13.0.0/ucd/auxiliary/LineBreakTest.html
      var isExemptBracket = cp.Value == 0x0028 || cp.Value == 0x005B || cp.Value == 0x007B;

      _lb30 = _alphaNumericCount > 0
          && cls == LineBreakClass.OpenPunctuation
          && !isExemptBracket;

      return cls;
  }
  /// <summary>
  /// Handles the classes that are not resolved through the pair table.
  /// </summary>
  /// <returns>
  /// <c>false</c> when the class just read is a space or a newline class (no break before it);
  /// <c>null</c> when the pair table has to decide.
  /// </returns>
  private bool? GetSimpleBreak()
  {
      if (_nextClass == LineBreakClass.Space)
      {
          return false;
      }

      if (_nextClass == LineBreakClass.MandatoryBreak
          || _nextClass == LineBreakClass.LineFeed
          || _nextClass == LineBreakClass.NextLine)
      {
          // Remember the newline so the following MoveNext iteration reports a required break.
          _currentClass = LineBreakClass.MandatoryBreak;
          return false;
      }

      if (_nextClass == LineBreakClass.CarriageReturn)
      {
          _currentClass = LineBreakClass.CarriageReturn;
          return false;
      }

      return null;
  }
  /// <summary>
  /// Decides through the pair table, followed by the rule-specific overrides, whether a break
  /// is allowed between the current class and the class that was just read.
  /// </summary>
  /// <param name="lastClass">The class of the code point read immediately before the latest one.</param>
  /// <returns><c>true</c> when a break is allowed; otherwise <c>false</c>.</returns>
  /// <remarks>
  /// The two early returns for the combining-mark table entries leave the current class and
  /// the LB21a / LB30a state untouched; every other path ends by making the class just read
  /// the current class.
  /// </remarks>
  private bool GetPairTableBreak(LineBreakClass lastClass)
  {
      var followsSpace = lastClass == LineBreakClass.Space;
      var breakAllowed = false;

      // If not handled already, use the pair table
      switch (LineBreakPairTable.Table[(int)_currentClass][(int)_nextClass])
      {
          case LineBreakPairTable.DIBRK: // Direct break
              breakAllowed = true;
              break;

          // TODO: Rewrite this so that it defaults to true and rules are set as exceptions.
          case LineBreakPairTable.INBRK: // Possible indirect break
              breakAllowed = ResolveIndirectBreak(followsSpace);
              break;

          case LineBreakPairTable.CIBRK:
              if (!followsSpace)
              {
                  return false;
              }

              breakAllowed = true;
              break;

          case LineBreakPairTable.CPBRK: // prohibited for combining marks
              if (!followsSpace)
              {
                  return false;
              }

              break;

          case LineBreakPairTable.PRBRK:
              break;
      }

      // Rule LB22
      if (_nextClass == LineBreakClass.Inseparable)
      {
          switch (lastClass)
          {
              case LineBreakClass.MandatoryBreak:
              case LineBreakClass.ContingentBreak:
              case LineBreakClass.Exclamation:
              case LineBreakClass.LineFeed:
              case LineBreakClass.NextLine:
              case LineBreakClass.Space:
              case LineBreakClass.ZWSpace:
                  // Allow break
                  break;

              case LineBreakClass.CombiningMark when _lb22ex:
                  // Allow break, consuming the pending exception.
                  _lb22ex = false;
                  break;

              default:
                  breakAllowed = false;
                  break;
          }
      }

      // Rule LB8a
      if (_lb8a)
      {
          breakAllowed = false;
      }

      // Rule LB21a
      var afterHyphenOrBreakAfter = _currentClass == LineBreakClass.Hyphen
          || _currentClass == LineBreakClass.BreakAfter;

      if (_lb21a && afterHyphenOrBreakAfter)
      {
          breakAllowed = false;
          _lb21a = false;
      }
      else
      {
          _lb21a = _currentClass == LineBreakClass.HebrewLetter;
      }

      // Rule LB30a
      if (_currentClass != LineBreakClass.RegionalIndicator)
      {
          _lb30a = 0;
      }
      else
      {
          _lb30a++;

          if (_lb30a == 2 && _nextClass == LineBreakClass.RegionalIndicator)
          {
              breakAllowed = true;
              _lb30a = 0;
          }
      }

      // Rule LB30b
      if (_nextClass == LineBreakClass.EModifier && _lastPosition > 0)
      {
          // Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
          // Unassigned codepoints with Line_Break=ID in some blocks are also assigned the Extended_Pictographic property.
          // Those blocks are intended for future allocation of emoji characters.
          var preceding = Codepoint.ReadAt(_text, _lastPosition - 1, out _);

          if (Codepoint.IsInRangeInclusive(preceding, 0x1F000, 0x1F02F))
          {
              breakAllowed = false;
          }
      }

      _currentClass = _nextClass;

      return breakAllowed;
  }

  // Resolves an "indirect break" pair table entry: the first pending rule exception
  // (LB31, LB30, LB25, LB24, in that order) allows the break and is consumed; with none
  // pending the break is allowed only after a space (LB18).
  private bool ResolveIndirectBreak(bool followsSpace)
  {
      // LB31
      if (_lb31 && _nextClass == LineBreakClass.OpenPunctuation)
      {
          _lb31 = false;
          return true;
      }

      // LB30
      if (_lb30)
      {
          _lb30 = false;
          _alphaNumericCount = 0;
          return true;
      }

      // LB25
      if (_lb25ex && (_nextClass == LineBreakClass.PrefixNumeric || _nextClass == LineBreakClass.Numeric))
      {
          _lb25ex = false;
          return true;
      }

      // LB24
      if (_lb24ex && (_nextClass == LineBreakClass.PostfixNumeric || _nextClass == LineBreakClass.PrefixNumeric))
      {
          _lb24ex = false;
          return true;
      }

      // LB18
      return followsSpace;
  }
  /// <summary>
  /// Walks backwards from <paramref name="from"/> over at most one trailing newline code point
  /// (mandatory break, line feed or carriage return) and then over any run of spaces.
  /// </summary>
  /// <param name="from">The position to start walking back from.</param>
  /// <returns>The position reached once the trailing newline and spaces have been skipped.</returns>
  private int FindPriorNonWhitespace(int from)
  {
      if (from > 0)
      {
          var trailing = Codepoint.ReadAt(_text, from - 1, out var length);

          // Only a single newline code point is stripped here.
          switch (trailing.LineBreakClass)
          {
              case LineBreakClass.MandatoryBreak:
              case LineBreakClass.LineFeed:
              case LineBreakClass.CarriageReturn:
                  from -= length;
                  break;
          }
      }

      while (from > 0)
      {
          var previous = Codepoint.ReadAt(_text, from - 1, out var length);

          if (previous.LineBreakClass != LineBreakClass.Space)
          {
              break;
          }

          from -= length;
      }

      return from;
  }
  433. }
  434. }