UnicodeTrie.cs 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. // RichTextKit
  2. // Copyright © 2019 Topten Software. All Rights Reserved.
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); you may
  5. // not use this product except in compliance with the License. You may obtain
  6. // a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. // License for the specific language governing permissions and limitations
  14. // under the License.
  15. // Ported from: https://github.com/foliojs/unicode-trie
  16. // Copied from: https://github.com/toptensoftware/RichTextKit
  17. using System;
  18. using System.IO;
  19. using System.Runtime.CompilerServices;
  20. using System.Runtime.InteropServices;
  21. using System.Text;
  22. namespace Avalonia.Media.TextFormatting.Unicode
  23. {
  24. internal class UnicodeTrie
  25. {
  26. private readonly uint[] _data;
  27. private readonly int _highStart;
  28. private readonly uint _errorValue;
  /// <summary>
  /// Initializes a new instance of the <see cref="UnicodeTrie"/> class from a serialized trie.
  /// </summary>
  /// <param name="rawData">
  /// The serialized trie: a header (high start, error value, data length in bytes) at the start
  /// of the span, with the data words occupying the last <c>DataLength</c> bytes of the span.
  /// Values are little-endian, the same layout the <see cref="Stream"/> constructor reads
  /// and <see cref="Save"/> writes.
  /// </param>
  /// <exception cref="ArgumentException">
  /// <paramref name="rawData"/> is too short to hold the header, or the data length stored in
  /// the header is negative or larger than the bytes that follow the header.
  /// </exception>
  public UnicodeTrie(ReadOnlySpan<byte> rawData)
  {
      int headerSize = Unsafe.SizeOf<UnicodeTrieHeader>();
      if (rawData.Length < headerSize)
      {
          throw new ArgumentException("The trie data is too short to contain a header.", nameof(rawData));
      }

      var header = UnicodeTrieHeader.Parse(rawData);
      int highStart = header.HighStart;
      uint errorValue = header.ErrorValue;
      int length = header.DataLength;

      // MemoryMarshal.Cast reinterprets the bytes in host byte order, while the serialized
      // form is little-endian (BinaryWriter/BinaryReader). Swap on big-endian hosts so that
      // both loading paths produce the same trie.
      bool swapBytes = !BitConverter.IsLittleEndian;
      if (swapBytes)
      {
          highStart = System.Buffers.Binary.BinaryPrimitives.ReverseEndianness(highStart);
          errorValue = System.Buffers.Binary.BinaryPrimitives.ReverseEndianness(errorValue);
          length = System.Buffers.Binary.BinaryPrimitives.ReverseEndianness(length);
      }

      if (length < 0 || length > rawData.Length - headerSize)
      {
          throw new ArgumentException("The trie header declares an invalid data length.", nameof(rawData));
      }

      // The data words are taken from the tail of the span.
      uint[] data = new uint[length / sizeof(uint)];
      MemoryMarshal.Cast<byte, uint>(rawData.Slice(rawData.Length - length))
          .CopyTo(data);

      if (swapBytes)
      {
          for (int i = 0; i < data.Length; i++)
          {
              data[i] = System.Buffers.Binary.BinaryPrimitives.ReverseEndianness(data[i]);
          }
      }

      _highStart = highStart;
      _errorValue = errorValue;
      _data = data;
  }
  /// <summary>
  /// Initializes a new instance of the <see cref="UnicodeTrie"/> class by reading it from a stream.
  /// </summary>
  /// <param name="stream">The stream to read the header and data words from. It is left open.</param>
  public UnicodeTrie(Stream stream)
  {
      // leaveOpen: true keeps the caller's stream usable after the reader is disposed.
      using var reader = new BinaryReader(stream, Encoding.UTF8, true);

      // Header: high start, error value, then the size of the data in bytes.
      _highStart = reader.ReadInt32();
      _errorValue = reader.ReadUInt32();
      var values = new uint[reader.ReadInt32() / sizeof(uint)];

      // Payload: one 32-bit value per array slot, read in order.
      for (int slot = 0; slot < values.Length; slot++)
      {
          values[slot] = reader.ReadUInt32();
      }

      _data = values;
  }
  /// <summary>
  /// Initializes a new instance of the <see cref="UnicodeTrie"/> class from already-decoded values.
  /// </summary>
  /// <param name="data">The uncompressed trie data. The array is stored as-is, not copied.</param>
  /// <param name="highStart">The start of the last range which ends at U+10ffff.</param>
  /// <param name="errorValue">The value for out-of-range code points and illegal UTF-8.</param>
  public UnicodeTrie(uint[] data, int highStart, uint errorValue)
  {
      (_data, _highStart, _errorValue) = (data, highStart, errorValue);
  }
  /// <summary>
  /// Writes the <see cref="UnicodeTrie"/> to a stream: the high start, the error value and the
  /// data size in bytes, followed by every data word in order.
  /// </summary>
  /// <param name="stream">The output stream. It is left open.</param>
  internal void Save(Stream stream)
  {
      using var writer = new BinaryWriter(stream, Encoding.UTF8, true);

      // Header.
      writer.Write(_highStart);
      writer.Write(_errorValue);
      writer.Write(_data.Length * sizeof(uint));

      // Push the header through to the underlying stream before the payload follows.
      writer.Flush();

      // Payload.
      foreach (uint value in _data)
      {
          writer.Write(value);
      }
  }
  /// <summary>
  /// Get the value for a code point as stored in the trie.
  /// </summary>
  /// <remarks>
  /// Index entries are read through the bounds-checked array indexer. The final value is read
  /// with <see cref="Unsafe.Add{T}(ref T, nint)"/> on a reference to the first element of the
  /// data array, which performs no bounds check.
  /// NOTE(review): this presumably relies on the trie data being well-formed; confirm.
  /// </remarks>
  /// <param name="codePoint">The code point.</param>
  /// <returns>
  /// The <see cref="uint"/> value stored for <paramref name="codePoint"/>, or the error value
  /// when the code point is greater than U+10FFFF.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public uint Get(uint codePoint)
  {
      uint index;

      // Reference to the first element of _data; all value reads below are offsets from it.
      ref uint dataBase = ref MemoryMarshal.GetReference(_data.AsSpan());

      if (codePoint is < 0x0d800 or (> 0x0dbff and <= 0x0ffff))
      {
          // Ordinary BMP code point, excluding leading surrogates.
          // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
          // The index entry and the value are both read from the same _data array.
          index = _data[codePoint >> UnicodeTrieBuilder.SHIFT_2];

          // Index entry shifted by INDEX_SHIFT, plus the code point's low DATA_MASK bits,
          // gives the position of the value.
          index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
          return Unsafe.Add(ref dataBase, (nint)index);
      }

      if (codePoint <= 0xffff)
      {
          // Only U+D800..U+DBFF reach this branch; every other value up to U+FFFF was
          // handled above.
          // Lead Surrogate Code Point. A Separate index section is stored for
          // lead surrogate code units and code points.
          // The main index has the code unit data.
          // For this function, we need the code point data.
          // Note: this expression could be refactored for slightly improved efficiency, but
          // surrogate code points will be so rare in practice that it's not worth it.
          index = _data[UnicodeTrieBuilder.LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UnicodeTrieBuilder.SHIFT_2)];
          index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
          return Unsafe.Add(ref dataBase, (nint)index);
      }

      if (codePoint < _highStart)
      {
          // Supplemental code point, use two-level lookup.
          // First level: selected by the bits of the code point above SHIFT_1.
          index = UnicodeTrieBuilder.INDEX_1_OFFSET - UnicodeTrieBuilder.OMITTED_BMP_INDEX_1_LENGTH + (codePoint >> UnicodeTrieBuilder.SHIFT_1);
          index = _data[index];

          // Second level: the first-level entry plus the INDEX_2_MASK bits of the code point.
          index += (codePoint >> UnicodeTrieBuilder.SHIFT_2) & UnicodeTrieBuilder.INDEX_2_MASK;
          index = _data[index];
          index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
          return Unsafe.Add(ref dataBase, (nint)index);
      }

      if (codePoint <= 0x10ffff)
      {
          // Code points from _highStart up to U+10FFFF all return the same entry, located
          // DATA_GRANULARITY elements before the end of _data.
          return Unsafe.Add(ref dataBase, (nint)(_data.Length - UnicodeTrieBuilder.DATA_GRANULARITY));
      }

      // Fall through. The code point is outside of the legal range of 0..0x10ffff.
      return _errorValue;
  }
  /// <summary>
  /// The fixed-size header at the start of the serialized trie. The members are laid out
  /// sequentially with no padding, in the same order <c>Save</c> writes them.
  /// </summary>
  [StructLayout(LayoutKind.Sequential, Pack = 1)]
  private struct UnicodeTrieHeader
  {
      /// <summary>Gets the high start value stored in the header.</summary>
      public int HighStart { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; }

      /// <summary>Gets the error value stored in the header.</summary>
      public uint ErrorValue { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; }

      /// <summary>Gets the length of the trie data, in bytes.</summary>
      public int DataLength { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; }

      /// <summary>
      /// Reinterprets the leading bytes of <paramref name="data"/> as a header.
      /// </summary>
      /// <param name="data">The serialized trie, starting with the header bytes.</param>
      /// <returns>A copy of the header read from the start of <paramref name="data"/>.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      public static UnicodeTrieHeader Parse(ReadOnlySpan<byte> data)
      {
          ReadOnlySpan<UnicodeTrieHeader> headers = MemoryMarshal.Cast<byte, UnicodeTrieHeader>(data);
          return headers[0];
      }
  }
  170. }
  171. }