utf8.c 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. /**********************************************************************************************/
  2. /* The MIT License */
  3. /* */
  4. /* Copyright 2016-2017 Twitch Interactive, Inc. or its affiliates. All Rights Reserved. */
  5. /* */
  6. /* Permission is hereby granted, free of charge, to any person obtaining a copy */
  7. /* of this software and associated documentation files (the "Software"), to deal */
  8. /* in the Software without restriction, including without limitation the rights */
  9. /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
  10. /* copies of the Software, and to permit persons to whom the Software is */
  11. /* furnished to do so, subject to the following conditions: */
  12. /* */
  13. /* The above copyright notice and this permission notice shall be included in */
  14. /* all copies or substantial portions of the Software. */
  15. /* */
  16. /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
  17. /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
  18. /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
  19. /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
  20. /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
  21. /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN */
  22. /* THE SOFTWARE. */
  23. /**********************************************************************************************/
  24. #include "utf8.h"
  25. #include <stdio.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. const utf8_char_t* utf8_char_next(const utf8_char_t* c)
  29. {
  30. const utf8_char_t* n = c + utf8_char_length(c);
  31. return n == c ? 0 : n;
  32. }
  33. // returnes the length of the char in bytes
  34. size_t utf8_char_length(const utf8_char_t* c)
  35. {
  36. // count null term as zero size
  37. if (!c || 0x00 == c[0]) {
  38. return 0;
  39. }
  40. static const size_t _utf8_char_length[] = {
  41. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
  42. };
  43. return _utf8_char_length[(c[0] >> 3) & 0x1F];
  44. }
  45. int utf8_char_whitespace(const utf8_char_t* c)
  46. {
  47. // 0x7F is DEL
  48. if (!c || (unsigned char)c[0] <= ' ' || c[0] == 0x7F) {
  49. return 1;
  50. }
  51. // EIA608_CHAR_NO_BREAK_SPACE TODO other utf8 spaces
  52. if (0xC2 == (unsigned char)c[0] && 0xA0 == (unsigned char)c[1]) {
  53. return 1;
  54. }
  55. return 0;
  56. }
  57. // returns length of the string in bytes
  58. // size is number of charcter to count (0 to count until NULL term)
  59. size_t utf8_string_length(const utf8_char_t* data, utf8_size_t size)
  60. {
  61. size_t char_length, byts = 0;
  62. if (0 == size) {
  63. size = utf8_char_count(data, 0);
  64. }
  65. for (; 0 < size; --size) {
  66. if (0 == (char_length = utf8_char_length(data))) {
  67. break;
  68. }
  69. data += char_length;
  70. byts += char_length;
  71. }
  72. return byts;
  73. }
  74. size_t utf8_char_copy(utf8_char_t* dst, const utf8_char_t* src)
  75. {
  76. size_t bytes = utf8_char_length(src);
  77. if (bytes && dst) {
  78. memcpy(dst, src, bytes);
  79. dst[bytes] = '\0';
  80. }
  81. return bytes;
  82. }
  83. // returnes the number of utf8 charcters in a string given the number of bytes
  84. // to count until the a null terminator, pass 0 for size
  85. utf8_size_t utf8_char_count(const char* data, size_t size)
  86. {
  87. size_t i, bytes = 0;
  88. utf8_size_t count = 0;
  89. if (0 == size) {
  90. size = strlen(data);
  91. }
  92. for (i = 0; i < size; ++count, i += bytes) {
  93. if (0 == (bytes = utf8_char_length(&data[i]))) {
  94. break;
  95. }
  96. }
  97. return count;
  98. }
  99. // returnes the length of the line in bytes triming not printable charcters at the end
  100. size_t utf8_trimmed_length(const utf8_char_t* data, utf8_size_t charcters)
  101. {
  102. size_t l, t = 0, split_at = 0;
  103. for (size_t c = 0; (*data) && c < charcters; ++c) {
  104. l = utf8_char_length(data);
  105. t += l, data += l;
  106. if (!utf8_char_whitespace(data)) {
  107. split_at = t;
  108. }
  109. }
  110. return split_at;
  111. }
  112. size_t _utf8_newline(const utf8_char_t* data)
  113. {
  114. if ('\r' == data[0]) {
  115. return '\n' == data[1] ? 2 : 1; // windows/unix
  116. } else if ('\n' == data[0]) {
  117. return '\r' == data[1] ? 2 : 1; // riscos/macos
  118. } else {
  119. return 0;
  120. }
  121. }
  122. // returns the length in bytes of the line including the new line charcter(s)
  123. // auto detects between windows(CRLF), unix(LF), mac(CR) and riscos (LFCR) line endings
  124. size_t utf8_line_length(const utf8_char_t* data)
  125. {
  126. size_t n, len = 0;
  127. for (len = 0; 0 != data[len]; ++len) {
  128. if (0 < (n = _utf8_newline(data))) {
  129. return len + n;
  130. }
  131. }
  132. return len;
  133. }
  134. // returns number of chars to include before split
  135. utf8_size_t utf8_wrap_length(const utf8_char_t* data, utf8_size_t size)
  136. {
  137. // Set split_at to size, so if a split point cna not be found, retuns the size passed in
  138. size_t char_length, char_count, split_at = size;
  139. for (char_count = 0; char_count <= size; ++char_count) {
  140. if (_utf8_newline(data)) {
  141. return char_count;
  142. } else if (utf8_char_whitespace(data)) {
  143. split_at = char_count;
  144. }
  145. char_length = utf8_char_length(data);
  146. data += char_length;
  147. }
  148. return split_at;
  149. }
  150. int utf8_line_count(const utf8_char_t* data)
  151. {
  152. size_t len = 0;
  153. int count = 0;
  154. do {
  155. len = utf8_line_length(data);
  156. data += len;
  157. ++count;
  158. } while (0 < len);
  159. return count - 1;
  160. }
  161. utf8_char_t* utf8_load_text_file(const char* path, size_t* size)
  162. {
  163. utf8_char_t* data = NULL;
  164. FILE* file = fopen(path, "r");
  165. if (file) {
  166. fseek(file, 0, SEEK_END);
  167. size_t file_size = ftell(file);
  168. fseek(file, 0, SEEK_SET);
  169. if (0 == (*size) || file_size <= (*size)) {
  170. (*size) = 0;
  171. data = (utf8_char_t*)malloc(1 + file_size);
  172. memset(data, '\0', file_size);
  173. if (data) {
  174. utf8_char_t* pos = data;
  175. size_t bytes_read = 0;
  176. while (0 < (bytes_read = fread(pos, 1, file_size - (*size), file))) {
  177. pos += bytes_read;
  178. (*size) += bytes_read;
  179. }
  180. }
  181. fclose(file);
  182. }
  183. }
  184. if (data) {
  185. data[*size] = 0;
  186. }
  187. return data;
  188. }
  189. #ifndef strnstr
  // Locates the first occurrence of the null terminated string2 within
  // string1, looking at no more than len bytes of string1. Bytes after a null
  // terminator in string1 are not searched.
  // Returns string1 when string2 is empty, a pointer to the start of the match
  // when one is found, and NULL otherwise.
  char* strnstr(const char* string1, const char* string2, size_t len)
  {
      size_t length2 = strlen(string2);

      if (!length2) {
          return (char*)string1;
      }

      // Stop at the end of string1 even when len reaches further, so nothing
      // past the terminator is read or matched.
      for (; len >= length2 && '\0' != *string1; --len, ++string1) {
          // strncmp() stops at a terminator on its own, unlike memcmp()
          if (0 == strncmp(string1, string2, length2)) {
              return (char*)string1;
          }
      }

      return NULL;
  }
  205. #endif