testUTF8.cxx 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
  2. file Copyright.txt or https://cmake.org/licensing for details. */
  3. #include <cm_utf8.h>
  4. #include <stdio.h>
  /* Fixed-size buffer holding one test sequence: up to four encoded bytes
     followed by a terminating NUL. */
  typedef char test_utf8_char[4 + 1];
  6. static void test_utf8_char_print(test_utf8_char const c)
  7. {
  8. unsigned char const* d = reinterpret_cast<unsigned char const*>(c);
  9. printf("[0x%02X,0x%02X,0x%02X,0x%02X]", static_cast<int>(d[0]),
  10. static_cast<int>(d[1]), static_cast<int>(d[2]),
  11. static_cast<int>(d[3]));
  12. }
  /* Print every byte of the NUL-terminated string `s` to stdout as a
     comma-separated list of hex values enclosed in brackets, for example
     "[0xC2,0xA9]".  An empty string prints "[]".  No newline is written. */
  static void byte_array_print(char const* s)
  {
    unsigned char const* d = reinterpret_cast<unsigned char const*>(s);
    /* Empty before the first byte, "," before every later one. */
    char const* separator = "";
    printf("[");
    for (; *d; ++d) {
      /* The %X conversion consumes an unsigned int, so the byte is converted
         to exactly that type instead of to int. */
      printf("%s0x%02X", separator, static_cast<unsigned int>(*d));
      separator = ",";
    }
    printf("]");
  }
  27. struct test_utf8_entry
  28. {
  29. int n;
  30. test_utf8_char str;
  31. unsigned int chr;
  32. };
  33. static test_utf8_entry const good_entry[] = {
  34. { 1, "\x20\x00\x00\x00", 0x0020 }, /* Space. */
  35. { 2, "\xC2\xA9\x00\x00", 0x00A9 }, /* Copyright. */
  36. { 3, "\xE2\x80\x98\x00", 0x2018 }, /* Open-single-quote. */
  37. { 3, "\xE2\x80\x99\x00", 0x2019 }, /* Close-single-quote. */
  38. { 4, "\xF0\xA3\x8E\xB4", 0x233B4 }, /* Example from RFC 3629. */
  39. { 3, "\xED\x80\x80\x00", 0xD000 }, /* Valid 0xED prefixed codepoint. */
  40. { 4, "\xF4\x8F\xBF\xBF", 0x10FFFF }, /* Highest valid RFC codepoint. */
  41. { 0, { 0, 0, 0, 0, 0 }, 0 }
  42. };
  43. static test_utf8_char const bad_chars[] = {
  44. "\x80\x00\x00\x00", /* Leading continuation byte. */
  45. "\xC0\x80\x00\x00", /* Overlong encoding. */
  46. "\xC1\x80\x00\x00", /* Overlong encoding. */
  47. "\xC2\x00\x00\x00", /* Missing continuation byte. */
  48. "\xE0\x00\x00\x00", /* Missing continuation bytes. */
  49. "\xE0\x80\x80\x00", /* Overlong encoding. */
  50. "\xF0\x80\x80\x80", /* Overlong encoding. */
  51. "\xED\xA0\x80\x00", /* UTF-16 surrogate half. */
  52. "\xED\xBF\xBF\x00", /* UTF-16 surrogate half. */
  53. "\xF4\x90\x80\x80", /* Lowest out-of-range codepoint. */
  54. "\xF5\x80\x80\x80", /* Prefix forces out-of-range codepoints. */
  55. { 0, 0, 0, 0, 0 }
  56. };
  /* Whole strings that the validity check must accept.  The list is
     terminated by a null pointer. */
  static char const* good_strings[] = {
    "",                 /* Empty string. */
    "ASCII",            /* Plain ASCII text. */
    "\xC2\xA9 Kitware", /* Two-byte sequence followed by ASCII text. */
    0
  };
  /* Whole strings that the validity check must reject.  The list is
     terminated by a null pointer. */
  static char const* bad_strings[] = {
    "\xC0\x80", /* "Modified UTF-8" spelling of an embedded 0 byte. */
    0
  };
  62. static void report_good(bool passed, test_utf8_char const c)
  63. {
  64. printf("%s: decoding good ", passed ? "pass" : "FAIL");
  65. test_utf8_char_print(c);
  66. printf(" (%s) ", c);
  67. }
  68. static void report_bad(bool passed, test_utf8_char const c)
  69. {
  70. printf("%s: decoding bad ", passed ? "pass" : "FAIL");
  71. test_utf8_char_print(c);
  72. printf(" ");
  73. }
  74. static bool decode_good(test_utf8_entry const entry)
  75. {
  76. unsigned int uc;
  77. if (const char* e =
  78. cm_utf8_decode_character(entry.str, entry.str + 4, &uc)) {
  79. int used = static_cast<int>(e - entry.str);
  80. if (uc != entry.chr) {
  81. report_good(false, entry.str);
  82. printf("expected 0x%04X, got 0x%04X\n", entry.chr, uc);
  83. return false;
  84. }
  85. if (used != entry.n) {
  86. report_good(false, entry.str);
  87. printf("had %d bytes, used %d\n", entry.n, used);
  88. return false;
  89. }
  90. report_good(true, entry.str);
  91. printf("got 0x%04X\n", uc);
  92. return true;
  93. }
  94. report_good(false, entry.str);
  95. printf("failed\n");
  96. return false;
  97. }
  98. static bool decode_bad(test_utf8_char const s)
  99. {
  100. unsigned int uc = 0xFFFFu;
  101. const char* e = cm_utf8_decode_character(s, s + 4, &uc);
  102. if (e) {
  103. report_bad(false, s);
  104. printf("expected failure, got 0x%04X\n", uc);
  105. return false;
  106. }
  107. report_bad(true, s);
  108. printf("failed as expected\n");
  109. return true;
  110. }
  111. static void report_valid(bool passed, char const* s)
  112. {
  113. printf("%s: validity good ", passed ? "pass" : "FAIL");
  114. byte_array_print(s);
  115. printf(" (%s) ", s);
  116. }
  117. static void report_invalid(bool passed, char const* s)
  118. {
  119. printf("%s: validity bad ", passed ? "pass" : "FAIL");
  120. byte_array_print(s);
  121. printf(" ");
  122. }
  123. static bool is_valid(const char* s)
  124. {
  125. bool valid = cm_utf8_is_valid(s) != 0;
  126. if (!valid) {
  127. report_valid(false, s);
  128. printf("expected valid, reported as invalid\n");
  129. return false;
  130. }
  131. report_valid(true, s);
  132. printf("valid as expected\n");
  133. return true;
  134. }
  135. static bool is_invalid(const char* s)
  136. {
  137. bool valid = cm_utf8_is_valid(s) != 0;
  138. if (valid) {
  139. report_invalid(false, s);
  140. printf("expected invalid, reported as valid\n");
  141. return false;
  142. }
  143. report_invalid(true, s);
  144. printf("invalid as expected\n");
  145. return true;
  146. }
  147. int testUTF8(int /*unused*/, char* /*unused*/ [])
  148. {
  149. int result = 0;
  150. for (test_utf8_entry const* e = good_entry; e->n; ++e) {
  151. if (!decode_good(*e)) {
  152. result = 1;
  153. }
  154. if (!is_valid(e->str)) {
  155. result = 1;
  156. }
  157. }
  158. for (test_utf8_char const* c = bad_chars; (*c)[0]; ++c) {
  159. if (!decode_bad(*c)) {
  160. result = 1;
  161. }
  162. if (!is_invalid(*c)) {
  163. result = 1;
  164. }
  165. }
  166. for (char const** s = good_strings; *s; ++s) {
  167. if (!is_valid(*s)) {
  168. result = 1;
  169. }
  170. }
  171. for (char const** s = bad_strings; *s; ++s) {
  172. if (!is_invalid(*s)) {
  173. result = 1;
  174. }
  175. }
  176. return result;
  177. }