1
0

decode_utf8.c 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. /*
  2. * Decode a single UTF-8 character.
  3. */
  4. #include "putty.h"
  5. #include "misc.h"
  6. unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err)
  7. {
  8. /* Permit user to pass NULL as the err pointer */
  9. DecodeUTF8Failure dummy;
  10. if (!err) err = &dummy;
  11. /* If the source has no byte available, this will return 0, which
  12. * we'll return immediately and is a reasonable error return anyway */
  13. { // WINSCP
  14. unsigned char c = get_byte(src);
  15. /* One-byte cases. */
  16. if (c < 0x80) {
  17. *err = DUTF8_SUCCESS;
  18. return c;
  19. } else if (c < 0xC0) {
  20. *err = DUTF8_SPURIOUS_CONTINUATION;
  21. return 0xFFFD;
  22. }
  23. { // WINSCP
  24. unsigned long wc, min;
  25. size_t ncont;
  26. if (c < 0xE0) {
  27. wc = c & 0x1F; ncont = 1; min = 0x80;
  28. } else if (c < 0xF0) {
  29. wc = c & 0x0F; ncont = 2; min = 0x800;
  30. } else if (c < 0xF8) {
  31. wc = c & 0x07; ncont = 3; min = 0x10000;
  32. } else if (c < 0xFC) {
  33. wc = c & 0x03; ncont = 4; min = 0x200000;
  34. } else if (c < 0xFE) {
  35. wc = c & 0x01; ncont = 5; min = 0x4000000;
  36. } else {
  37. *err = DUTF8_ILLEGAL_BYTE; /* FE or FF */
  38. return 0xFFFD;
  39. }
  40. while (ncont-- > 0) {
  41. if (!get_avail(src)) {
  42. *err = DUTF8_E_OUT_OF_DATA;
  43. return 0xFFFD;
  44. }
  45. { // WINSCP
  46. unsigned char cont = get_byte(src);
  47. if (!(0x80 <= cont && cont < 0xC0)) {
  48. BinarySource_REWIND_TO(src, src->pos - 1);
  49. *err = DUTF8_TRUNCATED_SEQUENCE;
  50. return 0xFFFD;
  51. }
  52. wc = (wc << 6) | (cont & 0x3F);
  53. } // WINSCP
  54. }
  55. if (wc < min) {
  56. *err = DUTF8_OVERLONG_ENCODING;
  57. return 0xFFFD;
  58. }
  59. if (0xD800 <= wc && wc < 0xE000) {
  60. *err = DUTF8_ENCODED_SURROGATE;
  61. return 0xFFFD;
  62. }
  63. if (wc > 0x10FFFF) {
  64. *err = DUTF8_CODE_POINT_TOO_BIG;
  65. return 0xFFFD; /* outside Unicode range */
  66. }
  67. *err = DUTF8_SUCCESS;
  68. return wc;
  69. } // WINSCP
  70. } // WINSCP
  71. }
  72. const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES] = {
  73. #define MSG_ENTRY(sym, string) string,
  74. DECODE_UTF8_FAILURE_LIST(MSG_ENTRY)
  75. #undef MSG_ENTRY
  76. };
  77. #ifdef TEST
  78. #include <stdio.h>
  /*
   * Write an out-of-memory message to stderr and terminate the test
   * program with exit status 2.
   */
  void out_of_memory(void)
  {
      fputs("out of memory!\n", stderr);
      exit(2);
  }
  84. static const char *const decode_utf8_error_syms[DUTF8_N_FAILURE_CODES] = {
  85. #define SYM_ENTRY(sym, string) #sym,
  86. DECODE_UTF8_FAILURE_LIST(SYM_ENTRY)
  87. #undef SYM_ENTRY
  88. };
  89. bool dotest(const char *file, int line, const char *input, size_t ninput,
  90. const unsigned long *chars, size_t nchars)
  91. {
  92. BinarySource src[1];
  93. BinarySource_BARE_INIT(src, input, ninput);
  94. size_t noutput = 0;
  95. printf("%s:%d: test start\n", file, line);
  96. while (get_avail(src)) {
  97. size_t before = src->pos;
  98. DecodeUTF8Failure err;
  99. unsigned long wc = decode_utf8(src, &err);
  100. printf("%s:%d in+%"SIZEu" out+%"SIZEu":", file, line, before, noutput);
  101. while (before < src->pos)
  102. printf(" %02x", (unsigned)(unsigned char)(input[before++]));
  103. printf(" -> U-%08lx %s\n", wc, decode_utf8_error_syms[err]);
  104. if (noutput >= nchars) {
  105. printf("%s:%d: FAIL: expected no further output\n", file, line);
  106. return false;
  107. }
  108. if (chars[noutput] != wc) {
  109. printf("%s:%d: FAIL: expected U-%08lx\n",
  110. file, line, chars[noutput]);
  111. return false;
  112. }
  113. noutput++;
  114. DecodeUTF8Failure expected_err;
  115. if (wc == 0xFFFD) {
  116. /* In the 'chars' array, any occurrence of 0xFFFD is followed
  117. * by the expected error code */
  118. assert(noutput < nchars && "bad test data");
  119. expected_err = chars[noutput++];
  120. } else {
  121. /* Expect success status to go with any non-FFFD character */
  122. expected_err = DUTF8_SUCCESS;
  123. }
  124. if (err != expected_err) {
  125. printf("%s:%d: FAIL: expected %s\n", file, line,
  126. decode_utf8_error_syms[expected_err]);
  127. return false;
  128. }
  129. }
  130. if (noutput < nchars) {
  131. printf("%s:%d: FAIL: expected further output\n", file, line);
  132. return false;
  133. }
  134. printf("%s:%d: pass\n", file, line);
  135. return true;
  136. }
  /*
   * Run one test case: 'input' is a string literal (its terminating NUL
   * is excluded via sizeof-1) and the remaining arguments are the
   * expected values handed to dotest. Bumps the 'ntest' and 'npass'
   * counters, which must exist in the scope where the macro is used.
   */
  #define DOTEST(input, ...) do {                                         \
          static const unsigned long expected[] = { __VA_ARGS__ };        \
          bool passed = dotest(__FILE__, __LINE__, input, sizeof(input)-1, \
                               expected, lenof(expected));                \
          ntest++;                                                        \
          if (passed)                                                     \
              npass++;                                                    \
      } while (0)
  144. int main(void)
  145. {
  146. int ntest = 0, npass = 0;
  147. DOTEST("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5",
  148. 0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5);
  149. /* First sequence of each length */
  150. DOTEST("\x00", 0x0000);
  151. DOTEST("\xC2\x80", 0x0080);
  152. DOTEST("\xE0\xA0\x80", 0x0800);
  153. DOTEST("\xF0\x90\x80\x80", 0x00010000);
  154. DOTEST("\xF8\x88\x80\x80\x80",
  155. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00200000 */
  156. DOTEST("\xFC\x84\x80\x80\x80\x80",
  157. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x04000000 */
  158. /* Last sequence of each length */
  159. DOTEST("\x7F", 0x007F);
  160. DOTEST("\xDF\xBF", 0x07FF);
  161. DOTEST("\xEF\xBF\xBF", 0xFFFF);
  162. DOTEST("\xF7\xBF\xBF\xBF",
  163. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x001FFFFF */
  164. DOTEST("\xFB\xBF\xBF\xBF\xBF",
  165. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x03FFFFFF */
  166. DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF",
  167. 0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x7FFFFFFF */
  168. /* Endpoints of the surrogate range */
  169. DOTEST("\xED\x9F\xBF", 0xD7FF);
  170. DOTEST("\xED\xA0\x80", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xD800 */
  171. DOTEST("\xED\xBF\xBF", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xDFFF */
  172. DOTEST("\xEE\x80\x80", 0xE000);
  173. /* REPLACEMENT CHARACTER itself */
  174. DOTEST("\xEF\xBF\xBD", 0xFFFD, DUTF8_SUCCESS); /* FFFD but no error! */
  175. /* Endpoints of the legal Unicode range */
  176. DOTEST("\xF4\x8F\xBF\xBF", 0x0010FFFF);
  177. DOTEST("\xF4\x90\x80\x80", 0xFFFD,
  178. DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00110000 */
  179. /* Spurious continuation bytes, each shown as a separate failure */
  180. DOTEST("\x80 \x81\x82 \xBD\xBE\xBF",
  181. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  182. 0x0020,
  183. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  184. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  185. 0x0020,
  186. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  187. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
  188. 0xFFFD, DUTF8_SPURIOUS_CONTINUATION);
  189. /* Truncated sequences, each shown as just one failure. The last
  190. * one gets a different error code because the sequence is
  191. * interrupted by the end of the string instead of another
  192. * character, so that if the string were a prefix of a longer
  193. * chunk of data then that would not _necessarily_ indicate an
  194. * error */
  195. DOTEST("\xC2\xE0\xA0\xF0\x90\x80\xF8\x88\x80\x80\xFC\x84\x80\x80\x80",
  196. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  197. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  198. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  199. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  200. 0xFFFD, DUTF8_E_OUT_OF_DATA);
  201. DOTEST("\xC2 \xE0\xA0 \xF0\x90\x80 \xF8\x88\x80\x80 \xFC\x84\x80\x80\x80",
  202. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  203. 0x0020,
  204. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  205. 0x0020,
  206. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  207. 0x0020,
  208. 0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
  209. 0x0020,
  210. 0xFFFD, DUTF8_E_OUT_OF_DATA);
  211. /* Illegal bytes */
  212. DOTEST("\xFE\xFF", 0xFFFD, DUTF8_ILLEGAL_BYTE, 0xFFFD, DUTF8_ILLEGAL_BYTE);
  213. /* Overlong sequences */
  214. DOTEST("\xC1\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  215. DOTEST("\xE0\x9F\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  216. DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  217. DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  218. DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  219. DOTEST("\xC0\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  220. DOTEST("\xE0\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  221. DOTEST("\xF0\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  222. DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  223. DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
  224. printf("%d tests %d passed", ntest, npass);
  225. if (npass < ntest) {
  226. printf(" %d FAILED\n", ntest-npass);
  227. return 1;
  228. } else {
  229. printf("\n");
  230. return 0;
  231. }
  232. }
  233. #endif