| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264 | 
/*
 * Decode a single UTF-8 character.
 */
 
- #include "putty.h"
 
- #include "misc.h"
 
- unsigned decode_utf8(BinarySource *src, DecodeUTF8Failure *err)
 
- {
 
-     /* Permit user to pass NULL as the err pointer */
 
-     DecodeUTF8Failure dummy;
 
-     if (!err) err = &dummy;
 
-     /* If the source has no byte available, this will return 0, which
 
-      * we'll return immediately and is a reasonable error return anyway */
 
-     unsigned char c = get_byte(src);
 
-     /* One-byte cases. */
 
-     if (c < 0x80) {
 
-         *err = DUTF8_SUCCESS;
 
-         return c;
 
-     } else if (c < 0xC0) {
 
-         *err = DUTF8_SPURIOUS_CONTINUATION;
 
-         return 0xFFFD;
 
-     }
 
-     unsigned long wc, min;
 
-     size_t ncont;
 
-     if (c < 0xE0) {
 
-         wc = c & 0x1F; ncont = 1; min = 0x80;
 
-     } else if (c < 0xF0) {
 
-         wc = c & 0x0F; ncont = 2; min = 0x800;
 
-     } else if (c < 0xF8) {
 
-         wc = c & 0x07; ncont = 3; min = 0x10000;
 
-     } else if (c < 0xFC) {
 
-         wc = c & 0x03; ncont = 4; min = 0x200000;
 
-     } else if (c < 0xFE) {
 
-         wc = c & 0x01; ncont = 5; min = 0x4000000;
 
-     } else {
 
-         *err = DUTF8_ILLEGAL_BYTE; /* FE or FF */
 
-         return 0xFFFD;
 
-     }
 
-     while (ncont-- > 0) {
 
-         if (!get_avail(src)) {
 
-             *err = DUTF8_E_OUT_OF_DATA;
 
-             return 0xFFFD;
 
-         }
 
-         unsigned char cont = get_byte(src);
 
-         if (!(0x80 <= cont && cont < 0xC0)) {
 
-             BinarySource_REWIND_TO(src, src->pos - 1);
 
-             *err = DUTF8_TRUNCATED_SEQUENCE;
 
-             return 0xFFFD;
 
-         }
 
-         wc = (wc << 6) | (cont & 0x3F);
 
-     }
 
-     if (wc < min) {
 
-         *err = DUTF8_OVERLONG_ENCODING;
 
-         return 0xFFFD;
 
-     }
 
-     if (0xD800 <= wc && wc < 0xE000) {
 
-         *err = DUTF8_ENCODED_SURROGATE;
 
-         return 0xFFFD;
 
-     }
 
-     if (wc > 0x10FFFF) {
 
-         *err = DUTF8_CODE_POINT_TOO_BIG;
 
-         return 0xFFFD;                 /* outside Unicode range */
 
-     }
 
-     *err = DUTF8_SUCCESS;
 
-     return wc;
 
- }
 
- const char *const decode_utf8_error_strings[DUTF8_N_FAILURE_CODES] = {
 
-     #define MSG_ENTRY(sym, string) string,
 
-     DECODE_UTF8_FAILURE_LIST(MSG_ENTRY)
 
-     #undef MSG_ENTRY
 
- };
 
- #ifdef TEST
 
- #include <stdio.h>
 
/*
 * Out-of-memory handler for the standalone test build: report the
 * problem on stderr and terminate with exit status 2.
 */
void out_of_memory(void)
{
    fputs("out of memory!\n", stderr);
    exit(2);
}
 
- static const char *const decode_utf8_error_syms[DUTF8_N_FAILURE_CODES] = {
 
-     #define SYM_ENTRY(sym, string) #sym,
 
-     DECODE_UTF8_FAILURE_LIST(SYM_ENTRY)
 
-     #undef SYM_ENTRY
 
- };
 
- bool dotest(const char *file, int line, const char *input, size_t ninput,
 
-             const unsigned long *chars, size_t nchars)
 
- {
 
-     BinarySource src[1];
 
-     BinarySource_BARE_INIT(src, input, ninput);
 
-     size_t noutput = 0;
 
-     printf("%s:%d: test start\n", file, line);
 
-     while (get_avail(src)) {
 
-         size_t before = src->pos;
 
-         DecodeUTF8Failure err;
 
-         unsigned long wc = decode_utf8(src, &err);
 
-         printf("%s:%d in+%"SIZEu" out+%"SIZEu":", file, line, before, noutput);
 
-         while (before < src->pos)
 
-             printf(" %02x", (unsigned)(unsigned char)(input[before++]));
 
-         printf(" -> U-%08lx %s\n", wc, decode_utf8_error_syms[err]);
 
-         if (noutput >= nchars) {
 
-             printf("%s:%d: FAIL: expected no further output\n", file, line);
 
-             return false;
 
-         }
 
-         if (chars[noutput] != wc) {
 
-             printf("%s:%d: FAIL: expected U-%08lx\n",
 
-                    file, line, chars[noutput]);
 
-             return false;
 
-         }
 
-         noutput++;
 
-         DecodeUTF8Failure expected_err;
 
-         if (wc == 0xFFFD) {
 
-             /* In the 'chars' array, any occurrence of 0xFFFD is followed
 
-              * by the expected error code */
 
-             assert(noutput < nchars && "bad test data");
 
-             expected_err = chars[noutput++];
 
-         } else {
 
-             /* Expect success status to go with any non-FFFD character */
 
-             expected_err = DUTF8_SUCCESS;
 
-         }
 
-         if (err != expected_err) {
 
-             printf("%s:%d: FAIL: expected %s\n", file, line,
 
-                    decode_utf8_error_syms[expected_err]);
 
-             return false;
 
-         }
 
-     }
 
-     if (noutput < nchars) {
 
-         printf("%s:%d: FAIL: expected further output\n", file, line);
 
-         return false;
 
-     }
 
-     printf("%s:%d: pass\n", file, line);
 
-     return true;
 
- }
 
/*
 * Run dotest on a string literal 'input' (its terminating NUL is not
 * included in the length passed) with the remaining arguments as the
 * expectation array. Updates the counters 'ntest' and 'npass', which
 * must be in scope at the point of use.
 */
#define DOTEST(input, ...) do {                                         \
        static const unsigned long expected_[] = { __VA_ARGS__ };       \
        ntest++;                                                        \
        npass += dotest(__FILE__, __LINE__, (input), sizeof(input)-1,   \
                        expected_, lenof(expected_)) ? 1 : 0;           \
    } while (0)
 
- int main(void)
 
- {
 
-     int ntest = 0, npass = 0;
 
-     DOTEST("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5",
 
-            0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5);
 
-     /* First sequence of each length */
 
-     DOTEST("\x00", 0x0000);
 
-     DOTEST("\xC2\x80", 0x0080);
 
-     DOTEST("\xE0\xA0\x80", 0x0800);
 
-     DOTEST("\xF0\x90\x80\x80", 0x00010000);
 
-     DOTEST("\xF8\x88\x80\x80\x80",
 
-            0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00200000 */
 
-     DOTEST("\xFC\x84\x80\x80\x80\x80",
 
-            0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x04000000 */
 
-     /* Last sequence of each length */
 
-     DOTEST("\x7F", 0x007F);
 
-     DOTEST("\xDF\xBF", 0x07FF);
 
-     DOTEST("\xEF\xBF\xBF", 0xFFFF);
 
-     DOTEST("\xF7\xBF\xBF\xBF",
 
-            0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x001FFFFF */
 
-     DOTEST("\xFB\xBF\xBF\xBF\xBF",
 
-            0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x03FFFFFF */
 
-     DOTEST("\xFD\xBF\xBF\xBF\xBF\xBF",
 
-            0xFFFD, DUTF8_CODE_POINT_TOO_BIG); /* would be 0x7FFFFFFF */
 
-     /* Endpoints of the surrogate range */
 
-     DOTEST("\xED\x9F\xBF", 0xD7FF);
 
-     DOTEST("\xED\xA0\x80", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xD800 */
 
-     DOTEST("\xED\xBF\xBF", 0xFFFD, DUTF8_ENCODED_SURROGATE); /* 0xDFFF */
 
-     DOTEST("\xEE\x80\x80", 0xE000);
 
-     /* REPLACEMENT CHARACTER itself */
 
-     DOTEST("\xEF\xBF\xBD", 0xFFFD, DUTF8_SUCCESS); /* FFFD but no error! */
 
-     /* Endpoints of the legal Unicode range */
 
-     DOTEST("\xF4\x8F\xBF\xBF", 0x0010FFFF);
 
-     DOTEST("\xF4\x90\x80\x80", 0xFFFD,
 
-            DUTF8_CODE_POINT_TOO_BIG); /* would be 0x00110000 */
 
-     /* Spurious continuation bytes, each shown as a separate failure */
 
-     DOTEST("\x80 \x81\x82 \xBD\xBE\xBF",
 
-            0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
 
-            0x0020,
 
-            0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
 
-            0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
 
-            0x0020,
 
-            0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
 
-            0xFFFD, DUTF8_SPURIOUS_CONTINUATION,
 
-            0xFFFD, DUTF8_SPURIOUS_CONTINUATION);
 
-     /* Truncated sequences, each shown as just one failure. The last
 
-      * one gets a different error code because the sequence is
 
-      * interrupted by the end of the string instead of another
 
-      * character, so that if the string were a prefix of a longer
 
-      * chunk of data then that would not _necessarily_ indicate an
 
-      * error */
 
-     DOTEST("\xC2\xE0\xA0\xF0\x90\x80\xF8\x88\x80\x80\xFC\x84\x80\x80\x80",
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0xFFFD, DUTF8_E_OUT_OF_DATA);
 
-     DOTEST("\xC2 \xE0\xA0 \xF0\x90\x80 \xF8\x88\x80\x80 \xFC\x84\x80\x80\x80",
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0x0020,
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0x0020,
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0x0020,
 
-            0xFFFD, DUTF8_TRUNCATED_SEQUENCE,
 
-            0x0020,
 
-            0xFFFD, DUTF8_E_OUT_OF_DATA);
 
-     /* Illegal bytes */
 
-     DOTEST("\xFE\xFF", 0xFFFD, DUTF8_ILLEGAL_BYTE, 0xFFFD, DUTF8_ILLEGAL_BYTE);
 
-     /* Overlong sequences */
 
-     DOTEST("\xC1\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xE0\x9F\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xF0\x8F\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xF8\x87\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xFC\x83\xBF\xBF\xBF\xBF", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xC0\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xE0\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xF0\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xF8\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     DOTEST("\xFC\x80\x80\x80\x80\x80", 0xFFFD, DUTF8_OVERLONG_ENCODING);
 
-     printf("%d tests %d passed", ntest, npass);
 
-     if (npass < ntest) {
 
-         printf(" %d FAILED\n", ntest-npass);
 
-         return 1;
 
-     } else {
 
-         printf("\n");
 
-         return 0;
 
-     }
 
- }
 
- #endif
 
 
  |