xmlrpc_utf8.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. /* Copyright (C) 2001 by Eric Kidd. All rights reserved.
  2. **
  3. ** Redistribution and use in source and binary forms, with or without
  4. ** modification, are permitted provided that the following conditions
  5. ** are met:
  6. ** 1. Redistributions of source code must retain the above copyright
  7. ** notice, this list of conditions and the following disclaimer.
  8. ** 2. Redistributions in binary form must reproduce the above copyright
  9. ** notice, this list of conditions and the following disclaimer in the
  10. ** documentation and/or other materials provided with the distribution.
  11. ** 3. The name of the author may not be used to endorse or promote products
  12. ** derived from this software without specific prior written permission.
  13. **
  14. ** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15. ** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17. ** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18. ** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19. ** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20. ** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21. ** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22. ** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23. ** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24. ** SUCH DAMAGE. */
  25. /*=========================================================================
  26. ** XML-RPC UTF-8 Utilities
  27. **=========================================================================
  28. ** Routines for validating, encoding and decoding UTF-8 data. We try to
  29. ** be very, very strict about invalid UTF-8 data.
  30. **
  31. ** All of the code in this file assumes that your machine represents
  32. ** wchar_t as a 16-bit (or wider) character containing UCS-2 data. If this
  33. ** assumption is incorrect, you may need to replace this file.
  34. **
  35. ** For lots of information on Unicode and UTF-8 decoding, see:
  36. ** http://www.cl.cam.ac.uk/~mgk25/unicode.html
  37. */
  38. #include "xmlrpc_config.h"
  39. #include "xmlrpc.h"
  40. #ifdef HAVE_UNICODE_WCHAR
  41. /*=========================================================================
  42. ** Tables and Constants
  43. **=========================================================================
  44. ** We use a variety of tables and constants to help decode and validate
  45. ** UTF-8 data.
  46. */
  /* The number of bytes in a UTF-8 sequence starting with the byte used
  ** as the array index. A zero entry indicates an illegal initial byte
  ** (a continuation byte 0x80-0xBF, or 0xFE / 0xFF).
  ** This table was generated using a Perl script and information from the
  ** UTF-8 standard.
  **
  ** The table is only ever read, so it is declared const.
  **
  ** Fredrik Lundh's UTF-8 decoder in Python 2.0 uses a similar table. But
  ** since Python 2.0 has the icky CNRI license, I regenerated this
  ** table from scratch and wrote my own decoder. */
  static const unsigned char utf8_seq_length[256] = {
      /* 0x00-0x7F: single-byte (ASCII) characters */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x80-0xBF: continuation bytes, never legal as an initial byte */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0xC0-0xDF: two-byte sequences */
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      /* 0xE0-0xEF: three-byte sequences */
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      /* 0xF0-0xF7: four bytes; 0xF8-0xFB: five; 0xFC-0xFD: six;
      ** 0xFE-0xFF: illegal */
      4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  };
  /* The minimum legal character value for a UTF-8 sequence of the given
  ** length (the length is the array index). We have to check this to avoid
  ** accepting "overlong" UTF-8 sequences, which use more bytes than
  ** necessary to encode a given character. Such sequences are commonly used
  ** by evil people to bypass filters and security checks. This table is
  ** based on the UTF-8-test.txt file by Markus Kuhn <mkuhn@acm.org>.
  **
  ** The table is only ever read, so it is declared const. Its size is
  ** taken from the initializer, and every entry ends with a comma, so the
  ** disabled UCS-4 entries below can be switched on without further edits
  ** to this declaration. */
  static const wchar_t utf8_min_char_for_length[] = {
      0,          /* Length 0: Not used (meaningless) */
      0x0000,     /* Length 1: Not used (special-cased) */
      0x0080,     /* Length 2 */
      0x0800,     /* Length 3 */
  #if 0
      /* These are only useful on systems where wchar_t is 32-bits wide
      ** and supports full UCS-4. */
      0x00010000, /* Length 4 */
      0x00200000, /* Length 5 */
      0x04000000, /* Length 6 */
  #endif
  };
  92. /* This is the maximum legal 16-bit (UCS-2) character. Again, this
  93. ** information is based on UTF-8-test.txt. */
  94. #define UCS2_MAX_LEGAL_CHARACTER (0xFFFD)
  95. /* First and last UTF-16 surrogate characters. These are *not* legal UCS-2
  96. ** characters--they're used to code for UCS-4 characters when using
  97. ** UTF-16. They should never appear in decoded UTF-8 data! Again, these
  98. ** could hypothetically be used to bypass security measures on some machines.
  99. ** Based on UTF-8-test.txt. */
  100. #define UTF16_FIRST_SURROGATE (0xD800)
  101. #define UTF16_LAST_SURROGATE (0xDFFF)
  102. /* Is the byte 'c' a UTF-8 continuation byte (bit pattern 10xxxxxx)? */
  103. #define IS_CONTINUATION(c) (((c) & 0xC0) == 0x80)
  104. /* Maximum number of UTF-8 bytes needed to encode one supported character. */
  105. #define MAX_ENCODED_BYTES (3)
  106. /*=========================================================================
  107. ** decode_utf8
  108. **=========================================================================
  109. ** Internal routine which decodes (or validates) a UTF-8 string.
  110. ** To validate, set io_buff and out_buff_len to NULL. To decode, allocate
  111. ** a sufficiently large buffer, pass it as io_buff, and pass a pointer as
  112. ** as out_buff_len. The data will be written to the buffer, and the
  113. ** length to out_buff_len.
  114. **
  115. ** We assume that wchar_t holds a single UCS-2 character in native-endian
  116. ** byte ordering.
  117. */
  118. static void
  119. decode_utf8(xmlrpc_env * const env,
  120. const char * const utf8_data,
  121. size_t const utf8_len,
  122. wchar_t * const io_buff,
  123. size_t * const out_buff_len) {
  124. size_t i, length, out_pos;
  125. char init, con1, con2;
  126. wchar_t wc;
  127. XMLRPC_ASSERT_ENV_OK(env);
  128. XMLRPC_ASSERT_PTR_OK(utf8_data);
  129. XMLRPC_ASSERT((!io_buff && !out_buff_len) ||
  130. (io_buff && out_buff_len));
  131. /* Suppress GCC warning about possibly undefined variable. */
  132. wc = 0;
  133. i = 0;
  134. out_pos = 0;
  135. while (i < utf8_len) {
  136. init = utf8_data[i];
  137. if ((init & 0x80) == 0x00) {
  138. /* Convert ASCII character to wide character. */
  139. wc = init;
  140. i++;
  141. } else {
  142. /* Look up the length of this UTF-8 sequence. */
  143. length = utf8_seq_length[(unsigned char) init];
  144. /* Check to make sure we have enough bytes to convert. */
  145. if (i + length > utf8_len)
  146. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  147. "Truncated UTF-8 sequence");
  148. /* Decode a multibyte UTF-8 sequence. */
  149. switch (length) {
  150. case 0:
  151. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  152. "Invalid UTF-8 initial byte");
  153. case 2:
  154. /* 110xxxxx 10xxxxxx */
  155. con1 = utf8_data[i+1];
  156. if (!IS_CONTINUATION(con1))
  157. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  158. "UTF-8 sequence too short");
  159. wc = ((((wchar_t) (init & 0x1F)) << 6) |
  160. (((wchar_t) (con1 & 0x3F))));
  161. break;
  162. case 3:
  163. /* 1110xxxx 10xxxxxx 10xxxxxx */
  164. con1 = utf8_data[i+1];
  165. con2 = utf8_data[i+2];
  166. if (!IS_CONTINUATION(con1) || !IS_CONTINUATION(con2))
  167. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  168. "UTF-8 sequence too short");
  169. wc = ((((wchar_t) (init & 0x0F)) << 12) |
  170. (((wchar_t) (con1 & 0x3F)) << 6) |
  171. (((wchar_t) (con2 & 0x3F))));
  172. break;
  173. case 4:
  174. /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  175. case 5:
  176. /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  177. case 6:
  178. /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  179. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  180. "UCS-4 characters not supported");
  181. default:
  182. XMLRPC_ASSERT("Error in UTF-8 decoder tables");
  183. }
  184. /* Advance to the end of the sequence. */
  185. i += length;
  186. /* Check for illegal UCS-2 characters. */
  187. if (wc > UCS2_MAX_LEGAL_CHARACTER)
  188. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  189. "UCS-2 characters > U+FFFD are illegal");
  190. /* Check for UTF-16 surrogates. */
  191. if (UTF16_FIRST_SURROGATE <= wc && wc <= UTF16_LAST_SURROGATE)
  192. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  193. "UTF-16 surrogates may not appear in UTF-8 data");
  194. /* Check for overlong sequences. */
  195. if (wc < utf8_min_char_for_length[length])
  196. XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
  197. "Overlong UTF-8 sequence not allowed");
  198. }
  199. /* If we have a buffer, write our character to it. */
  200. if (io_buff) {
  201. io_buff[out_pos++] = wc;
  202. }
  203. }
  204. /* Record the number of characters we found. */
  205. if (out_buff_len)
  206. *out_buff_len = out_pos;
  207. cleanup:
  208. if (env->fault_occurred) {
  209. if (out_buff_len)
  210. *out_buff_len = 0;
  211. }
  212. }
  213. /*=========================================================================
  214. ** xmlrpc_validate_utf8
  215. **=========================================================================
  216. ** Make sure that a UTF-8 string is valid.
  217. */
  218. void
  219. xmlrpc_validate_utf8 (xmlrpc_env * const env,
  220. const char * const utf8_data,
  221. size_t const utf8_len) {
  222. decode_utf8(env, utf8_data, utf8_len, NULL, NULL);
  223. }
  224. /*=========================================================================
  225. ** xmlrpc_utf8_to_wcs
  226. **=========================================================================
  227. ** Decode UTF-8 string to a "wide character string". This function
  228. ** returns an xmlrpc_mem_block with an element type of wchar_t. Don't
  229. ** try to intepret the block in a bytewise fashion--it won't work in
  230. ** any useful or portable fashion.
  231. */
  232. xmlrpc_mem_block *xmlrpc_utf8_to_wcs (xmlrpc_env *env,
  233. char *utf8_data,
  234. size_t utf8_len)
  235. {
  236. xmlrpc_mem_block *output;
  237. size_t wcs_length;
  238. /* Allocate a memory block large enough to hold any possible output.
  239. ** We assume that each byte of the input may decode to a whcar_t. */
  240. output = XMLRPC_TYPED_MEM_BLOCK_NEW(wchar_t, env, utf8_len);
  241. XMLRPC_FAIL_IF_FAULT(env);
  242. /* Decode the UTF-8 data. */
  243. decode_utf8(env, utf8_data, utf8_len,
  244. XMLRPC_TYPED_MEM_BLOCK_CONTENTS(wchar_t, output),
  245. &wcs_length);
  246. XMLRPC_FAIL_IF_FAULT(env);
  247. /* Make sure we didn't overrun our buffer. */
  248. XMLRPC_ASSERT(wcs_length <= utf8_len);
  249. /* Correct the length of the memory block. */
  250. XMLRPC_TYPED_MEM_BLOCK_RESIZE(wchar_t, env, output, wcs_length);
  251. XMLRPC_FAIL_IF_FAULT(env);
  252. cleanup:
  253. if (env->fault_occurred) {
  254. if (output)
  255. xmlrpc_mem_block_free(output);
  256. return NULL;
  257. }
  258. return output;
  259. }
  260. /*=========================================================================
  261. ** xmlrpc_utf8_to_wcs
  262. **=========================================================================
  263. ** Encode a "wide character string" as UTF-8.
  264. */
  265. xmlrpc_mem_block *xmlrpc_wcs_to_utf8 (xmlrpc_env *env,
  266. wchar_t *wcs_data,
  267. size_t wcs_len)
  268. {
  269. size_t estimate, bytes_used, i;
  270. xmlrpc_mem_block *output;
  271. unsigned char *buffer;
  272. wchar_t wc;
  273. int cwc;
  274. XMLRPC_ASSERT_ENV_OK(env);
  275. XMLRPC_ASSERT_PTR_OK(wcs_data);
  276. /* Allocate a memory block large enough to hold any possible output.
  277. ** We assume that every wchar might encode to the maximum length. */
  278. estimate = wcs_len * MAX_ENCODED_BYTES;
  279. output = XMLRPC_TYPED_MEM_BLOCK_NEW(char, env, estimate);
  280. XMLRPC_FAIL_IF_FAULT(env);
  281. /* Output our characters. */
  282. buffer = (unsigned char*) XMLRPC_TYPED_MEM_BLOCK_CONTENTS(char, output);
  283. bytes_used = 0;
  284. for (i = 0; i < wcs_len; i++) {
  285. wc = wcs_data[i];
  286. cwc = wc;
  287. if (cwc <= 0x007F) {
  288. buffer[bytes_used++] = wc & 0x7F;
  289. } else if (cwc <= 0x07FF) {
  290. /* 110xxxxx 10xxxxxx */
  291. buffer[bytes_used++] = 0xC0 | (wc >> 6);
  292. buffer[bytes_used++] = 0x80 | (wc & 0x3F);
  293. } else if (cwc <= 0xFFFF) {
  294. /* 1110xxxx 10xxxxxx 10xxxxxx */
  295. buffer[bytes_used++] = 0xE0 | (wc >> 12);
  296. buffer[bytes_used++] = 0x80 | ((wc >> 6) & 0x3F);
  297. buffer[bytes_used++] = 0x80 | (wc & 0x3F);
  298. } else {
  299. XMLRPC_FAIL(env, XMLRPC_INTERNAL_ERROR,
  300. "Don't know how to encode UCS-4 characters yet");
  301. }
  302. }
  303. /* Make sure we didn't overrun our buffer. */
  304. XMLRPC_ASSERT(bytes_used <= estimate);
  305. /* Correct the length of the memory block. */
  306. XMLRPC_TYPED_MEM_BLOCK_RESIZE(char, env, output, bytes_used);
  307. XMLRPC_FAIL_IF_FAULT(env);
  308. cleanup:
  309. if (env->fault_occurred) {
  310. if (output)
  311. xmlrpc_mem_block_free(output);
  312. return NULL;
  313. }
  314. return output;
  315. }
  316. #endif /* HAVE_UNICODE_WCHAR */