utf8.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. /*
  2. * Copyright (c) 2007 Alexey Vatchenko <[email protected]>
  3. *
  4. * Permission to use, copy, modify, and/or distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #include <wchar.h>
  17. #include "utf8.h"
  18. #ifdef _WIN32
  19. #include <windows.h>
  20. #include "c99defs.h"
  21. static inline bool has_utf8_bom(const char *in_char)
  22. {
  23. uint8_t *in = (uint8_t *)in_char;
  24. return (in && in[0] == 0xef && in[1] == 0xbb && in[2] == 0xbf);
  25. }
  26. size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,
  27. size_t outsize, int flags)
  28. {
  29. int i_insize = (int)insize;
  30. int ret;
  31. if (i_insize == 0)
  32. i_insize = (int)strlen(in);
  33. /* prevent bom from being used in the string */
  34. if (has_utf8_bom(in)) {
  35. if (i_insize >= 3) {
  36. in += 3;
  37. i_insize -= 3;
  38. }
  39. }
  40. ret = MultiByteToWideChar(CP_UTF8, 0, in, i_insize, out, (int)outsize);
  41. UNUSED_PARAMETER(flags);
  42. return (ret > 0) ? (size_t)ret : 0;
  43. }
  44. size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,
  45. size_t outsize, int flags)
  46. {
  47. int i_insize = (int)insize;
  48. int ret;
  49. if (i_insize == 0)
  50. i_insize = (int)wcslen(in);
  51. ret = WideCharToMultiByte(CP_UTF8, 0, in, i_insize, out, (int)outsize,
  52. NULL, NULL);
  53. UNUSED_PARAMETER(flags);
  54. return (ret > 0) ? (size_t)ret : 0;
  55. }
  56. #else
/*
 * Bit patterns used by the converters below to classify and build UTF-8
 * octets, plus the code point that is treated as the byte order mark.
 */
/* continuation octet: 10xxxxxx */
#define _NXT 0x80
/* lead octet of a 2-byte sequence: 110xxxxx */
#define _SEQ2 0xc0
/* lead octet of a 3-byte sequence: 1110xxxx */
#define _SEQ3 0xe0
/* lead octet of a 4-byte sequence: 11110xxx */
#define _SEQ4 0xf0
/* lead octet of a 5-byte sequence: 111110xx */
#define _SEQ5 0xf8
/* lead octet of a 6-byte sequence: 1111110x */
#define _SEQ6 0xfc
/* wide character that is dropped when UTF8_SKIP_BOM is set */
#define _BOM 0xfeff
/*
 * Forward declarations of the file-local validation helpers.  Both return
 * -1 when the value is rejected and 0 otherwise.
 */
static int wchar_forbidden(wchar_t sym);
static int utf8_forbidden(unsigned char octet);
  66. static int wchar_forbidden(wchar_t sym)
  67. {
  68. /* Surrogate pairs */
  69. if (sym >= 0xd800 && sym <= 0xdfff)
  70. return -1;
  71. return 0;
  72. }
  73. static int utf8_forbidden(unsigned char octet)
  74. {
  75. switch (octet) {
  76. case 0xc0:
  77. case 0xc1:
  78. case 0xf5:
  79. case 0xff:
  80. return -1;
  81. }
  82. return 0;
  83. }
  84. /*
  85. * DESCRIPTION
  86. * This function translates UTF-8 string into UCS-4 string (all symbols
  87. * will be in local machine byte order).
  88. *
  89. * It takes the following arguments:
  90. * in - input UTF-8 string. It can be null-terminated.
  91. * insize - size of input string in bytes. If insize is 0,
  92. * function continues until a null terminator is reached.
  93. * out - result buffer for UCS-4 string. If out is NULL,
  94. * function returns size of result buffer.
  95. * outsize - size of out buffer in wide characters.
  96. *
  97. * RETURN VALUES
  98. * The function returns size of result buffer (in wide characters).
  99. * Zero is returned in case of error.
  100. *
  101. * CAVEATS
  102. * 1. If UTF-8 string contains zero symbols, they will be translated
  103. * as regular symbols.
  104. * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
  105. * when `out' is NULL and not NULL. It's because of special UTF-8
  106. * sequences which may result in forbidden (by RFC3629) UNICODE
  107. * characters. So, the caller must check return value every time and
  108. * not prepare buffer in advance (\0 terminate) but after calling this
  109. * function.
  110. */
  111. size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,
  112. size_t outsize, int flags)
  113. {
  114. unsigned char *p, *lim;
  115. wchar_t *wlim, high;
  116. size_t n, total, i, n_bits;
  117. if (in == NULL || (outsize == 0 && out != NULL))
  118. return 0;
  119. total = 0;
  120. p = (unsigned char *)in;
  121. lim = (insize != 0) ? (p + insize) : (unsigned char *)-1;
  122. wlim = out == NULL ? NULL : out + outsize;
  123. for (; p < lim; p += n) {
  124. if (!*p && insize == 0)
  125. break;
  126. if (utf8_forbidden(*p) != 0 && (flags & UTF8_IGNORE_ERROR) == 0)
  127. return 0;
  128. /*
  129. * Get number of bytes for one wide character.
  130. */
  131. n = 1; /* default: 1 byte. Used when skipping bytes. */
  132. if ((*p & 0x80) == 0)
  133. high = (wchar_t)*p;
  134. else if ((*p & 0xe0) == _SEQ2) {
  135. n = 2;
  136. high = (wchar_t)(*p & 0x1f);
  137. } else if ((*p & 0xf0) == _SEQ3) {
  138. n = 3;
  139. high = (wchar_t)(*p & 0x0f);
  140. } else if ((*p & 0xf8) == _SEQ4) {
  141. n = 4;
  142. high = (wchar_t)(*p & 0x07);
  143. } else if ((*p & 0xfc) == _SEQ5) {
  144. n = 5;
  145. high = (wchar_t)(*p & 0x03);
  146. } else if ((*p & 0xfe) == _SEQ6) {
  147. n = 6;
  148. high = (wchar_t)(*p & 0x01);
  149. } else {
  150. if ((flags & UTF8_IGNORE_ERROR) == 0)
  151. return 0;
  152. continue;
  153. }
  154. /* does the sequence header tell us truth about length? */
  155. if ((size_t)(lim - p) <= n - 1) {
  156. if ((flags & UTF8_IGNORE_ERROR) == 0)
  157. return 0;
  158. n = 1;
  159. continue; /* skip */
  160. }
  161. /*
  162. * Validate sequence.
  163. * All symbols must have higher bits set to 10xxxxxx
  164. */
  165. if (n > 1) {
  166. for (i = 1; i < n; i++) {
  167. if ((p[i] & 0xc0) != _NXT)
  168. break;
  169. }
  170. if (i != n) {
  171. if ((flags & UTF8_IGNORE_ERROR) == 0)
  172. return 0;
  173. n = 1;
  174. continue; /* skip */
  175. }
  176. }
  177. total++;
  178. if (out == NULL)
  179. continue;
  180. if (out >= wlim)
  181. return 0; /* no space left */
  182. *out = 0;
  183. n_bits = 0;
  184. for (i = 1; i < n; i++) {
  185. *out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
  186. n_bits += 6; /* 6 low bits in every byte */
  187. }
  188. *out |= high << n_bits;
  189. if (wchar_forbidden(*out) != 0) {
  190. if ((flags & UTF8_IGNORE_ERROR) == 0)
  191. return 0; /* forbidden character */
  192. else {
  193. total--;
  194. out--;
  195. }
  196. } else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
  197. total--;
  198. out--;
  199. }
  200. out++;
  201. }
  202. return total;
  203. }
  204. /*
  205. * DESCRIPTION
  206. * This function translates UCS-4 symbols (given in local machine
  207. * byte order) into UTF-8 string.
  208. *
  209. * It takes the following arguments:
  210. * in - input unicode string. It can be null-terminated.
  211. * insize - size of input string in wide characters. If insize is 0,
  212. * function continues until a null terminator is reaches.
  213. * out - result buffer for utf8 string. If out is NULL,
  214. * function returns size of result buffer.
  215. * outsize - size of result buffer.
  216. *
  217. * RETURN VALUES
  218. * The function returns size of result buffer (in bytes). Zero is returned
  219. * in case of error.
  220. *
  221. * CAVEATS
  222. * If UCS-4 string contains zero symbols, they will be translated
  223. * as regular symbols.
  224. */
  225. size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,
  226. size_t outsize, int flags)
  227. {
  228. wchar_t *w, *wlim, ch = 0;
  229. unsigned char *p, *lim, *oc;
  230. size_t total, n;
  231. if (in == NULL || (outsize == 0 && out != NULL))
  232. return 0;
  233. w = (wchar_t *)in;
  234. wlim = (insize != 0) ? (w + insize) : (wchar_t *)-1;
  235. p = (unsigned char *)out;
  236. lim = out == NULL ? NULL : p + outsize;
  237. total = 0;
  238. for (; w < wlim; w++) {
  239. if (!*w && insize == 0)
  240. break;
  241. if (wchar_forbidden(*w) != 0) {
  242. if ((flags & UTF8_IGNORE_ERROR) == 0)
  243. return 0;
  244. else
  245. continue;
  246. }
  247. if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
  248. continue;
  249. if (*w < 0) {
  250. if ((flags & UTF8_IGNORE_ERROR) == 0)
  251. return 0;
  252. continue;
  253. } else if (*w <= 0x0000007f)
  254. n = 1;
  255. else if (*w <= 0x000007ff)
  256. n = 2;
  257. else if (*w <= 0x0000ffff)
  258. n = 3;
  259. else if (*w <= 0x001fffff)
  260. n = 4;
  261. else if (*w <= 0x03ffffff)
  262. n = 5;
  263. else /* if (*w <= 0x7fffffff) */
  264. n = 6;
  265. total += n;
  266. if (out == NULL)
  267. continue;
  268. if ((size_t)(lim - p) <= n - 1)
  269. return 0; /* no space left */
  270. ch = *w;
  271. oc = (unsigned char *)&ch;
  272. switch (n) {
  273. case 1:
  274. *p = oc[0];
  275. break;
  276. case 2:
  277. p[1] = _NXT | (oc[0] & 0x3f);
  278. p[0] = _SEQ2 | (oc[0] >> 6) | ((oc[1] & 0x07) << 2);
  279. break;
  280. case 3:
  281. p[2] = _NXT | (oc[0] & 0x3f);
  282. p[1] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  283. p[0] = _SEQ3 | ((oc[1] & 0xf0) >> 4);
  284. break;
  285. case 4:
  286. p[3] = _NXT | (oc[0] & 0x3f);
  287. p[2] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  288. p[1] = _NXT | ((oc[1] & 0xf0) >> 4) |
  289. ((oc[2] & 0x03) << 4);
  290. p[0] = _SEQ4 | ((oc[2] & 0x1f) >> 2);
  291. break;
  292. case 5:
  293. p[4] = _NXT | (oc[0] & 0x3f);
  294. p[3] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  295. p[2] = _NXT | ((oc[1] & 0xf0) >> 4) |
  296. ((oc[2] & 0x03) << 4);
  297. p[1] = _NXT | (oc[2] >> 2);
  298. p[0] = _SEQ5 | (oc[3] & 0x03);
  299. break;
  300. case 6:
  301. p[5] = _NXT | (oc[0] & 0x3f);
  302. p[4] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  303. p[3] = _NXT | (oc[1] >> 4) | ((oc[2] & 0x03) << 4);
  304. p[2] = _NXT | (oc[2] >> 2);
  305. p[1] = _NXT | (oc[3] & 0x3f);
  306. p[0] = _SEQ6 | ((oc[3] & 0x40) >> 6);
  307. break;
  308. }
  309. /*
  310. * NOTE: do not check here for forbidden UTF-8 characters.
  311. * They cannot appear here because we do proper conversion.
  312. */
  313. p += n;
  314. }
  315. return total;
  316. }
  317. #endif