utf8.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. /*
  2. * Copyright (c) 2007 Alexey Vatchenko <[email protected]>
  3. *
  4. * Permission to use, copy, modify, and/or distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #include <wchar.h>
  17. #include "utf8.h"
/*
 * Bit patterns used by both converters.
 *
 * NOTE(review): identifiers that start with an underscore followed by an
 * upper-case letter are reserved for the implementation in ISO C; a rename
 * would have to touch every use in this file.
 */
/* High bits (10xxxxxx) that mark a continuation byte. */
#define _NXT 0x80
/* Lead-byte prefix of a 2-byte sequence (110xxxxx). */
#define _SEQ2 0xc0
/* Lead-byte prefix of a 3-byte sequence (1110xxxx). */
#define _SEQ3 0xe0
/* Lead-byte prefix of a 4-byte sequence (11110xxx). */
#define _SEQ4 0xf0
/* Lead-byte prefix of a 5-byte sequence (111110xx). */
#define _SEQ5 0xf8
/* Lead-byte prefix of a 6-byte sequence (1111110x). */
#define _SEQ6 0xfc
/* Byte order mark; dropped by the converters when UTF8_SKIP_BOM is set. */
#define _BOM 0xfeff
/* File-local validators; both return -1 for a rejected value, 0 otherwise. */
static int wchar_forbidden(wchar_t sym);
static int utf8_forbidden(unsigned char octet);
  27. static int wchar_forbidden(wchar_t sym)
  28. {
  29. /* Surrogate pairs */
  30. if (sym >= 0xd800 && sym <= 0xdfff)
  31. return -1;
  32. return 0;
  33. }
  34. static int utf8_forbidden(unsigned char octet)
  35. {
  36. switch (octet) {
  37. case 0xc0:
  38. case 0xc1:
  39. case 0xf5:
  40. case 0xff:
  41. return -1;
  42. }
  43. return 0;
  44. }
  45. /*
  46. * DESCRIPTION
  47. * This function translates UTF-8 string into UCS-4 string (all symbols
  48. * will be in local machine byte order).
  49. *
  50. * It takes the following arguments:
  51. * in - input UTF-8 string. It can be null-terminated.
  52. * insize - size of input string in bytes. If insize is 0,
  53. * function continues until a null terminator is reached.
  54. * out - result buffer for UCS-4 string. If out is NULL,
  55. * function returns size of result buffer.
  56. * outsize - size of out buffer in wide characters.
  57. *
  58. * RETURN VALUES
  59. * The function returns size of result buffer (in wide characters).
  60. * Zero is returned in case of error.
  61. *
  62. * CAVEATS
  63. * 1. If UTF-8 string contains zero symbols, they will be translated
  64. * as regular symbols.
  65. * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
  66. * when `out' is NULL and not NULL. It's because of special UTF-8
  67. * sequences which may result in forbidden (by RFC3629) UNICODE
  68. * characters. So, the caller must check return value every time and
  69. * not prepare buffer in advance (\0 terminate) but after calling this
  70. * function.
  71. */
  72. size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,
  73. size_t outsize, int flags)
  74. {
  75. unsigned char *p, *lim;
  76. wchar_t *wlim, high;
  77. size_t n, total, i, n_bits;
  78. if (in == NULL || (outsize == 0 && out != NULL))
  79. return 0;
  80. total = 0;
  81. p = (unsigned char *)in;
  82. lim = (insize != 0) ? (p + insize) : (unsigned char*)-1;
  83. wlim = out + outsize;
  84. for (; p < lim; p += n) {
  85. if (!*p)
  86. break;
  87. if (utf8_forbidden(*p) != 0 &&
  88. (flags & UTF8_IGNORE_ERROR) == 0)
  89. return 0;
  90. /*
  91. * Get number of bytes for one wide character.
  92. */
  93. n = 1; /* default: 1 byte. Used when skipping bytes. */
  94. if ((*p & 0x80) == 0)
  95. high = (wchar_t)*p;
  96. else if ((*p & 0xe0) == _SEQ2) {
  97. n = 2;
  98. high = (wchar_t)(*p & 0x1f);
  99. } else if ((*p & 0xf0) == _SEQ3) {
  100. n = 3;
  101. high = (wchar_t)(*p & 0x0f);
  102. } else if ((*p & 0xf8) == _SEQ4) {
  103. n = 4;
  104. high = (wchar_t)(*p & 0x07);
  105. } else if ((*p & 0xfc) == _SEQ5) {
  106. n = 5;
  107. high = (wchar_t)(*p & 0x03);
  108. } else if ((*p & 0xfe) == _SEQ6) {
  109. n = 6;
  110. high = (wchar_t)(*p & 0x01);
  111. } else {
  112. if ((flags & UTF8_IGNORE_ERROR) == 0)
  113. return 0;
  114. continue;
  115. }
  116. /* does the sequence header tell us truth about length? */
  117. if ((size_t)(lim - p) <= n - 1) {
  118. if ((flags & UTF8_IGNORE_ERROR) == 0)
  119. return 0;
  120. n = 1;
  121. continue; /* skip */
  122. }
  123. /*
  124. * Validate sequence.
  125. * All symbols must have higher bits set to 10xxxxxx
  126. */
  127. if (n > 1) {
  128. for (i = 1; i < n; i++) {
  129. if ((p[i] & 0xc0) != _NXT)
  130. break;
  131. }
  132. if (i != n) {
  133. if ((flags & UTF8_IGNORE_ERROR) == 0)
  134. return 0;
  135. n = 1;
  136. continue; /* skip */
  137. }
  138. }
  139. total++;
  140. if (out == NULL)
  141. continue;
  142. if (out >= wlim)
  143. return 0; /* no space left */
  144. *out = 0;
  145. n_bits = 0;
  146. for (i = 1; i < n; i++) {
  147. *out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
  148. n_bits += 6; /* 6 low bits in every byte */
  149. }
  150. *out |= high << n_bits;
  151. if (wchar_forbidden(*out) != 0) {
  152. if ((flags & UTF8_IGNORE_ERROR) == 0)
  153. return 0; /* forbidden character */
  154. else {
  155. total--;
  156. out--;
  157. }
  158. } else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
  159. total--;
  160. out--;
  161. }
  162. out++;
  163. }
  164. return total;
  165. }
  166. /*
  167. * DESCRIPTION
  168. * This function translates UCS-4 symbols (given in local machine
  169. * byte order) into UTF-8 string.
  170. *
  171. * It takes the following arguments:
  172. * in - input unicode string. It can be null-terminated.
  173. * insize - size of input string in wide characters. If insize is 0,
  174. * function continues until a null terminator is reaches.
  175. * out - result buffer for utf8 string. If out is NULL,
  176. * function returns size of result buffer.
  177. * outsize - size of result buffer.
  178. *
  179. * RETURN VALUES
  180. * The function returns size of result buffer (in bytes). Zero is returned
  181. * in case of error.
  182. *
  183. * CAVEATS
  184. * If UCS-4 string contains zero symbols, they will be translated
  185. * as regular symbols.
  186. */
  187. size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,
  188. size_t outsize, int flags)
  189. {
  190. wchar_t *w, *wlim, ch = 0;
  191. unsigned char *p, *lim, *oc;
  192. size_t total, n;
  193. if (in == NULL || (outsize == 0 && out != NULL))
  194. return 0;
  195. w = (wchar_t *)in;
  196. wlim = (insize != 0) ? (w + insize) : (wchar_t*)-1;
  197. p = (unsigned char *)out;
  198. lim = p + outsize;
  199. total = 0;
  200. for (; w < wlim; w++) {
  201. if (!*w)
  202. break;
  203. if (wchar_forbidden(*w) != 0) {
  204. if ((flags & UTF8_IGNORE_ERROR) == 0)
  205. return 0;
  206. else
  207. continue;
  208. }
  209. if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
  210. continue;
  211. if (*w < 0) {
  212. if ((flags & UTF8_IGNORE_ERROR) == 0)
  213. return 0;
  214. continue;
  215. } else if (*w <= 0x0000007f)
  216. n = 1;
  217. else if (*w <= 0x000007ff)
  218. n = 2;
  219. else if (*w <= 0x0000ffff)
  220. n = 3;
  221. else if (*w <= 0x001fffff)
  222. n = 4;
  223. else if (*w <= 0x03ffffff)
  224. n = 5;
  225. else /* if (*w <= 0x7fffffff) */
  226. n = 6;
  227. total += n;
  228. if (out == NULL)
  229. continue;
  230. if ((size_t)(lim - p) <= n - 1)
  231. return 0; /* no space left */
  232. ch = *w;
  233. oc = (unsigned char *)&ch;
  234. switch (n) {
  235. case 1:
  236. *p = oc[0];
  237. break;
  238. case 2:
  239. p[1] = _NXT | (oc[0] & 0x3f);
  240. p[0] = _SEQ2 | (oc[0] >> 6) | ((oc[1] & 0x07) << 2);
  241. break;
  242. case 3:
  243. p[2] = _NXT | (oc[0] & 0x3f);
  244. p[1] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  245. p[0] = _SEQ3 | ((oc[1] & 0xf0) >> 4);
  246. break;
  247. case 4:
  248. p[3] = _NXT | (oc[0] & 0x3f);
  249. p[2] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  250. p[1] = _NXT | ((oc[1] & 0xf0) >> 4) |
  251. ((oc[2] & 0x03) << 4);
  252. p[0] = _SEQ4 | ((oc[2] & 0x1f) >> 2);
  253. break;
  254. case 5:
  255. p[4] = _NXT | (oc[0] & 0x3f);
  256. p[3] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  257. p[2] = _NXT | ((oc[1] & 0xf0) >> 4) |
  258. ((oc[2] & 0x03) << 4);
  259. p[1] = _NXT | (oc[2] >> 2);
  260. p[0] = _SEQ5 | (oc[3] & 0x03);
  261. break;
  262. case 6:
  263. p[5] = _NXT | (oc[0] & 0x3f);
  264. p[4] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
  265. p[3] = _NXT | (oc[1] >> 4) | ((oc[2] & 0x03) << 4);
  266. p[2] = _NXT | (oc[2] >> 2);
  267. p[1] = _NXT | (oc[3] & 0x3f);
  268. p[0] = _SEQ6 | ((oc[3] & 0x40) >> 6);
  269. break;
  270. }
  271. /*
  272. * NOTE: do not check here for forbidden UTF-8 characters.
  273. * They cannot appear here because we do proper convertion.
  274. */
  275. p += n;
  276. }
  277. return total;
  278. }