123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373 |
- /*
- * Copyright (c) 2007 Alexey Vatchenko <[email protected]>
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
- #include <wchar.h>
- #include "utf8.h"
- #ifdef _WIN32
- #include <windows.h>
- #include "c99defs.h"
- static inline bool has_utf8_bom(const char *in_char)
- {
- uint8_t *in = (uint8_t *)in_char;
- return (in && in[0] == 0xef && in[1] == 0xbb && in[2] == 0xbf);
- }
- size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,
- size_t outsize, int flags)
- {
- int i_insize = (int)insize;
- int ret;
- if (i_insize == 0)
- i_insize = (int)strlen(in);
- /* prevent bom from being used in the string */
- if (has_utf8_bom(in)) {
- if (i_insize >= 3) {
- in += 3;
- i_insize -= 3;
- }
- }
- ret = MultiByteToWideChar(CP_UTF8, 0, in, i_insize, out, (int)outsize);
- UNUSED_PARAMETER(flags);
- return (ret > 0) ? (size_t)ret : 0;
- }
- size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,
- size_t outsize, int flags)
- {
- int i_insize = (int)insize;
- int ret;
- if (i_insize == 0)
- i_insize = (int)wcslen(in);
- ret = WideCharToMultiByte(CP_UTF8, 0, in, i_insize, out, (int)outsize,
- NULL, NULL);
- UNUSED_PARAMETER(flags);
- return (ret > 0) ? (size_t)ret : 0;
- }
- #else
- #define _NXT 0x80
- #define _SEQ2 0xc0
- #define _SEQ3 0xe0
- #define _SEQ4 0xf0
- #define _SEQ5 0xf8
- #define _SEQ6 0xfc
- #define _BOM 0xfeff
- static int wchar_forbidden(wchar_t sym);
- static int utf8_forbidden(unsigned char octet);
- static int wchar_forbidden(wchar_t sym)
- {
- /* Surrogate pairs */
- if (sym >= 0xd800 && sym <= 0xdfff)
- return -1;
- return 0;
- }
- static int utf8_forbidden(unsigned char octet)
- {
- switch (octet) {
- case 0xc0:
- case 0xc1:
- case 0xf5:
- case 0xff:
- return -1;
- }
- return 0;
- }
- /*
- * DESCRIPTION
- * This function translates UTF-8 string into UCS-4 string (all symbols
- * will be in local machine byte order).
- *
- * It takes the following arguments:
- * in - input UTF-8 string. It can be null-terminated.
- * insize - size of input string in bytes. If insize is 0,
- * function continues until a null terminator is reached.
- * out - result buffer for UCS-4 string. If out is NULL,
- * function returns size of result buffer.
- * outsize - size of out buffer in wide characters.
- *
- * RETURN VALUES
- * The function returns size of result buffer (in wide characters).
- * Zero is returned in case of error.
- *
- * CAVEATS
- * 1. If UTF-8 string contains zero symbols, they will be translated
- * as regular symbols.
- * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
- * when `out' is NULL and not NULL. It's because of special UTF-8
- * sequences which may result in forbidden (by RFC3629) UNICODE
- * characters. So, the caller must check return value every time and
- * not prepare buffer in advance (\0 terminate) but after calling this
- * function.
- */
- size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,
- size_t outsize, int flags)
- {
- unsigned char *p, *lim;
- wchar_t *wlim, high;
- size_t n, total, i, n_bits;
- if (in == NULL || (outsize == 0 && out != NULL))
- return 0;
- total = 0;
- p = (unsigned char *)in;
- lim = (insize != 0) ? (p + insize) : (unsigned char *)-1;
- wlim = out == NULL ? NULL : out + outsize;
- for (; p < lim; p += n) {
- if (!*p)
- break;
- if (utf8_forbidden(*p) != 0 && (flags & UTF8_IGNORE_ERROR) == 0)
- return 0;
- /*
- * Get number of bytes for one wide character.
- */
- n = 1; /* default: 1 byte. Used when skipping bytes. */
- if ((*p & 0x80) == 0)
- high = (wchar_t)*p;
- else if ((*p & 0xe0) == _SEQ2) {
- n = 2;
- high = (wchar_t)(*p & 0x1f);
- } else if ((*p & 0xf0) == _SEQ3) {
- n = 3;
- high = (wchar_t)(*p & 0x0f);
- } else if ((*p & 0xf8) == _SEQ4) {
- n = 4;
- high = (wchar_t)(*p & 0x07);
- } else if ((*p & 0xfc) == _SEQ5) {
- n = 5;
- high = (wchar_t)(*p & 0x03);
- } else if ((*p & 0xfe) == _SEQ6) {
- n = 6;
- high = (wchar_t)(*p & 0x01);
- } else {
- if ((flags & UTF8_IGNORE_ERROR) == 0)
- return 0;
- continue;
- }
- /* does the sequence header tell us truth about length? */
- if ((size_t)(lim - p) <= n - 1) {
- if ((flags & UTF8_IGNORE_ERROR) == 0)
- return 0;
- n = 1;
- continue; /* skip */
- }
- /*
- * Validate sequence.
- * All symbols must have higher bits set to 10xxxxxx
- */
- if (n > 1) {
- for (i = 1; i < n; i++) {
- if ((p[i] & 0xc0) != _NXT)
- break;
- }
- if (i != n) {
- if ((flags & UTF8_IGNORE_ERROR) == 0)
- return 0;
- n = 1;
- continue; /* skip */
- }
- }
- total++;
- if (out == NULL)
- continue;
- if (out >= wlim)
- return 0; /* no space left */
- *out = 0;
- n_bits = 0;
- for (i = 1; i < n; i++) {
- *out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
- n_bits += 6; /* 6 low bits in every byte */
- }
- *out |= high << n_bits;
- if (wchar_forbidden(*out) != 0) {
- if ((flags & UTF8_IGNORE_ERROR) == 0)
- return 0; /* forbidden character */
- else {
- total--;
- out--;
- }
- } else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
- total--;
- out--;
- }
- out++;
- }
- return total;
- }
- /*
- * DESCRIPTION
- * This function translates UCS-4 symbols (given in local machine
- * byte order) into UTF-8 string.
- *
- * It takes the following arguments:
- * in - input unicode string. It can be null-terminated.
- * insize - size of input string in wide characters. If insize is 0,
- * function continues until a null terminator is reaches.
- * out - result buffer for utf8 string. If out is NULL,
- * function returns size of result buffer.
- * outsize - size of result buffer.
- *
- * RETURN VALUES
- * The function returns size of result buffer (in bytes). Zero is returned
- * in case of error.
- *
- * CAVEATS
- * If UCS-4 string contains zero symbols, they will be translated
- * as regular symbols.
- */
- size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,
- size_t outsize, int flags)
- {
- wchar_t *w, *wlim, ch = 0;
- unsigned char *p, *lim, *oc;
- size_t total, n;
- if (in == NULL || (outsize == 0 && out != NULL))
- return 0;
- w = (wchar_t *)in;
- wlim = (insize != 0) ? (w + insize) : (wchar_t *)-1;
- p = (unsigned char *)out;
- lim = out == NULL ? NULL : p + outsize;
- total = 0;
- for (; w < wlim; w++) {
- if (!*w)
- break;
- if (wchar_forbidden(*w) != 0) {
- if ((flags & UTF8_IGNORE_ERROR) == 0)
- return 0;
- else
- continue;
- }
- if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
- continue;
- if (*w < 0) {
- if ((flags & UTF8_IGNORE_ERROR) == 0)
- return 0;
- continue;
- } else if (*w <= 0x0000007f)
- n = 1;
- else if (*w <= 0x000007ff)
- n = 2;
- else if (*w <= 0x0000ffff)
- n = 3;
- else if (*w <= 0x001fffff)
- n = 4;
- else if (*w <= 0x03ffffff)
- n = 5;
- else /* if (*w <= 0x7fffffff) */
- n = 6;
- total += n;
- if (out == NULL)
- continue;
- if ((size_t)(lim - p) <= n - 1)
- return 0; /* no space left */
- ch = *w;
- oc = (unsigned char *)&ch;
- switch (n) {
- case 1:
- *p = oc[0];
- break;
- case 2:
- p[1] = _NXT | (oc[0] & 0x3f);
- p[0] = _SEQ2 | (oc[0] >> 6) | ((oc[1] & 0x07) << 2);
- break;
- case 3:
- p[2] = _NXT | (oc[0] & 0x3f);
- p[1] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
- p[0] = _SEQ3 | ((oc[1] & 0xf0) >> 4);
- break;
- case 4:
- p[3] = _NXT | (oc[0] & 0x3f);
- p[2] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
- p[1] = _NXT | ((oc[1] & 0xf0) >> 4) |
- ((oc[2] & 0x03) << 4);
- p[0] = _SEQ4 | ((oc[2] & 0x1f) >> 2);
- break;
- case 5:
- p[4] = _NXT | (oc[0] & 0x3f);
- p[3] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
- p[2] = _NXT | ((oc[1] & 0xf0) >> 4) |
- ((oc[2] & 0x03) << 4);
- p[1] = _NXT | (oc[2] >> 2);
- p[0] = _SEQ5 | (oc[3] & 0x03);
- break;
- case 6:
- p[5] = _NXT | (oc[0] & 0x3f);
- p[4] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);
- p[3] = _NXT | (oc[1] >> 4) | ((oc[2] & 0x03) << 4);
- p[2] = _NXT | (oc[2] >> 2);
- p[1] = _NXT | (oc[3] & 0x3f);
- p[0] = _SEQ6 | ((oc[3] & 0x40) >> 6);
- break;
- }
- /*
- * NOTE: do not check here for forbidden UTF-8 characters.
- * They cannot appear here because we do proper conversion.
- */
- p += n;
- }
- return total;
- }
- #endif
|