| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373 | /* * Copyright (c) 2007 Alexey Vatchenko <[email protected]> * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */#include <wchar.h>#include "utf8.h"#ifdef _WIN32#include <windows.h>#include "c99defs.h"static inline bool has_utf8_bom(const char *in_char){	uint8_t *in = (uint8_t *)in_char;	return (in && in[0] == 0xef && in[1] == 0xbb && in[2] == 0xbf);}size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,		     size_t outsize, int flags){	int i_insize = (int)insize;	int ret;	if (i_insize == 0)		i_insize = (int)strlen(in);	/* prevent bom from being used in the string */	if (has_utf8_bom(in)) {		if (i_insize >= 3) {			in += 3;			i_insize -= 3;		}	}	ret = MultiByteToWideChar(CP_UTF8, 0, in, i_insize, out, (int)outsize);	UNUSED_PARAMETER(flags);	return (ret > 0) ? (size_t)ret : 0;}size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,		     size_t outsize, int flags){	int i_insize = (int)insize;	int ret;	if (i_insize == 0)		i_insize = (int)wcslen(in);	ret = WideCharToMultiByte(CP_UTF8, 0, in, i_insize, out, (int)outsize,				  NULL, NULL);	UNUSED_PARAMETER(flags);	return (ret > 0) ? (size_t)ret : 0;}#else#define _NXT 0x80#define _SEQ2 0xc0#define _SEQ3 0xe0#define _SEQ4 0xf0#define _SEQ5 0xf8#define _SEQ6 0xfc#define _BOM 0xfeffstatic int wchar_forbidden(wchar_t sym);static int utf8_forbidden(unsigned char octet);static int wchar_forbidden(wchar_t sym){	/* Surrogate pairs */	if (sym >= 0xd800 && sym <= 0xdfff)		return -1;	return 0;}static int utf8_forbidden(unsigned char octet){	switch (octet) {	case 0xc0:	case 0xc1:	case 0xf5:	case 0xff:		return -1;	}	return 0;}/* * DESCRIPTION *	This function translates UTF-8 string into UCS-4 string (all symbols *	will be in local machine byte order). * *	It takes the following arguments: *	in	- input UTF-8 string. It can be null-terminated. *	insize	- size of input string in bytes.  If insize is 0, *	        function continues until a null terminator is reached. *	out	- result buffer for UCS-4 string. If out is NULL, *		function returns size of result buffer. *	outsize - size of out buffer in wide characters. * * RETURN VALUES *	The function returns size of result buffer (in wide characters). *	Zero is returned in case of error. * * CAVEATS *	1. If UTF-8 string contains zero symbols, they will be translated *	   as regular symbols. *	2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary *	   when `out' is NULL and not NULL. It's because of special UTF-8 *	   sequences which may result in forbidden (by RFC3629) UNICODE *	   characters.  So, the caller must check return value every time and *	   not prepare buffer in advance (\0 terminate) but after calling this *	   function. */size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,		     size_t outsize, int flags){	unsigned char *p, *lim;	wchar_t *wlim, high;	size_t n, total, i, n_bits;	if (in == NULL || (outsize == 0 && out != NULL))		return 0;	total = 0;	p = (unsigned char *)in;	lim = (insize != 0) ? (p + insize) : (unsigned char *)-1;	wlim = out == NULL ? NULL : out + outsize;	for (; p < lim; p += n) {		if (!*p)			break;		if (utf8_forbidden(*p) != 0 && (flags & UTF8_IGNORE_ERROR) == 0)			return 0;		/*		 * Get number of bytes for one wide character.		 */		n = 1; /* default: 1 byte. Used when skipping bytes. */		if ((*p & 0x80) == 0)			high = (wchar_t)*p;		else if ((*p & 0xe0) == _SEQ2) {			n = 2;			high = (wchar_t)(*p & 0x1f);		} else if ((*p & 0xf0) == _SEQ3) {			n = 3;			high = (wchar_t)(*p & 0x0f);		} else if ((*p & 0xf8) == _SEQ4) {			n = 4;			high = (wchar_t)(*p & 0x07);		} else if ((*p & 0xfc) == _SEQ5) {			n = 5;			high = (wchar_t)(*p & 0x03);		} else if ((*p & 0xfe) == _SEQ6) {			n = 6;			high = (wchar_t)(*p & 0x01);		} else {			if ((flags & UTF8_IGNORE_ERROR) == 0)				return 0;			continue;		}		/* does the sequence header tell us truth about length? */		if ((size_t)(lim - p) <= n - 1) {			if ((flags & UTF8_IGNORE_ERROR) == 0)				return 0;			n = 1;			continue; /* skip */		}		/*		 * Validate sequence.		 * All symbols must have higher bits set to 10xxxxxx		 */		if (n > 1) {			for (i = 1; i < n; i++) {				if ((p[i] & 0xc0) != _NXT)					break;			}			if (i != n) {				if ((flags & UTF8_IGNORE_ERROR) == 0)					return 0;				n = 1;				continue; /* skip */			}		}		total++;		if (out == NULL)			continue;		if (out >= wlim)			return 0; /* no space left */		*out = 0;		n_bits = 0;		for (i = 1; i < n; i++) {			*out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;			n_bits += 6; /* 6 low bits in every byte */		}		*out |= high << n_bits;		if (wchar_forbidden(*out) != 0) {			if ((flags & UTF8_IGNORE_ERROR) == 0)				return 0; /* forbidden character */			else {				total--;				out--;			}		} else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {			total--;			out--;		}		out++;	}	return total;}/* * DESCRIPTION *	This function translates UCS-4 symbols (given in local machine *	byte order) into UTF-8 string. * *	It takes the following arguments: *	in	- input unicode string. It can be null-terminated. *	insize	- size of input string in wide characters.  If insize is 0, *	        function continues until a null terminator is reaches. *	out	- result buffer for utf8 string. If out is NULL, *		function returns size of result buffer. *	outsize - size of result buffer. * * RETURN VALUES *	The function returns size of result buffer (in bytes). Zero is returned *	in case of error. * * CAVEATS *	If UCS-4 string contains zero symbols, they will be translated *	as regular symbols. */size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,		     size_t outsize, int flags){	wchar_t *w, *wlim, ch = 0;	unsigned char *p, *lim, *oc;	size_t total, n;	if (in == NULL || (outsize == 0 && out != NULL))		return 0;	w = (wchar_t *)in;	wlim = (insize != 0) ? (w + insize) : (wchar_t *)-1;	p = (unsigned char *)out;	lim = out == NULL ? NULL : p + outsize;	total = 0;	for (; w < wlim; w++) {		if (!*w)			break;		if (wchar_forbidden(*w) != 0) {			if ((flags & UTF8_IGNORE_ERROR) == 0)				return 0;			else				continue;		}		if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)			continue;		if (*w < 0) {			if ((flags & UTF8_IGNORE_ERROR) == 0)				return 0;			continue;		} else if (*w <= 0x0000007f)			n = 1;		else if (*w <= 0x000007ff)			n = 2;		else if (*w <= 0x0000ffff)			n = 3;		else if (*w <= 0x001fffff)			n = 4;		else if (*w <= 0x03ffffff)			n = 5;		else /* if (*w <= 0x7fffffff) */			n = 6;		total += n;		if (out == NULL)			continue;		if ((size_t)(lim - p) <= n - 1)			return 0; /* no space left */		ch = *w;		oc = (unsigned char *)&ch;		switch (n) {		case 1:			*p = oc[0];			break;		case 2:			p[1] = _NXT | (oc[0] & 0x3f);			p[0] = _SEQ2 | (oc[0] >> 6) | ((oc[1] & 0x07) << 2);			break;		case 3:			p[2] = _NXT | (oc[0] & 0x3f);			p[1] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);			p[0] = _SEQ3 | ((oc[1] & 0xf0) >> 4);			break;		case 4:			p[3] = _NXT | (oc[0] & 0x3f);			p[2] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);			p[1] = _NXT | ((oc[1] & 0xf0) >> 4) |			       ((oc[2] & 0x03) << 4);			p[0] = _SEQ4 | ((oc[2] & 0x1f) >> 2);			break;		case 5:			p[4] = _NXT | (oc[0] & 0x3f);			p[3] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);			p[2] = _NXT | ((oc[1] & 0xf0) >> 4) |			       ((oc[2] & 0x03) << 4);			p[1] = _NXT | (oc[2] >> 2);			p[0] = _SEQ5 | (oc[3] & 0x03);			break;		case 6:			p[5] = _NXT | (oc[0] & 0x3f);			p[4] = _NXT | (oc[0] >> 6) | ((oc[1] & 0x0f) << 2);			p[3] = _NXT | (oc[1] >> 4) | ((oc[2] & 0x03) << 4);			p[2] = _NXT | (oc[2] >> 2);			p[1] = _NXT | (oc[3] & 0x3f);			p[0] = _SEQ6 | ((oc[3] & 0x40) >> 6);			break;		}		/*		 * NOTE: do not check here for forbidden UTF-8 characters.		 * They cannot appear here because we do proper conversion.		 */		p += n;	}	return total;}#endif
 |