utf8.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. /** BEGIN COPYRIGHT BLOCK
  2. * License: GPL (version 3 or any later version).
  3. * See LICENSE for details.
  4. * END COPYRIGHT BLOCK **/
  5. /* ***** BEGIN LICENSE BLOCK *****
  6. *
  7. * The Original Code is Mozilla Communicator client code, released
  8. * March 31, 1998.
  9. *
  10. * The Initial Developer of the Original Code is
  11. * Netscape Communications Corporation.
  12. * Portions created by the Initial Developer are Copyright (C) 1998-1999
  13. * the Initial Developer. All Rights Reserved.
  14. *
  15. * Contributor(s):
  16. *
  17. * Alternatively, the contents of this file may be used under the terms of
  18. * either of the GNU General Public License Version 2 or later (the "GPL"),
  19. * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  20. * in which case the provisions of the GPL or the LGPL are applicable instead
  21. * of those above. If you wish to allow use of your version of this file only
  22. * under the terms of either the GPL or the LGPL, and not to allow others to
  23. * use your version of this file under the terms of the MPL, indicate your
  24. * decision by deleting the provisions above and replace them with the notice
  25. * and other provisions required by the GPL or the LGPL. If you do not delete
  26. * the provisions above, a recipient may use your version of this file under
  27. * the terms of any one of the MPL, the GPL or the LGPL.
  28. *
  29. * June 25, 2009 - copied from Mozilla LDAP C SDK - relicensed to use GPLv2
  30. * with directory server plug-in exception as per the above paragraph
  31. *
  32. * ***** END LICENSE BLOCK ***** */
  33. /* the openldap library has utf8 string handling functions, but they
  34. are somewhat different, and not exposed/exported for use outside
  35. of the library - therefore, we just copy these from mozldap when
  36. using openldap
  37. */
  38. #ifdef HAVE_CONFIG_H
  39. # include <config.h>
  40. #endif
  41. #if defined(USE_OPENLDAP)
  42. /* utf8.c - misc. utf8 "string" functions. */
  43. #include "slapi-plugin.h"
  44. static char UTF8len[64]
  45. = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  46. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  47. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  48. 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
  49. int
  50. ldap_utf8len (const char* s)
  51. /* Return the number of char's in the character at *s. */
  52. {
  53. return ldap_utf8next((char*)s) - s;
  54. }
  55. char*
  56. ldap_utf8next (char* s)
  57. /* Return a pointer to the character immediately following *s.
  58. Handle any valid UTF-8 character, including '\0' and ASCII.
  59. Try to handle a misaligned pointer or a malformed character.
  60. */
  61. {
  62. register unsigned char* next = (unsigned char*)s;
  63. switch (UTF8len [(*next >> 2) & 0x3F]) {
  64. case 0: /* erroneous: s points to the middle of a character. */
  65. case 6: if ((*++next & 0xC0) != 0x80) break;
  66. case 5: if ((*++next & 0xC0) != 0x80) break;
  67. case 4: if ((*++next & 0xC0) != 0x80) break;
  68. case 3: if ((*++next & 0xC0) != 0x80) break;
  69. case 2: if ((*++next & 0xC0) != 0x80) break;
  70. case 1: ++next;
  71. }
  72. return (char*) next;
  73. }
  74. char*
  75. ldap_utf8prev (char* s)
  76. /* Return a pointer to the character immediately preceding *s.
  77. Handle any valid UTF-8 character, including '\0' and ASCII.
  78. Try to handle a misaligned pointer or a malformed character.
  79. */
  80. {
  81. register unsigned char* prev = (unsigned char*)s;
  82. unsigned char* limit = prev - 6;
  83. while (((*--prev & 0xC0) == 0x80) && (prev != limit)) {
  84. ;
  85. }
  86. return (char*) prev;
  87. }
  88. int
  89. ldap_utf8copy (char* dst, const char* src)
  90. /* Copy a character from src to dst; return the number of char's copied.
  91. Handle any valid UTF-8 character, including '\0' and ASCII.
  92. Try to handle a misaligned pointer or a malformed character.
  93. */
  94. {
  95. register const unsigned char* s = (const unsigned char*)src;
  96. switch (UTF8len [(*s >> 2) & 0x3F]) {
  97. case 0: /* erroneous: s points to the middle of a character. */
  98. case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  99. case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  100. case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  101. case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  102. case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  103. case 1: *dst = *s++;
  104. }
  105. return s - (const unsigned char*)src;
  106. }
  107. size_t
  108. ldap_utf8characters (const char* src)
  109. /* Return the number of UTF-8 characters in the 0-terminated array s. */
  110. {
  111. register char* s = (char*)src;
  112. size_t n;
  113. for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
  114. return n;
  115. }
  116. unsigned long
  117. ldap_utf8getcc( const char** src )
  118. {
  119. register unsigned long c = 0;
  120. register const unsigned char* s = (const unsigned char*)*src;
  121. switch (UTF8len [(*s >> 2) & 0x3F]) {
  122. case 0: /* erroneous: s points to the middle of a character. */
  123. c = (*s++) & 0x3F; goto more5;
  124. case 1: c = (*s++); break;
  125. case 2: c = (*s++) & 0x1F; goto more1;
  126. case 3: c = (*s++) & 0x0F; goto more2;
  127. case 4: c = (*s++) & 0x07; goto more3;
  128. case 5: c = (*s++) & 0x03; goto more4;
  129. case 6: c = (*s++) & 0x01; goto more5;
  130. more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
  131. more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
  132. more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
  133. more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
  134. more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
  135. break;
  136. }
  137. *src = (const char*)s;
  138. return c;
  139. }
  140. char*
  141. ldap_utf8strtok_r( char* sp, const char* brk, char** next)
  142. {
  143. const char *bp;
  144. unsigned long sc, bc;
  145. char *tok;
  146. if (sp == NULL && (sp = *next) == NULL)
  147. return NULL;
  148. /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
  149. cont:
  150. sc = LDAP_UTF8GETC(sp);
  151. for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
  152. if (sc == bc)
  153. goto cont;
  154. }
  155. if (sc == 0) { /* no non-delimiter characters */
  156. *next = NULL;
  157. return NULL;
  158. }
  159. tok = LDAP_UTF8PREV(sp);
  160. /* Scan token; roughly, sp += strcspn(sp, brk)
  161. * Note that brk must be 0-terminated; we stop if we see that, too.
  162. */
  163. while (1) {
  164. sc = LDAP_UTF8GETC(sp);
  165. bp = brk;
  166. do {
  167. if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
  168. if (sc == 0) {
  169. *next = NULL;
  170. } else {
  171. *next = sp;
  172. *(LDAP_UTF8PREV(sp)) = 0;
  173. }
  174. return tok;
  175. }
  176. } while (bc != 0);
  177. }
  178. /* NOTREACHED */
  179. }
  180. int
  181. ldap_utf8isalnum( char* s )
  182. {
  183. register unsigned char c = *(unsigned char*)s;
  184. if (0x80 & c) return 0;
  185. if (c >= 'A' && c <= 'Z') return 1;
  186. if (c >= 'a' && c <= 'z') return 1;
  187. if (c >= '0' && c <= '9') return 1;
  188. return 0;
  189. }
  190. int
  191. ldap_utf8isalpha( char* s )
  192. {
  193. register unsigned char c = *(unsigned char*)s;
  194. if (0x80 & c) return 0;
  195. if (c >= 'A' && c <= 'Z') return 1;
  196. if (c >= 'a' && c <= 'z') return 1;
  197. return 0;
  198. }
  199. int
  200. ldap_utf8isdigit( char* s )
  201. {
  202. register unsigned char c = *(unsigned char*)s;
  203. if (0x80 & c) return 0;
  204. if (c >= '0' && c <= '9') return 1;
  205. return 0;
  206. }
  207. int
  208. ldap_utf8isxdigit( char* s )
  209. {
  210. register unsigned char c = *(unsigned char*)s;
  211. if (0x80 & c) return 0;
  212. if (c >= '0' && c <= '9') return 1;
  213. if (c >= 'A' && c <= 'F') return 1;
  214. if (c >= 'a' && c <= 'f') return 1;
  215. return 0;
  216. }
  217. int
  218. ldap_utf8isspace( char* s )
  219. {
  220. register unsigned char *c = (unsigned char*)s;
  221. int len = ldap_utf8len(s);
  222. if (len == 0) {
  223. return 0;
  224. } else if (len == 1) {
  225. switch (*c) {
  226. case 0x09:
  227. case 0x0A:
  228. case 0x0B:
  229. case 0x0C:
  230. case 0x0D:
  231. case 0x20:
  232. return 1;
  233. default:
  234. return 0;
  235. }
  236. } else if (len == 2) {
  237. if (*c == 0xc2) {
  238. return *(c+1) == 0x80;
  239. }
  240. } else if (len == 3) {
  241. if (*c == 0xE2) {
  242. c++;
  243. if (*c == 0x80) {
  244. c++;
  245. return (*c>=0x80 && *c<=0x8a);
  246. }
  247. } else if (*c == 0xE3) {
  248. return (*(c+1)==0x80) && (*(c+2)==0x80);
  249. } else if (*c==0xEF) {
  250. return (*(c+1)==0xBB) && (*(c+2)==0xBF);
  251. }
  252. return 0;
  253. }
  254. /* should never reach here */
  255. return 0;
  256. }
  257. #endif /* USE_OPENLDAP */