collate.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. /** --- BEGIN COPYRIGHT BLOCK ---
  2. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  3. * Copyright (C) 2005 Red Hat, Inc.
  4. * All rights reserved.
  5. --- END COPYRIGHT BLOCK --- */
  6. /*
  7. * collate.c -- routines to collate character strings
  8. */
  9. #include <stdio.h>
  10. #include "dsgw.h"
  11. #include <ldap.h> /* ldap_utf8* */
  12. #include <unicode/ucol.h> /* Collation */
  13. #include <unicode/ucnv.h> /* Conversion */
  14. #include <unicode/ustring.h> /* UTF8 conversion */
  15. #ifdef _WINDOWS
  16. #undef strcasecmp
  17. #define strcasecmp _strcmpi
  18. #endif
  19. /*
  20. Convert the given string s, encoded in UTF8, into a Unicode (UTF16 or 32, depending on sizeof(UChar))
  21. string for use with collation and key generation
  22. The given string U will be filled in if its capacity (given by Ulen) is big enough,
  23. otherwise, it will be malloced (or realloced if already allocated)
  24. */
  25. static UErrorCode
  26. SetUnicodeStringFromUTF_8 (UChar** U, int32_t* Ulen, int *isAlloced, const char *s)
  27. /* Copy the UTF-8 string bv into the UnicodeString U,
  28. but remove leading and trailing whitespace, and
  29. convert consecutive whitespaces into a single space.
  30. Ulen is set to the number of UChars in the array (not necessarily the number of bytes!)
  31. */
  32. {
  33. int32_t len = 0; /* length of non-space string */
  34. int32_t needLen = 0; /* number of bytes needed for string */
  35. UErrorCode err = U_ZERO_ERROR;
  36. const char* begin; /* will point to beginning of non-space in s */
  37. /* first, set s to the first non-space char in bv->bv_val */
  38. while (s && *s && ldap_utf8isspace((char *)s)) { /* cast away const */
  39. const char *next = LDAP_UTF8NEXT((char *)s); /* cast away const */
  40. s = next;
  41. }
  42. begin = s;
  43. if (!s || !*s) {
  44. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  45. }
  46. /* next, find the length of the non-space string */
  47. while (s && *s && !ldap_utf8isspace((char *)s)) { /* cast away const */
  48. const char *next = LDAP_UTF8NEXT((char *)s); /* cast away const */
  49. len += (next - s); /* count bytes, not chars */
  50. needLen++; /* needLen counts chars */
  51. s = next;
  52. }
  53. if (needLen == 0) { /* bogus */
  54. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  55. }
  56. needLen++; /* +1 for trailing UChar space */
  57. if (needLen > *Ulen) { /* need more space */
  58. if (*isAlloced) { /* realloc space */
  59. *U = (UChar *)dsgw_ch_realloc((char *)*U, sizeof(UChar) * needLen);
  60. } else { /* must use malloc */
  61. *U = (UChar *)dsgw_ch_malloc(sizeof(UChar) * needLen);
  62. *isAlloced = 1; /* no longer using fixed buffer */
  63. }
  64. *Ulen = needLen;
  65. }
  66. u_strFromUTF8(*U, sizeof(UChar) * (*Ulen), NULL, begin, len, &err);
  67. return err;
  68. }
  69. static UCollator*
  70. get_collator (int flavor)
  71. {
  72. static UCollator* collator[2] = {NULL, NULL};
  73. /* dsgw_emitf("get_collator (%i)<br>\n", flavor); */
  74. if (collator[flavor] == NULL &&
  75. gc->gc_ClientLanguage && gc->gc_ClientLanguage[0]) {
  76. /* Try to create a Collation for the client's preferred language */
  77. ACCEPT_LANGUAGE_LIST langlist;
  78. size_t langs;
  79. /* dsgw_emitf ("ClientLanguage = \"%s\"<br>\n", gc->gc_ClientLanguage); */
  80. langs = AcceptLangList (gc->gc_ClientLanguage, langlist);
  81. if (langs <= 0) {
  82. dsgw_emitf ("AcceptLangList (%s) = %lu<br>\n",
  83. gc->gc_ClientLanguage, (unsigned long)langs);
  84. } else {
  85. UCollator* fallback_collator = NULL;
  86. UCollator* default_collator = NULL;
  87. UErrorCode err = U_ZERO_ERROR;
  88. size_t i;
  89. for (i = 0; i < langs; ++i) {
  90. /* Try to create a Collation for langs[i] */
  91. char* lang = langlist[i];
  92. collator[flavor] = ucol_open(lang, &err);
  93. if (err == U_ZERO_ERROR && collator[flavor]) {
  94. dsgw_emitf("<!-- New Collator (%s) == SUCCESS -->\n", lang);
  95. break;
  96. } else {
  97. if (err == U_USING_FALLBACK_WARNING) {
  98. if (fallback_collator == NULL) {
  99. fallback_collator = collator[flavor];
  100. dsgw_emitf("<!-- New Collator (%s) == USING_FALLBACK_LOCALE -->\n", lang);
  101. } else {
  102. ucol_close (collator[flavor]);
  103. }
  104. } else if (err == U_USING_DEFAULT_WARNING) {
  105. if (default_collator == NULL) {
  106. default_collator = collator[flavor];
  107. dsgw_emitf("<!-- New Collator (%s) == USING_DEFAULT_LOCALE -->\n", lang);
  108. } else {
  109. ucol_close (collator[flavor]);
  110. }
  111. } else {
  112. dsgw_emitf("New Collator error (%s) == %i<br>\n", lang, err);
  113. }
  114. collator[flavor] = NULL;
  115. }
  116. }
  117. if (collator[flavor] == NULL) {
  118. if (fallback_collator != NULL) {
  119. collator[flavor] = fallback_collator;
  120. fallback_collator = NULL;
  121. } else if (default_collator != NULL) {
  122. collator[flavor] = default_collator;
  123. default_collator = NULL;
  124. }
  125. }
  126. if (collator[flavor] != NULL) {
  127. switch (flavor) {
  128. case CASE_EXACT:
  129. dsgw_emits("<!-- CollationSetStrength (TERTIARY) -->\n");
  130. ucol_setAttribute (collator[flavor], UCOL_STRENGTH, UCOL_TERTIARY, &err);
  131. break;
  132. default: /* CASE_IGNORE */
  133. if (dsgw_scriptorder()->so_caseIgnoreAccents) {
  134. dsgw_emits("<!-- CollationSetStrength (PRIMARY) -->\n");
  135. ucol_setAttribute (collator[flavor], UCOL_STRENGTH, UCOL_PRIMARY, &err);
  136. } else {
  137. dsgw_emits("<!-- CollationSetStrength (SECONDARY) -->\n");
  138. ucol_setAttribute (collator[flavor], UCOL_STRENGTH, UCOL_SECONDARY, &err);
  139. }
  140. break;
  141. }
  142. }
  143. if (default_collator != NULL) {
  144. ucol_close (default_collator);
  145. default_collator = NULL;
  146. }
  147. if (fallback_collator != NULL) {
  148. ucol_close (fallback_collator);
  149. fallback_collator = NULL;
  150. }
  151. }
  152. }
  153. return collator[flavor];
  154. }
  /* Compare, with strcmp, the two strings that L and R point to. */
  static int
  valcmp (const char** L, const char** R)
  {
      const char* lhs = *L;
      const char* rhs = *R;

      return strcmp (lhs, rhs);
  }
  /* Compare, with strcasecmp, the two strings that L and R point to. */
  static int
  valcasecmp (const char** L, const char** R)
  {
      const char* lhs = *L;
      const char* rhs = *R;

      return strcasecmp (lhs, rhs);
  }
  165. static int
  166. strXcollate (int flavor, const char* L, const char* R)
  167. {
  168. UCollator* collator = get_collator (flavor);
  169. if (collator != NULL) {
  170. UChar LuBuffer[128];
  171. UChar* Lu = LuBuffer;
  172. int32_t LuLen = u_strlen(LuBuffer);
  173. int LuisAlloced = 0;
  174. if (SetUnicodeStringFromUTF_8 (&Lu, &LuLen, &LuisAlloced, L) == U_ZERO_ERROR) {
  175. UChar RuBuffer[128];
  176. UChar* Ru = RuBuffer;
  177. int32_t RuLen = u_strlen(RuBuffer);
  178. int RuisAlloced = 0;
  179. if (SetUnicodeStringFromUTF_8 (&Ru, &RuLen, &RuisAlloced, R) == U_ZERO_ERROR) {
  180. UCollationResult colres = ucol_strcoll(collator, Lu, LuLen, Ru, RuLen);
  181. int result = 0;
  182. switch (colres) {
  183. case UCOL_LESS:
  184. result = -1;
  185. break;
  186. case UCOL_GREATER:
  187. result = 1;
  188. break;
  189. default:
  190. break;
  191. }
  192. #ifdef DSGW_DEBUG
  193. {
  194. auto char* Le = dsgw_strdup_escaped (L);
  195. auto char* Re = dsgw_strdup_escaped (R);
  196. dsgw_log ("strXcollate:%s %s %s\n",
  197. Le, result < 0 ? "<" : (result == 0 ? "=" : ">"), Re);
  198. free (Le);
  199. free (Re);
  200. }
  201. #endif
  202. if (RuisAlloced) {
  203. free(Ru);
  204. Ru = NULL;
  205. }
  206. if (LuisAlloced) {
  207. free(Lu);
  208. Lu = NULL;
  209. }
  210. return result;
  211. }
  212. if (LuisAlloced) {
  213. free(Lu);
  214. Lu = NULL;
  215. }
  216. }
  217. }
  218. return flavor ? strcasecmp (L, R) : strcmp (L, R);
  219. }
  220. static int
  221. strcollate (const char* L, const char* R)
  222. {
  223. return strXcollate (CASE_EXACT, L, R);
  224. }
  225. static int
  226. strcasecollate (const char* L, const char* R)
  227. {
  228. return strXcollate (CASE_INSENSITIVE, L, R);
  229. }
  230. static int
  231. valcollate (const char** L, const char** R)
  232. {
  233. return strXcollate (CASE_EXACT, *L, *R);
  234. }
  235. static int
  236. valcasecollate (const char** L, const char** R)
  237. {
  238. return strXcollate (CASE_INSENSITIVE, *L, *R);
  239. }
  240. strcmp_t
  241. dsgw_strcmp (int flavor)
  242. {
  243. if (get_collator (flavor) != NULL) {
  244. return flavor ? strcasecollate : strcollate;
  245. }
  246. return flavor ? strcasecmp : strcmp;
  247. }
  248. valcmp_t
  249. dsgw_valcmp (int flavor)
  250. {
  251. if (get_collator (flavor) != NULL) {
  252. return flavor ? valcasecollate : valcollate;
  253. }
  254. return flavor ? valcasecmp : valcmp;
  255. }
  256. static size_t
  257. dsgw_scriptof (const char* s, scriptrange_t** ranges)
  258. {
  259. auto size_t result = 0;
  260. if (s && ranges) {
  261. auto unsigned long u;
  262. while ((u = LDAP_UTF8GETCC (s)) != 0) {
  263. auto size_t ss;
  264. auto scriptrange_t* sr;
  265. for (ss = 0; (sr = ranges[ss]) != NULL; ++ss) {
  266. do {
  267. if (sr->sr_min <= u && u <= sr->sr_max) {
  268. break;
  269. }
  270. } while ((sr = sr->sr_next) != NULL);
  271. if (sr) {
  272. if (result < ss) result = ss;
  273. break;
  274. }
  275. }
  276. if (!sr) {
  277. result = ss;
  278. break;
  279. }
  280. }
  281. }
  282. #ifdef DSGW_DEBUG
  283. dsgw_log ("script %lu\n", (unsigned long)result);
  284. #endif
  285. return result;
  286. }
  287. static struct berval key_first = {0, 0};
  288. static struct berval key_last = {0, 0};
  289. struct berval* dsgw_key_first = &key_first;
  290. struct berval* dsgw_key_last = &key_last;
  291. void LDAP_C LDAP_CALLBACK
  292. dsgw_keyfree( void *arg, const struct berval* key )
  293. {
  294. if (key->bv_val) free (key->bv_val);
  295. else if (key == dsgw_key_first || key == dsgw_key_last) return;
  296. free ((void*)key);
  297. }
  298. int LDAP_C LDAP_CALLBACK
  299. dsgw_keycmp( void *arg, const struct berval *L, const struct berval *R )
  300. {
  301. int result = 0;
  302. if (L == R) {
  303. } else if (L->bv_val == NULL) { /* L is either first or last */
  304. result = (L == dsgw_key_last) ? 1 : -1;
  305. } else if (R->bv_val == NULL) { /* R is either first or last */
  306. result = (R == dsgw_key_last) ? -1 : 1;
  307. } else
  308. /* copied from slapi_berval_cmp(), in ../../servers/slapd/plugin.c: */
  309. if (L->bv_len < R->bv_len) {
  310. result = memcmp (L->bv_val, R->bv_val, L->bv_len);
  311. if (result == 0)
  312. result = -1;
  313. } else {
  314. result = memcmp (L->bv_val, R->bv_val, R->bv_len);
  315. if (result == 0 && (L->bv_len > R->bv_len))
  316. result = 1;
  317. }
  318. return result;
  319. }
  320. struct berval*
  321. dsgw_strkeygen (int flavor, const char* s)
  322. {
  323. auto struct berval* v = (struct berval*)dsgw_ch_malloc (sizeof (struct berval));
  324. auto UCollator* collator = get_collator (flavor);
  325. v->bv_val = NULL;
  326. if (collator != NULL) {
  327. UChar uBuffer[128];
  328. UChar* u = uBuffer;
  329. int32_t uLen = u_strlen(uBuffer);
  330. int uisAlloced = 0;
  331. if (SetUnicodeStringFromUTF_8 (&u, &uLen, &uisAlloced, s) == U_ZERO_ERROR) {
  332. char keyBuffer[128]; /* try to use static space buffer to avoid malloc */
  333. int32_t keyLen = sizeof(keyBuffer);
  334. char* key = keyBuffer; /* but key can grow if necessary */
  335. int32_t realLen = ucol_getSortKey(collator, u, uLen, (uint8_t *)key, keyLen);
  336. if (realLen > keyLen) { /* need more space */
  337. key = (char*)dsgw_ch_malloc(sizeof(char) * realLen);
  338. keyLen = ucol_getSortKey(collator, u, uLen, (uint8_t *)key, realLen);
  339. }
  340. v->bv_len = realLen + 2;
  341. v->bv_val = dsgw_ch_malloc (v->bv_len);
  342. memcpy(v->bv_val+1, key, realLen);
  343. if (uisAlloced) {
  344. free(u);
  345. u = NULL;
  346. }
  347. if (key != keyBuffer) {
  348. free(key);
  349. key = NULL;
  350. }
  351. }
  352. }
  353. if (v->bv_val == NULL) {
  354. v->bv_len = (s ? strlen (s) : 0) + 2;
  355. v->bv_val = dsgw_ch_malloc (v->bv_len);
  356. if (v->bv_len > 2) memcpy (v->bv_val+1, s, v->bv_len-2);
  357. if (flavor) {
  358. register char* t;
  359. for (t = v->bv_val+1; *t; ++t) {
  360. if (isascii (*t)) *t = tolower (*t);
  361. }
  362. }
  363. }
  364. v->bv_val[0] = (char) dsgw_scriptof (s, dsgw_scriptorder()->so_sort);
  365. v->bv_val[v->bv_len-1] = '\0';
  366. return v;
  367. }