collate.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. /** --- BEGIN COPYRIGHT BLOCK ---
  2. * This Program is free software; you can redistribute it and/or modify it under
  3. * the terms of the GNU General Public License as published by the Free Software
  4. * Foundation; version 2 of the License.
  5. *
  6. * This Program is distributed in the hope that it will be useful, but WITHOUT
  7. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  8. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  9. *
  10. * You should have received a copy of the GNU General Public License along with
  11. * this Program; if not, write to the Free Software Foundation, Inc., 59 Temple
  12. * Place, Suite 330, Boston, MA 02111-1307 USA.
  13. *
  14. * In addition, as a special exception, Red Hat, Inc. gives You the additional
  15. * right to link the code of this Program with code not covered under the GNU
  16. * General Public License ("Non-GPL Code") and to distribute linked combinations
  17. * including the two, subject to the limitations in this paragraph. Non-GPL Code
  18. * permitted under this exception must only link to the code of this Program
  19. * through those well defined interfaces identified in the file named EXCEPTION
  20. * found in the source code files (the "Approved Interfaces"). The files of
  21. * Non-GPL Code may instantiate templates or use macros or inline functions from
  22. * the Approved Interfaces without causing the resulting work to be covered by
  23. * the GNU General Public License. Only Red Hat, Inc. may make changes or
  24. * additions to the list of Approved Interfaces. You must obey the GNU General
  25. * Public License in all respects for all of the Program code and other code used
  26. * in conjunction with the Program except the Non-GPL Code covered by this
  27. * exception. If you modify this file, you may extend this exception to your
  28. * version of the file, but you are not obligated to do so. If you do not wish to
  29. * provide this exception without modification, you must delete this exception
  30. * statement from your version and license this file solely under the GPL without
  31. * exception.
  32. *
  33. *
  34. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  35. * Copyright (C) 2005 Red Hat, Inc.
  36. * All rights reserved.
  37. --- END COPYRIGHT BLOCK --- */
  38. /*
  39. * collate.c -- routines to collate character strings
  40. */
  41. #include <stdio.h>
  42. #include "dsgw.h"
  43. #include <ldap.h> /* ldap_utf8* */
  44. #include <unicode/ucol.h> /* Collation */
  45. #include <unicode/ucnv.h> /* Conversion */
  46. #include <unicode/ustring.h> /* UTF8 conversion */
  47. #ifdef _WINDOWS
  48. #undef strcasecmp
  49. #define strcasecmp _strcmpi
  50. #endif
  51. /*
  52. Convert the given string s, encoded in UTF8, into a Unicode (UTF16 or 32, depending on sizeof(UChar))
  53. string for use with collation and key generation
  54. The given string U will be filled in if it's capacity (given by Ulen) is big enough,
  55. otherwise, it will be malloced (or realloced if already allocated)
  56. */
  57. static UErrorCode
  58. SetUnicodeStringFromUTF_8 (UChar** U, int32_t* Ulen, int *isAlloced, const char *s)
  59. /* Copy the UTF-8 string bv into the UnicodeString U,
  60. but remove leading and trailing whitespace, and
  61. convert consecutive whitespaces into a single space.
  62. Ulen is set to the number of UChars in the array (not necessarily the number of bytes!)
  63. */
  64. {
  65. int32_t len = 0; /* length of non-space string */
  66. int32_t needLen = 0; /* number of bytes needed for string */
  67. UErrorCode err = U_ZERO_ERROR;
  68. const char* begin; /* will point to beginning of non-space in s */
  69. /* first, set s to the first non-space char in bv->bv_val */
  70. while (s && *s && ldap_utf8isspace((char *)s)) { /* cast away const */
  71. const char *next = LDAP_UTF8NEXT((char *)s); /* cast away const */
  72. s = next;
  73. }
  74. begin = s;
  75. if (!s || !*s) {
  76. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  77. }
  78. /* next, find the length of the non-space string */
  79. while (s && *s && !ldap_utf8isspace((char *)s)) { /* cast away const */
  80. const char *next = LDAP_UTF8NEXT((char *)s); /* cast away const */
  81. len += (next - s); /* count bytes, not chars */
  82. needLen++; /* needLen counts chars */
  83. s = next;
  84. }
  85. if (needLen == 0) { /* bogus */
  86. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  87. }
  88. needLen++; /* +1 for trailing UChar space */
  89. if (needLen > *Ulen) { /* need more space */
  90. if (*isAlloced) { /* realloc space */
  91. *U = (UChar *)dsgw_ch_realloc((char *)*U, sizeof(UChar) * needLen);
  92. } else { /* must use malloc */
  93. *U = (UChar *)dsgw_ch_malloc(sizeof(UChar) * needLen);
  94. *isAlloced = 1; /* no longer using fixed buffer */
  95. }
  96. *Ulen = needLen;
  97. }
  98. u_strFromUTF8(*U, sizeof(UChar) * (*Ulen), NULL, begin, len, &err);
  99. return err;
  100. }
  101. static UCollator*
  102. get_collator (int flavor)
  103. {
  104. static UCollator* collator[2] = {NULL, NULL};
  105. /* dsgw_emitf("get_collator (%i)<br>\n", flavor); */
  106. if (collator[flavor] == NULL &&
  107. gc->gc_ClientLanguage && gc->gc_ClientLanguage[0]) {
  108. /* Try to create a Collation for the client's preferred language */
  109. ACCEPT_LANGUAGE_LIST langlist;
  110. size_t langs;
  111. /* dsgw_emitf ("ClientLanguage = \"%s\"<br>\n", gc->gc_ClientLanguage); */
  112. langs = AcceptLangList (gc->gc_ClientLanguage, langlist);
  113. if (langs <= 0) {
  114. dsgw_emitf ("AcceptLangList (%s) = %lu<br>\n",
  115. gc->gc_ClientLanguage, (unsigned long)langs);
  116. } else {
  117. UCollator* fallback_collator = NULL;
  118. UCollator* default_collator = NULL;
  119. UErrorCode err = U_ZERO_ERROR;
  120. size_t i;
  121. for (i = 0; i < langs; ++i) {
  122. /* Try to create a Collation for langs[i] */
  123. char* lang = langlist[i];
  124. collator[flavor] = ucol_open(lang, &err);
  125. if (err == U_ZERO_ERROR && collator[flavor]) {
  126. dsgw_emitf("<!-- New Collator (%s) == SUCCESS -->\n", lang);
  127. break;
  128. } else {
  129. if (err == U_USING_FALLBACK_WARNING) {
  130. if (fallback_collator == NULL) {
  131. fallback_collator = collator[flavor];
  132. dsgw_emitf("<!-- New Collator (%s) == USING_FALLBACK_LOCALE -->\n", lang);
  133. } else {
  134. ucol_close (collator[flavor]);
  135. }
  136. } else if (err == U_USING_DEFAULT_WARNING) {
  137. if (default_collator == NULL) {
  138. default_collator = collator[flavor];
  139. dsgw_emitf("<!-- New Collator (%s) == USING_DEFAULT_LOCALE -->\n", lang);
  140. } else {
  141. ucol_close (collator[flavor]);
  142. }
  143. } else {
  144. dsgw_emitf("New Collator error (%s) == %i<br>\n", lang, err);
  145. }
  146. collator[flavor] = NULL;
  147. }
  148. }
  149. if (collator[flavor] == NULL) {
  150. if (fallback_collator != NULL) {
  151. collator[flavor] = fallback_collator;
  152. fallback_collator = NULL;
  153. } else if (default_collator != NULL) {
  154. collator[flavor] = default_collator;
  155. default_collator = NULL;
  156. }
  157. }
  158. if (collator[flavor] != NULL) {
  159. switch (flavor) {
  160. case CASE_EXACT:
  161. dsgw_emits("<!-- CollationSetStrength (TERTIARY) -->\n");
  162. ucol_setAttribute (collator[flavor], UCOL_STRENGTH, UCOL_TERTIARY, &err);
  163. break;
  164. default: /* CASE_IGNORE */
  165. if (dsgw_scriptorder()->so_caseIgnoreAccents) {
  166. dsgw_emits("<!-- CollationSetStrength (PRIMARY) -->\n");
  167. ucol_setAttribute (collator[flavor], UCOL_STRENGTH, UCOL_PRIMARY, &err);
  168. } else {
  169. dsgw_emits("<!-- CollationSetStrength (SECONDARY) -->\n");
  170. ucol_setAttribute (collator[flavor], UCOL_STRENGTH, UCOL_SECONDARY, &err);
  171. }
  172. break;
  173. }
  174. }
  175. if (default_collator != NULL) {
  176. ucol_close (default_collator);
  177. default_collator = NULL;
  178. }
  179. if (fallback_collator != NULL) {
  180. ucol_close (fallback_collator);
  181. fallback_collator = NULL;
  182. }
  183. }
  184. }
  185. return collator[flavor];
  186. }
  187. static int
  188. valcmp (const char** L, const char** R)
  189. {
  190. return strcmp (*L, *R);
  191. }
  192. static int
  193. valcasecmp (const char** L, const char** R)
  194. {
  195. return strcasecmp (*L, *R);
  196. }
  197. static int
  198. strXcollate (int flavor, const char* L, const char* R)
  199. {
  200. UCollator* collator = get_collator (flavor);
  201. if (collator != NULL) {
  202. UChar LuBuffer[128];
  203. UChar* Lu = LuBuffer;
  204. int32_t LuLen = u_strlen(LuBuffer);
  205. int LuisAlloced = 0;
  206. if (SetUnicodeStringFromUTF_8 (&Lu, &LuLen, &LuisAlloced, L) == U_ZERO_ERROR) {
  207. UChar RuBuffer[128];
  208. UChar* Ru = RuBuffer;
  209. int32_t RuLen = u_strlen(RuBuffer);
  210. int RuisAlloced = 0;
  211. if (SetUnicodeStringFromUTF_8 (&Ru, &RuLen, &RuisAlloced, R) == U_ZERO_ERROR) {
  212. UCollationResult colres = ucol_strcoll(collator, Lu, LuLen, Ru, RuLen);
  213. int result = 0;
  214. switch (colres) {
  215. case UCOL_LESS:
  216. result = -1;
  217. break;
  218. case UCOL_GREATER:
  219. result = 1;
  220. break;
  221. default:
  222. break;
  223. }
  224. #ifdef DSGW_DEBUG
  225. {
  226. auto char* Le = dsgw_strdup_escaped (L);
  227. auto char* Re = dsgw_strdup_escaped (R);
  228. dsgw_log ("strXcollate:%s %s %s\n",
  229. Le, result < 0 ? "<" : (result == 0 ? "=" : ">"), Re);
  230. free (Le);
  231. free (Re);
  232. }
  233. #endif
  234. if (RuisAlloced) {
  235. free(Ru);
  236. Ru = NULL;
  237. }
  238. if (LuisAlloced) {
  239. free(Lu);
  240. Lu = NULL;
  241. }
  242. return result;
  243. }
  244. if (LuisAlloced) {
  245. free(Lu);
  246. Lu = NULL;
  247. }
  248. }
  249. }
  250. return flavor ? strcasecmp (L, R) : strcmp (L, R);
  251. }
  252. static int
  253. strcollate (const char* L, const char* R)
  254. {
  255. return strXcollate (CASE_EXACT, L, R);
  256. }
  257. static int
  258. strcasecollate (const char* L, const char* R)
  259. {
  260. return strXcollate (CASE_INSENSITIVE, L, R);
  261. }
  262. static int
  263. valcollate (const char** L, const char** R)
  264. {
  265. return strXcollate (CASE_EXACT, *L, *R);
  266. }
  267. static int
  268. valcasecollate (const char** L, const char** R)
  269. {
  270. return strXcollate (CASE_INSENSITIVE, *L, *R);
  271. }
  272. strcmp_t
  273. dsgw_strcmp (int flavor)
  274. {
  275. if (get_collator (flavor) != NULL) {
  276. return flavor ? strcasecollate : strcollate;
  277. }
  278. return flavor ? strcasecmp : strcmp;
  279. }
  280. valcmp_t
  281. dsgw_valcmp (int flavor)
  282. {
  283. if (get_collator (flavor) != NULL) {
  284. return flavor ? valcasecollate : valcollate;
  285. }
  286. return flavor ? valcasecmp : valcmp;
  287. }
  288. static size_t
  289. dsgw_scriptof (const char* s, scriptrange_t** ranges)
  290. {
  291. auto size_t result = 0;
  292. if (s && ranges) {
  293. auto unsigned long u;
  294. while ((u = LDAP_UTF8GETCC (s)) != 0) {
  295. auto size_t ss;
  296. auto scriptrange_t* sr;
  297. for (ss = 0; (sr = ranges[ss]) != NULL; ++ss) {
  298. do {
  299. if (sr->sr_min <= u && u <= sr->sr_max) {
  300. break;
  301. }
  302. } while ((sr = sr->sr_next) != NULL);
  303. if (sr) {
  304. if (result < ss) result = ss;
  305. break;
  306. }
  307. }
  308. if (!sr) {
  309. result = ss;
  310. break;
  311. }
  312. }
  313. }
  314. #ifdef DSGW_DEBUG
  315. dsgw_log ("script %lu\n", (unsigned long)result);
  316. #endif
  317. return result;
  318. }
  319. static struct berval key_first = {0, 0};
  320. static struct berval key_last = {0, 0};
  321. struct berval* dsgw_key_first = &key_first;
  322. struct berval* dsgw_key_last = &key_last;
  323. void LDAP_C LDAP_CALLBACK
  324. dsgw_keyfree( void *arg, const struct berval* key )
  325. {
  326. if (key->bv_val) free (key->bv_val);
  327. else if (key == dsgw_key_first || key == dsgw_key_last) return;
  328. free ((void*)key);
  329. }
  330. int LDAP_C LDAP_CALLBACK
  331. dsgw_keycmp( void *arg, const struct berval *L, const struct berval *R )
  332. {
  333. int result = 0;
  334. if (L == R) {
  335. } else if (L->bv_val == NULL) { /* L is either first or last */
  336. result = (L == dsgw_key_last) ? 1 : -1;
  337. } else if (R->bv_val == NULL) { /* R is either first or last */
  338. result = (R == dsgw_key_last) ? -1 : 1;
  339. } else
  340. /* copied from slapi_berval_cmp(), in ../../servers/slapd/plugin.c: */
  341. if (L->bv_len < R->bv_len) {
  342. result = memcmp (L->bv_val, R->bv_val, L->bv_len);
  343. if (result == 0)
  344. result = -1;
  345. } else {
  346. result = memcmp (L->bv_val, R->bv_val, R->bv_len);
  347. if (result == 0 && (L->bv_len > R->bv_len))
  348. result = 1;
  349. }
  350. return result;
  351. }
  352. struct berval*
  353. dsgw_strkeygen (int flavor, const char* s)
  354. {
  355. auto struct berval* v = (struct berval*)dsgw_ch_malloc (sizeof (struct berval));
  356. auto UCollator* collator = get_collator (flavor);
  357. v->bv_val = NULL;
  358. if (collator != NULL) {
  359. UChar uBuffer[128];
  360. UChar* u = uBuffer;
  361. int32_t uLen = u_strlen(uBuffer);
  362. int uisAlloced = 0;
  363. if (SetUnicodeStringFromUTF_8 (&u, &uLen, &uisAlloced, s) == U_ZERO_ERROR) {
  364. char keyBuffer[128]; /* try to use static space buffer to avoid malloc */
  365. int32_t keyLen = sizeof(keyBuffer);
  366. char* key = keyBuffer; /* but key can grow if necessary */
  367. int32_t realLen = ucol_getSortKey(collator, u, uLen, (uint8_t *)key, keyLen);
  368. if (realLen > keyLen) { /* need more space */
  369. key = (char*)dsgw_ch_malloc(sizeof(char) * realLen);
  370. keyLen = ucol_getSortKey(collator, u, uLen, (uint8_t *)key, realLen);
  371. }
  372. v->bv_len = realLen + 2;
  373. v->bv_val = dsgw_ch_malloc (v->bv_len);
  374. memcpy(v->bv_val+1, key, realLen);
  375. if (uisAlloced) {
  376. free(u);
  377. u = NULL;
  378. }
  379. if (key != keyBuffer) {
  380. free(key);
  381. key = NULL;
  382. }
  383. }
  384. }
  385. if (v->bv_val == NULL) {
  386. v->bv_len = (s ? strlen (s) : 0) + 2;
  387. v->bv_val = dsgw_ch_malloc (v->bv_len);
  388. if (v->bv_len > 2) memcpy (v->bv_val+1, s, v->bv_len-2);
  389. if (flavor) {
  390. register char* t;
  391. for (t = v->bv_val+1; *t; ++t) {
  392. if (isascii (*t)) *t = tolower (*t);
  393. }
  394. }
  395. }
  396. v->bv_val[0] = (char) dsgw_scriptof (s, dsgw_scriptorder()->so_sort);
  397. v->bv_val[v->bv_len-1] = '\0';
  398. return v;
  399. }