collate.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. /** BEGIN COPYRIGHT BLOCK
  2. * This Program is free software; you can redistribute it and/or modify it under
  3. * the terms of the GNU General Public License as published by the Free Software
  4. * Foundation; version 2 of the License.
  5. *
  6. * This Program is distributed in the hope that it will be useful, but WITHOUT
  7. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  8. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  9. *
  10. * You should have received a copy of the GNU General Public License along with
  11. * this Program; if not, write to the Free Software Foundation, Inc., 59 Temple
  12. * Place, Suite 330, Boston, MA 02111-1307 USA.
  13. *
  14. * In addition, as a special exception, Red Hat, Inc. gives You the additional
  15. * right to link the code of this Program with code not covered under the GNU
  16. * General Public License ("Non-GPL Code") and to distribute linked combinations
  17. * including the two, subject to the limitations in this paragraph. Non-GPL Code
  18. * permitted under this exception must only link to the code of this Program
  19. * through those well defined interfaces identified in the file named EXCEPTION
  20. * found in the source code files (the "Approved Interfaces"). The files of
  21. * Non-GPL Code may instantiate templates or use macros or inline functions from
  22. * the Approved Interfaces without causing the resulting work to be covered by
  23. * the GNU General Public License. Only Red Hat, Inc. may make changes or
  24. * additions to the list of Approved Interfaces. You must obey the GNU General
  25. * Public License in all respects for all of the Program code and other code used
  26. * in conjunction with the Program except the Non-GPL Code covered by this
  27. * exception. If you modify this file, you may extend this exception to your
  28. * version of the file, but you are not obligated to do so. If you do not wish to
  29. * provide this exception without modification, you must delete this exception
  30. * statement from your version and license this file solely under the GPL without
  31. * exception.
  32. *
  33. *
  34. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  35. * Copyright (C) 2005 Red Hat, Inc.
  36. * All rights reserved.
  37. * END COPYRIGHT BLOCK **/
  38. /* collate.c - implementation of indexing, using a Collation */
  39. #include "collate.h"
  40. #include <string.h> /* memcpy */
  41. #include <unicode/ucol.h> /* Collation */
  42. #include <unicode/ucnv.h> /* Conversion */
  43. #include <unicode/ustring.h> /* UTF8 conversion */
  44. #include <ldap.h> /* LDAP_UTF8LEN */
  45. #include <slap.h> /* for strcasecmp on non-UNIX platforms and correct debug macro */
  /*
   * collation_init - per-process initialization hook for the collation code.
   *
   * configpath is accepted but not used: the body is empty, so no global
   * state is set up here (ICU apparently needs no explicit initialization).
   */
  void
  collation_init(char *configpath)
  {
      (void)configpath; /* intentionally unused */
  }
  52. typedef struct coll_profile_t { /* Collator characteristics */
  53. const char* language;
  54. const char* country;
  55. const char* variant;
  56. UColAttributeValue strength; /* one of UCOL_PRIMARY = 0, UCOL_SECONDARY = 1, UCOL_TERTIARY = 2, UCOL_QUATERNARY = 3, UCOL_IDENTICAL = 4 */
  57. UColAttributeValue decomposition; /* one of UCOL_OFF = 0, UCOL_DEFAULT = 1, UCOL_ON = 2 */
  58. } coll_profile_t;
  59. typedef struct coll_id_t { /* associates an OID with a coll_profile_t */
  60. char* oid;
  61. coll_profile_t* profile;
  62. } coll_id_t;
  63. /* A list of all OIDs that identify collator profiles: */
  64. static const coll_id_t** collation_id = NULL;
  65. static size_t collation_ids = 0;
  66. int
  67. collation_config (size_t cargc, char** cargv,
  68. const char* fname, size_t lineno)
  69. /* Process one line from a configuration file.
  70. Return 0 if it's OK, -1 if it's not recognized.
  71. Any other return value is a process exit code.
  72. */
  73. {
  74. if (cargc <= 0) { /* Bizarre. Oh, well... */
  75. } else if (!strcasecmp (cargv[0], "NLS")) {
  76. /* ignore - not needed anymore with ICU - was used to get path for NLS_Initialize */
  77. } else if (!strcasecmp (cargv[0], "collation")) {
  78. if ( cargc < 7 ) {
  79. LDAPDebug (LDAP_DEBUG_ANY,
  80. "%s: line %lu ignored: only %lu arguments (expected "
  81. "collation language country variant strength decomposition oid ...)\n",
  82. fname, (unsigned long)lineno, (unsigned long)cargc );
  83. } else {
  84. auto size_t arg;
  85. auto coll_profile_t* profile = (coll_profile_t*) slapi_ch_calloc (1, sizeof (coll_profile_t));
  86. if (*cargv[1]) profile->language = slapi_ch_strdup (cargv[1]);
  87. if (*cargv[2]) profile->country = slapi_ch_strdup (cargv[2]);
  88. if (*cargv[3]) profile->variant = slapi_ch_strdup (cargv[3]);
  89. switch (atoi(cargv[4])) {
  90. case 1: profile->strength = UCOL_PRIMARY; break;
  91. case 2: profile->strength = UCOL_SECONDARY; /* no break here? fall through? wtf? */
  92. case 3: profile->strength = UCOL_TERTIARY; break;
  93. case 4: profile->strength = UCOL_IDENTICAL; break;
  94. default: profile->strength = UCOL_SECONDARY;
  95. LDAPDebug (LDAP_DEBUG_ANY,
  96. "%s: line %lu: strength \"%s\" not supported (will use 2)\n",
  97. fname, (unsigned long)lineno, cargv[4]);
  98. break;
  99. }
  100. switch (atoi(cargv[5])) {
  101. case 1: profile->decomposition = UCOL_OFF; break;
  102. case 2: profile->decomposition = UCOL_DEFAULT; /* no break here? fall through? wtf? */
  103. case 3: profile->decomposition = UCOL_ON; break;
  104. default: profile->decomposition = UCOL_DEFAULT;
  105. LDAPDebug (LDAP_DEBUG_ANY,
  106. "%s: line %lu: decomposition \"%s\" not supported (will use 2)\n",
  107. fname, (unsigned long)lineno, cargv[5]);
  108. break;
  109. }
  110. {
  111. char descStr[256];
  112. char nameOrder[256];
  113. char nameSubstring[256];
  114. char oidString[256];
  115. char *tmpStr=NULL;
  116. Slapi_MatchingRuleEntry *mrentry=slapi_matchingrule_new();
  117. if(UCOL_PRIMARY == profile->strength) {
  118. strcpy(nameOrder,"caseIgnoreOrderingMatch");
  119. strcpy(nameSubstring,"caseIgnoreSubstringMatch");
  120. }
  121. else {
  122. strcpy(nameOrder,"caseExactOrderingMatch");
  123. strcpy(nameSubstring,"caseExactSubstringMatch");
  124. }
  125. if(cargc > 7) {
  126. strcpy(nameOrder,"-");
  127. PL_strcatn(nameOrder,sizeof(nameOrder),cargv[7]);
  128. strcpy(nameSubstring,"-");
  129. PL_strcatn(nameSubstring,sizeof(nameSubstring),cargv[7]);
  130. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_NAME,
  131. (void *)slapi_ch_strdup(nameOrder));
  132. }
  133. else {
  134. if(0 != cargv[1][0]) {
  135. strcpy(nameOrder,"-");
  136. strcpy(nameSubstring,"-");
  137. } else {
  138. nameOrder[0] = 0;
  139. nameSubstring[0] = 0;
  140. }
  141. PL_strcatn(nameOrder,sizeof(nameOrder),cargv[1]);
  142. PL_strcatn(nameSubstring,sizeof(nameSubstring),cargv[1]);
  143. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_NAME,
  144. (void *)slapi_ch_strdup(nameOrder));
  145. }
  146. PL_strncpyz(oidString,cargv[6], sizeof(oidString));
  147. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_OID,
  148. (void *)slapi_ch_strdup(oidString));
  149. if(0 != cargv[2][0]) {
  150. PR_snprintf(descStr, sizeof(descStr), "%s-%s",cargv[1],cargv[2]);
  151. }
  152. else {
  153. PL_strncpyz(descStr,cargv[1], sizeof(descStr));
  154. }
  155. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_DESC,
  156. (void *)slapi_ch_strdup(descStr));
  157. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_SYNTAX,
  158. (void *)slapi_ch_strdup(DIRSTRING_SYNTAX_OID));
  159. slapi_matchingrule_register(mrentry);
  160. slapi_matchingrule_get(mrentry,SLAPI_MATCHINGRULE_NAME,
  161. (void *)&tmpStr);
  162. slapi_ch_free((void **)&tmpStr);
  163. slapi_matchingrule_get(mrentry,SLAPI_MATCHINGRULE_OID,
  164. (void *)&tmpStr);
  165. slapi_ch_free((void **)&tmpStr);
  166. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_NAME,
  167. (void *)slapi_ch_strdup(nameSubstring));
  168. PL_strcatn(oidString,sizeof(oidString),".6");
  169. slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_OID,
  170. (void *)slapi_ch_strdup(oidString));
  171. slapi_matchingrule_register(mrentry);
  172. slapi_matchingrule_free(&mrentry,1);
  173. }
  174. for (arg = 6; arg < cargc; ++arg) {
  175. auto coll_id_t* id = (coll_id_t*) slapi_ch_malloc (sizeof (coll_id_t));
  176. id->oid = slapi_ch_strdup (cargv[arg]);
  177. id->profile = profile;
  178. if (collation_ids <= 0) {
  179. collation_id = (const coll_id_t**) slapi_ch_malloc (2 * sizeof (coll_id_t*));
  180. } else {
  181. collation_id = (const coll_id_t**) slapi_ch_realloc
  182. ((void*)collation_id, (collation_ids + 2) * sizeof (coll_id_t*));
  183. }
  184. collation_id [collation_ids++] = id;
  185. collation_id [collation_ids] = NULL;
  186. }
  187. }
  188. } else {
  189. return -1; /* unrecognized */
  190. }
  191. return 0; /* success */
  192. }
  193. typedef struct collation_indexer_t
  194. /* A kind of indexer, implemented using an ICU Collator */
  195. {
  196. UCollator* collator;
  197. UConverter* converter;
  198. struct berval** ix_keys;
  199. int is_default_collator;
  200. } collation_indexer_t;
  201. /*
  202. Caller must ensure that U == NULL and Ulen == 0 the first time called
  203. */
  204. static UErrorCode
  205. SetUnicodeStringFromUTF_8 (UChar** U, int32_t* Ulen, int *isAlloced, const struct berval* bv)
  206. /* Copy the UTF-8 string bv into the UnicodeString U,
  207. but remove leading and trailing whitespace, and
  208. convert consecutive whitespaces into a single space.
  209. Ulen is set to the number of UChars in the array (not necessarily the number of bytes!)
  210. */
  211. {
  212. size_t n;
  213. int32_t len = 0; /* length of non-space string */
  214. UErrorCode err = U_ZERO_ERROR;
  215. const char* s = bv->bv_val;
  216. const char* begin = NULL; /* will point to beginning of non-space in val */
  217. const char* end = NULL; /* will point to the first space after the last non-space char in val */
  218. int32_t nUchars = 0;
  219. if (!bv->bv_len) { /* no value? */
  220. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  221. }
  222. /* first, set s to the first non-space char in bv->bv_val */
  223. for (n = 0; (n < bv->bv_len) && ldap_utf8isspace((char *)s); ) { /* cast away const */
  224. const char *next = LDAP_UTF8NEXT((char *)s); /* cast away const */
  225. n += (next - s); /* count bytes, not chars */
  226. s = next;
  227. }
  228. begin = s; /* begin points to first non-space char in val */
  229. if (n >= bv->bv_len) { /* value is all spaces? */
  230. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  231. }
  232. s = bv->bv_val + (bv->bv_len-1); /* move s to last char of bv_val */
  233. end = s; /* end points at last char of bv_val - may change below */
  234. /* find the last non-null and non-space char of val */
  235. for (n = bv->bv_len; (n > 0) && (!*s || ldap_utf8isspace((char *)s));) {
  236. const char *prev = LDAP_UTF8PREV((char *)s);
  237. end = prev;
  238. n -= (s - prev); /* count bytes, not chars */
  239. s = prev;
  240. }
  241. /* end now points at last non-null/non-space of val */
  242. if (n < 0) { /* bogus */
  243. return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  244. }
  245. len = LDAP_UTF8NEXT((char *)end) - begin;
  246. u_strFromUTF8(*U, *Ulen, &nUchars, begin, len, &err);
  247. if (nUchars > *Ulen) { /* need more space */
  248. if (*isAlloced) { /* realloc space */
  249. *U = (UChar *)slapi_ch_realloc((char *)*U, sizeof(UChar) * nUchars);
  250. } else { /* must use malloc */
  251. *U = (UChar *)slapi_ch_malloc(sizeof(UChar) * nUchars);
  252. *isAlloced = 1; /* no longer using fixed buffer */
  253. }
  254. *Ulen = nUchars;
  255. err = U_ZERO_ERROR; /* reset */
  256. u_strFromUTF8(*U, *Ulen, NULL, begin, len, &err);
  257. } else {
  258. *Ulen = nUchars;
  259. }
  260. return err;
  261. }
  262. static struct berval**
  263. collation_index (indexer_t* ix, struct berval** bvec, struct berval** prefixes)
  264. {
  265. collation_indexer_t* etc = (collation_indexer_t*) ix->ix_etc;
  266. struct berval** keys = NULL;
  267. if (bvec) {
  268. char keyBuffer[128]; /* try to use static space buffer to avoid malloc */
  269. int32_t keyLen = sizeof(keyBuffer);
  270. char* key = keyBuffer; /* but key can grow if necessary */
  271. size_t keyn = 0;
  272. struct berval** bv;
  273. UChar charBuffer[128]; /* try to use static space buffer */
  274. int32_t nChars = sizeof(charBuffer)/sizeof(UChar); /* but grow if necessary */
  275. UChar *chars = charBuffer; /* try to reuse this */
  276. int isAlloced = 0; /* using fixed buffer */
  277. for (bv = bvec; *bv; ++bv) {
  278. /* if chars is allocated, nChars will be the capacity and the number of chars in chars */
  279. /* otherwise, nChars will be the number of chars, which may be less than the capacity */
  280. if (!isAlloced) {
  281. nChars = sizeof(charBuffer)/sizeof(UChar); /* reset */
  282. }
  283. if (U_ZERO_ERROR == SetUnicodeStringFromUTF_8 (&chars, &nChars, &isAlloced, *bv)) {
  284. /* nChars is now the number of UChar in chars, which may be less than the
  285. capacity of charBuffer if not allocated */
  286. struct berval* prefix = prefixes ? prefixes[bv-bvec] : NULL;
  287. const size_t prefixLen = prefix ? prefix->bv_len : 0;
  288. struct berval* bk = NULL;
  289. int32_t realLen; /* real length of key, not keyLen which is buffer size */
  290. /* try to get the sort key using key and keyLen; only grow key
  291. if we need to */
  292. /* can use -1 for char len since the conversion from UTF8
  293. null terminates the string */
  294. realLen = ucol_getSortKey(etc->collator, chars, nChars, (uint8_t *)key, keyLen);
  295. if (realLen > keyLen) { /* need more space */
  296. if (key == keyBuffer) {
  297. key = (char*)slapi_ch_malloc(sizeof(char) * realLen);
  298. } else {
  299. key = (char*)slapi_ch_realloc(key, sizeof(char) * realLen);
  300. }
  301. keyLen = ucol_getSortKey(etc->collator, chars, nChars, (uint8_t *)key, realLen);
  302. }
  303. if (realLen > 0) {
  304. bk = (struct berval*) slapi_ch_malloc (sizeof(struct berval));
  305. bk->bv_len = prefixLen + realLen;
  306. bk->bv_val = slapi_ch_malloc (bk->bv_len + 1);
  307. if (prefixLen) {
  308. memcpy(bk->bv_val, prefix->bv_val, prefixLen);
  309. }
  310. memcpy(bk->bv_val + prefixLen, key, realLen);
  311. bk->bv_val[bk->bv_len] = '\0';
  312. LDAPDebug (LDAP_DEBUG_FILTER, "collation_index(%.*s) %lu bytes\n",
  313. bk->bv_len, bk->bv_val, (unsigned long)bk->bv_len);
  314. keys = (struct berval**)
  315. slapi_ch_realloc ((void*)keys, sizeof(struct berval*) * (keyn + 2));
  316. keys[keyn++] = bk;
  317. keys[keyn] = NULL;
  318. }
  319. }
  320. }
  321. if (chars != charBuffer) { /* realloc'ed, need to free */
  322. slapi_ch_free((void **)&chars);
  323. }
  324. if (key != keyBuffer) { /* realloc'ed, need to free */
  325. slapi_ch_free_string(&key);
  326. }
  327. }
  328. if (etc->ix_keys != NULL) ber_bvecfree (etc->ix_keys);
  329. etc->ix_keys = keys;
  330. return keys;
  331. }
  332. static void
  333. collation_indexer_destroy (indexer_t* ix)
  334. /* The destructor function for a collation-based indexer. */
  335. {
  336. collation_indexer_t* etc = (collation_indexer_t*) ix->ix_etc;
  337. if (etc->converter) {
  338. ucnv_close(etc->converter);
  339. etc->converter = NULL;
  340. }
  341. if (!etc->is_default_collator) {
  342. /* Don't delete the default collation - it seems to cause problems */
  343. ucol_close(etc->collator);
  344. etc->collator = NULL;
  345. }
  346. if (etc->ix_keys != NULL) {
  347. ber_bvecfree (etc->ix_keys);
  348. etc->ix_keys = NULL;
  349. }
  350. slapi_ch_free((void**)&ix->ix_etc);
  351. ix->ix_etc = NULL; /* just for hygiene */
  352. }
  353. static UErrorCode
  354. s_newNamedLocaleFromComponents(char **locale, const char *lang, const char *country, const char *variant)
  355. {
  356. UErrorCode err = U_ZERO_ERROR;
  357. int hasLang = (lang && *lang);
  358. int hasC = (country && *country);
  359. int hasVar = (variant && *variant);
  360. *locale = NULL;
  361. if (hasLang) {
  362. *locale = PR_smprintf("%s%s%s%s%s", lang, (hasC ? "_" : ""), (hasC ? country : ""),
  363. (hasVar ? "_" : ""), (hasVar ? variant : ""));
  364. } else {
  365. err = U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
  366. }
  367. return err;
  368. }
  369. indexer_t*
  370. collation_indexer_create (const char* oid)
  371. /* Return a new indexer, based on the collation identified by oid.
  372. Return NULL if this can't be done.
  373. */
  374. {
  375. indexer_t* ix = NULL;
  376. const coll_id_t** id = collation_id;
  377. char* locale = NULL; /* NULL == default locale */
  378. if (id) for (; *id; ++id) {
  379. if (!strcasecmp (oid, (*id)->oid)) {
  380. const coll_profile_t* profile = (*id)->profile;
  381. const int is_default = (profile->language == NULL &&
  382. profile->country == NULL &&
  383. profile->variant == NULL);
  384. UErrorCode err = U_ZERO_ERROR;
  385. if ( ! is_default) {
  386. if (locale) {
  387. PR_smprintf_free(locale);
  388. locale = NULL;
  389. }
  390. err = s_newNamedLocaleFromComponents(&locale,
  391. profile->language,
  392. profile->country,
  393. profile->variant);
  394. }
  395. if (err == U_ZERO_ERROR) {
  396. UCollator* coll = ucol_open(locale, &err);
  397. /*
  398. * If we found exactly the right collator for this locale,
  399. * or if we found a fallback one, or if we are happy with
  400. * the default, use it.
  401. */
  402. if (err == U_ZERO_ERROR || err == U_USING_FALLBACK_WARNING ||
  403. (err == U_USING_DEFAULT_WARNING && is_default)) {
  404. collation_indexer_t* etc = (collation_indexer_t*)
  405. slapi_ch_calloc (1, sizeof (collation_indexer_t));
  406. ix = (indexer_t*) slapi_ch_calloc (1, sizeof (indexer_t));
  407. ucol_setAttribute (coll, UCOL_STRENGTH, profile->strength, &err);
  408. if (err != U_ZERO_ERROR) {
  409. LDAPDebug (LDAP_DEBUG_ANY, "collation_indexer_create: could not "
  410. "set the collator strength for oid %s to %d: err %d\n",
  411. oid, profile->strength, err);
  412. }
  413. ucol_setAttribute (coll, UCOL_DECOMPOSITION_MODE, profile->decomposition, &err);
  414. if (err != U_ZERO_ERROR) {
  415. LDAPDebug (LDAP_DEBUG_ANY, "collation_indexer_create: could not "
  416. "set the collator decomposition mode for oid %s to %d: err %d\n",
  417. oid, profile->decomposition, err);
  418. }
  419. etc->collator = coll;
  420. etc->is_default_collator = is_default;
  421. for (id = collation_id; *id; ++id) {
  422. if ((*id)->profile == profile) {
  423. break; /* found the 'official' id */
  424. }
  425. }
  426. ix->ix_etc = etc;
  427. ix->ix_oid = (*id)->oid;
  428. ix->ix_index = collation_index;
  429. ix->ix_destroy = collation_indexer_destroy;
  430. break; /* return */
  431. /* free (etc); */
  432. /* free (ix); */
  433. } else if (err == U_USING_DEFAULT_WARNING) {
  434. LDAPDebug (LDAP_DEBUG_FILTER, "collation_indexer_create: could not "
  435. "create an indexer for OID %s for locale %s and could not "
  436. "use default locale\n",
  437. oid, (locale ? locale : "(default)"), NULL);
  438. } else { /* error */
  439. LDAPDebug (LDAP_DEBUG_FILTER, "collation_indexer_create: could not "
  440. "create an indexer for OID %s for locale %s: err = %d\n",
  441. oid, (locale ? locale : "(default)"), err);
  442. }
  443. if (coll) {
  444. ucol_close (coll);
  445. coll = NULL;
  446. }
  447. }
  448. break; /* failed to create the specified collator */
  449. }
  450. }
  451. if (locale) {
  452. PR_smprintf_free(locale);
  453. locale = NULL;
  454. }
  455. return ix;
  456. }