phonetic.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649
  1. /** BEGIN COPYRIGHT BLOCK
  2. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  3. * Copyright (C) 2005 Red Hat, Inc.
  4. * All rights reserved.
  5. *
  6. * License: GPL (version 3 or any later version).
  7. * See LICENSE for details.
  8. * END COPYRIGHT BLOCK **/
  9. #ifdef HAVE_CONFIG_H
  10. #include <config.h>
  11. #endif
  12. /* phonetic.c - routines to do phonetic matching */
  13. #include <stdio.h>
  14. #include <string.h>
  15. #include <ctype.h>
  16. #include <sys/types.h>
  17. #include "syntax.h"
  18. #include "portable.h"
  19. #if !defined(METAPHONE) && !defined(SOUNDEX)
  20. #define METAPHONE
  21. #endif
  22. #define iswordbreak(s) \
  23. (isascii(*(s)) \
  24. ? (isspace(*(s)) || \
  25. ispunct(*(s)) || \
  26. isdigit(*(s)) || \
  27. *(s) == '\0') \
  28. : utf8iswordbreak(s))
  29. static int
  30. utf8iswordbreak(const char *s)
  31. {
  32. switch (LDAP_UTF8GETCC(s)) {
  33. case 0x00A0: /* non-breaking space */
  34. case 0x3000: /* ideographic space */
  35. case 0xFEFF: /* zero-width non-breaking space */
  36. return 1;
  37. default:
  38. break;
  39. }
  40. return 0;
  41. }
  42. char *
  43. first_word(char *s)
  44. {
  45. if (s == NULL) {
  46. return (NULL);
  47. }
  48. while (iswordbreak(s)) {
  49. if (*s == '\0') {
  50. return (NULL);
  51. } else {
  52. LDAP_UTF8INC(s);
  53. }
  54. }
  55. return (s);
  56. }
  57. char *
  58. next_word(char *s)
  59. {
  60. if (s == NULL) {
  61. return (NULL);
  62. }
  63. while (!iswordbreak(s)) {
  64. LDAP_UTF8INC(s);
  65. }
  66. while (iswordbreak(s)) {
  67. if (*s == '\0') {
  68. return (NULL);
  69. } else {
  70. LDAP_UTF8INC(s);
  71. }
  72. }
  73. return (s);
  74. }
  75. char *
  76. word_dup(char *w)
  77. {
  78. char *s, *ret;
  79. char save;
  80. for (s = w; !iswordbreak(s); LDAP_UTF8INC(s))
  81. ; /* NULL */
  82. save = *s;
  83. *s = '\0';
  84. ret = slapi_ch_strdup(w);
  85. *s = save;
  86. return (ret);
  87. }
  88. #ifndef MAXPHONEMELEN
  89. #define MAXPHONEMELEN 6
  90. #endif
  91. #if defined(SOUNDEX)
  92. /* lifted from isode-8.0 */
  93. char *
  94. phonetic(char *s)
  95. {
  96. char code, adjacent, ch;
  97. char *p;
  98. char **c;
  99. int i, cmax;
  100. char phoneme[MAXPHONEMELEN + 1];
  101. p = s;
  102. if (p == NULL || *p == '\0') {
  103. return (NULL);
  104. }
  105. adjacent = '0';
  106. phoneme[0] = TOUPPER(*p);
  107. phoneme[1] = '\0';
  108. for (i = 0; i < 99 && (!iswordbreak(p)); LDAP_UTF8INC(p)) {
  109. ch = TOUPPER(*p);
  110. code = '0';
  111. switch (ch) {
  112. case 'B':
  113. case 'F':
  114. case 'P':
  115. case 'V':
  116. code = (adjacent != '1') ? '1' : '0';
  117. break;
  118. case 'S':
  119. case 'C':
  120. case 'G':
  121. case 'J':
  122. case 'K':
  123. case 'Q':
  124. case 'X':
  125. case 'Z':
  126. code = (adjacent != '2') ? '2' : '0';
  127. break;
  128. case 'D':
  129. case 'T':
  130. code = (adjacent != '3') ? '3' : '0';
  131. break;
  132. case 'L':
  133. code = (adjacent != '4') ? '4' : '0';
  134. break;
  135. case 'M':
  136. case 'N':
  137. code = (adjacent != '5') ? '5' : '0';
  138. break;
  139. case 'R':
  140. code = (adjacent != '6') ? '6' : '0';
  141. break;
  142. default:
  143. adjacent = '0';
  144. }
  145. if (i == 0) {
  146. adjacent = code;
  147. i++;
  148. } else if (code != '0') {
  149. if (i == MAXPHONEMELEN)
  150. break;
  151. adjacent = phoneme[i] = code;
  152. i++;
  153. }
  154. }
  155. if (i > 0)
  156. phoneme[i] = '\0';
  157. return (slapi_ch_strdup(phoneme));
  158. }
  159. #else
  160. #if defined(METAPHONE)
  161. /*
  162. * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
  163. * author Gary A. Parker, with changes by Bernard Tiffany of the
  164. * University of Michigan, and more changes by Tim Howes of the
  165. * University of Michigan.
  166. */
  167. /* Character coding array */
  168. static char vsvfn[26] = {
  169. 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
  170. /* A B C D E F G H I J K L M */
  171. 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
  172. /* N O P Q R S T U V W X Y Z */
  173. /* Macros to access character coding array */
  174. #define vowel(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 1) || /* AEIOU */ \
  175. (((*(x) == 0xC3) && (*((x) + 1))) ? ((0x80 <= *((x) + 1) && *((x) + 1) < 0x87) || \
  176. (0x88 <= *((x) + 1) && *((x) + 1) < 0x90) || (0x92 <= *((x) + 1) && *((x) + 1) < 0x97) || \
  177. (0x98 <= *((x) + 1) && *((x) + 1) < 0x9D) || (0xA0 <= *((x) + 1) && *((x) + 1) < 0xA7) || \
  178. (0xA8 <= *((x) + 1) && *((x) + 1) < 0xB0) || (0xB2 <= *((x) + 1) && *((x) + 1) < 0xB7) || \
  179. (0xB8 <= *((x) + 1) && *((x) + 1) < 0xBD)) \
  180. : 0) /* Latin-1 characters */)
  181. /*
  182. case 0xC3:
  183. */
  184. #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */
  185. #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */
  186. #define frontv(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 8) || /* EIY */ \
  187. (((*(x) == 0xC3) && (*((x) + 1))) ? ((0x88 <= *((x) + 1) && *((x) + 1) < 0x90) || \
  188. (0xA8 <= *((x) + 1) && *((x) + 1) < 0xB0)) \
  189. : 0) /* Latin-1 E/I */)
  190. #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
  191. char *
  192. phonetic(char *Word)
  193. {
  194. unsigned char *n_start, *n, *n_end; /* pointers to string */
  195. char *metaph_end; /* pointers to metaph */
  196. unsigned char ntrans[42]; /* word with uppercase letters */
  197. int KSflag; /* state flag for X -> KS */
  198. char buf[MAXPHONEMELEN + 2];
  199. char *Metaph;
  200. /*
  201. * Copy Word to internal buffer, dropping non-alphabetic characters
  202. * and converting to upper case
  203. */
  204. n = ntrans + 4;
  205. n_end = ntrans + 35;
  206. while (!iswordbreak(Word) && n < n_end) {
  207. if (isascii(*Word)) {
  208. if (isalpha(*Word)) {
  209. *n++ = TOUPPER(*Word);
  210. }
  211. ++Word;
  212. } else {
  213. auto const size_t len = LDAP_UTF8COPY((char *)n, Word);
  214. n += len;
  215. Word += len;
  216. }
  217. }
  218. Metaph = buf;
  219. *Metaph = '\0';
  220. if (n == ntrans + 4) {
  221. return (slapi_ch_strdup(buf)); /* Return if null */
  222. }
  223. n_end = n; /* Set n_end to end of string */
  224. /* ntrans[0] will always be == 0 */
  225. ntrans[0] = '\0';
  226. ntrans[1] = '\0';
  227. ntrans[2] = '\0';
  228. ntrans[3] = '\0';
  229. *n++ = 0;
  230. *n++ = 0;
  231. *n++ = 0;
  232. *n = 0; /* Pad with nulls */
  233. n = ntrans + 4; /* Assign pointer to start */
  234. /* Check for PN, KN, GN, AE, WR, WH, and X at start */
  235. switch (*n) {
  236. case 'P':
  237. case 'K':
  238. case 'G':
  239. /* 'PN', 'KN', 'GN' becomes 'N' */
  240. if (*(n + 1) == 'N')
  241. *n++ = 0;
  242. break;
  243. case 'A':
  244. /* 'AE' becomes 'E' */
  245. if (*(n + 1) == 'E')
  246. *n++ = 0;
  247. break;
  248. case 'W':
  249. /* 'WR' becomes 'R', and 'WH' to 'H' */
  250. if (*(n + 1) == 'R')
  251. *n++ = 0;
  252. else if (*(n + 1) == 'H') {
  253. *n++ = 0;
  254. }
  255. break;
  256. case 'X':
  257. /* 'X' becomes 'S' */
  258. *n = 'S';
  259. break;
  260. case 0xC3:
  261. switch (*(n + 1)) {
  262. case 0x80:
  263. case 0x81:
  264. case 0x82:
  265. case 0x83:
  266. case 0x84:
  267. case 0x85:
  268. *n++ = 0;
  269. *n = 'A';
  270. break;
  271. case 0x87:
  272. *n++ = 0;
  273. *n = 'C';
  274. break;
  275. case 0x86:
  276. case 0x88:
  277. case 0x89:
  278. case 0x8A:
  279. case 0x8B:
  280. *n++ = 0;
  281. *n = 'E';
  282. break;
  283. case 0x8C:
  284. case 0x8D:
  285. case 0x8E:
  286. case 0x8F:
  287. *n++ = 0;
  288. *n = 'I';
  289. break;
  290. case 0x90: /* eth: TH */
  291. *n++ = 0;
  292. *n = '0';
  293. break;
  294. case 0x91:
  295. *n++ = 0;
  296. *n = 'N';
  297. break;
  298. case 0x92:
  299. case 0x93:
  300. case 0x94:
  301. case 0x95:
  302. case 0x96:
  303. case 0x98:
  304. *n++ = 0;
  305. *n = 'O';
  306. break;
  307. case 0x99:
  308. case 0x9A:
  309. case 0x9B:
  310. case 0x9C:
  311. *n++ = 0;
  312. *n = 'U';
  313. break;
  314. case 0x9D:
  315. *n++ = 0;
  316. *n = 'Y';
  317. break;
  318. case 0x9E:
  319. *n++ = 0;
  320. *n = '0'; /* thorn: TH */
  321. break;
  322. case 0x9F:
  323. *n++ = 0;
  324. *n = 's';
  325. break;
  326. case 0xA0:
  327. case 0xA1:
  328. case 0xA2:
  329. case 0xA3:
  330. case 0xA4:
  331. case 0xA5:
  332. *n++ = 0;
  333. *n = 'a';
  334. break;
  335. case 0xA6:
  336. *n++ = 0;
  337. *n = 'e';
  338. break;
  339. case 0xA7:
  340. *n++ = 0;
  341. *n = 'c';
  342. break;
  343. case 0xA8:
  344. case 0xA9:
  345. case 0xAA:
  346. case 0xAB:
  347. *n++ = 0;
  348. *n = 'e';
  349. break;
  350. case 0xAC:
  351. case 0xAD:
  352. case 0xAE:
  353. case 0xAF:
  354. *n++ = 0;
  355. *n = 'i';
  356. break;
  357. case 0xB0:
  358. *n++ = 0;
  359. *n = '0'; /* eth: th */
  360. break;
  361. case 0xB1:
  362. *n++ = 0;
  363. *n = 'n';
  364. break;
  365. case 0xB2:
  366. case 0xB3:
  367. case 0xB4:
  368. case 0xB5:
  369. case 0xB6:
  370. case 0xB8:
  371. *n++ = 0;
  372. *n = 'o';
  373. break;
  374. case 0xB9:
  375. case 0xBA:
  376. case 0xBB:
  377. case 0xBC:
  378. *n++ = 0;
  379. *n = 'u';
  380. break;
  381. case 0xBD:
  382. case 0xBF:
  383. *n++ = 0;
  384. *n = 'y';
  385. break;
  386. case 0xBE:
  387. *n++ = 0;
  388. *n = '0'; /* thorn: th */
  389. break;
  390. }
  391. break;
  392. }
  393. /*
  394. * Now, loop step through string, stopping at end of string or when
  395. * the computed 'metaph' is MAXPHONEMELEN characters long
  396. */
  397. KSflag = 0; /* state flag for KS translation */
  398. for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
  399. n <= n_end && Metaph < metaph_end; n++) {
  400. if (KSflag) {
  401. KSflag = 0;
  402. *Metaph++ = 'S';
  403. } else if (!isascii(*n)) {
  404. switch (*n) {
  405. case 0xC3:
  406. if (n + 1 <= n_end) {
  407. switch (*(++n)) {
  408. case 0x87: /* C with cedilla */
  409. case 0x9F: /* ess-zed */
  410. case 0xA7: /* c with cedilla */
  411. *Metaph++ = 'S';
  412. break;
  413. case 0x90: /* eth: TH */
  414. case 0x9E: /* thorn: TH */
  415. case 0xB0: /* eth: th */
  416. case 0xBE: /* thorn: th */
  417. *Metaph++ = '0';
  418. break;
  419. case 0x91:
  420. case 0xB1:
  421. *Metaph++ = 'N';
  422. break;
  423. case 0x9D:
  424. case 0xBD:
  425. case 0xBF:
  426. *Metaph++ = 'Y';
  427. break;
  428. default: /* skipping the rest */
  429. break;
  430. }
  431. }
  432. break;
  433. default:
  434. *Metaph++ = *n;
  435. }
  436. } else {
  437. /* Drop duplicates except for CC */
  438. if (*(n - 1) == *n && *n != 'C')
  439. continue;
  440. /* Check for F J L M N R or first letter vowel */
  441. if (same(*n) || (n == n_start && vowel(n))) {
  442. *Metaph++ = *n;
  443. } else {
  444. switch (*n) {
  445. case 'B':
  446. /*
  447. * B unless in -MB
  448. */
  449. if (n < (n_end - 1) && *(n - 1) != 'M') {
  450. *Metaph++ = *n;
  451. }
  452. break;
  453. case 'C':
  454. /*
  455. * X if in -CIA-, -CH- else S if in
  456. * -CI-, -CE-, -CY- else dropped if
  457. * in -SCI-, -SCE-, -SCY- else K
  458. */
  459. if (*(n - 1) != 'S' || !frontv((n + 1))) {
  460. if (*(n + 1) == 'I' && *(n + 2) == 'A') {
  461. *Metaph++ = 'X';
  462. } else if (frontv((n + 1))) {
  463. *Metaph++ = 'S';
  464. } else if (*(n + 1) == 'H') {
  465. *Metaph++ = ((n == n_start && !vowel((n + 2))) || *(n - 1) == 'S')
  466. ? (char)'K'
  467. : (char)'X';
  468. } else {
  469. *Metaph++ = 'K';
  470. }
  471. }
  472. break;
  473. case 'D':
  474. /*
  475. * J if in DGE or DGI or DGY else T
  476. */
  477. *Metaph++ = (*(n + 1) == 'G' && frontv((n + 2)))
  478. ? (char)'J'
  479. : (char)'T';
  480. break;
  481. case 'G':
  482. /*
  483. * F if in -GH and not B--GH, D--GH,
  484. * -H--GH, -H---GH else dropped if
  485. * -GNED, -GN, -DGE-, -DGI-, -DGY-
  486. * else J if in -GE-, -GI-, -GY- and
  487. * not GG else K
  488. */
  489. if ((*(n + 1) != 'J' || vowel((n + 2))) &&
  490. (*(n + 1) != 'N' || ((n + 1) < n_end &&
  491. (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
  492. (*(n - 1) != 'D' || !frontv((n + 1))))
  493. *Metaph++ = (frontv((n + 1)) &&
  494. *(n + 2) != 'G')
  495. ? (char)'G'
  496. : (char)'K';
  497. else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
  498. *(n - 4) != 'H')
  499. *Metaph++ = 'F';
  500. break;
  501. case 'H':
  502. /*
  503. * H if before a vowel and not after
  504. * C, G, P, S, T else dropped
  505. */
  506. if (!varson(*(n - 1)) && (!vowel((n - 1)) ||
  507. vowel((n + 1))))
  508. *Metaph++ = 'H';
  509. break;
  510. case 'K':
  511. /*
  512. * dropped if after C else K
  513. */
  514. if (*(n - 1) != 'C')
  515. *Metaph++ = 'K';
  516. break;
  517. case 'P':
  518. /*
  519. * F if before H, else P
  520. */
  521. *Metaph++ = *(n + 1) == 'H' ? (char)'F' : (char)'P';
  522. break;
  523. case 'Q':
  524. /*
  525. * K
  526. */
  527. *Metaph++ = 'K';
  528. break;
  529. case 'S':
  530. /*
  531. * X in -SH-, -SIO- or -SIA- else S
  532. */
  533. *Metaph++ = (*(n + 1) == 'H' ||
  534. (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  535. *(n + 2) == 'A')))
  536. ? (char)'X'
  537. : (char)'S';
  538. break;
  539. case 'T':
  540. /*
  541. * X in -TIA- or -TIO- else 0 (zero)
  542. * before H else dropped if in -TCH-
  543. * else T
  544. */
  545. if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  546. *(n + 2) == 'A'))
  547. *Metaph++ = 'X';
  548. else if (*(n + 1) == 'H')
  549. *Metaph++ = '0';
  550. else if (*(n + 1) != 'C' || *(n + 2) != 'H')
  551. *Metaph++ = 'T';
  552. break;
  553. case 'V':
  554. /*
  555. * F
  556. */
  557. *Metaph++ = 'F';
  558. break;
  559. case 'W':
  560. /*
  561. * W after a vowel, else dropped
  562. */
  563. case 'Y':
  564. /*
  565. * Y unless followed by a vowel
  566. */
  567. if (vowel((n + 1)))
  568. *Metaph++ = *n;
  569. break;
  570. case 'X':
  571. /*
  572. * KS
  573. */
  574. if (n == n_start)
  575. *Metaph++ = 'S';
  576. else {
  577. *Metaph++ = 'K'; /* Insert K, then S */
  578. KSflag = 1;
  579. }
  580. break;
  581. case 'Z':
  582. /*
  583. * S
  584. */
  585. *Metaph++ = 'S';
  586. break;
  587. }
  588. }
  589. }
  590. }
  591. *Metaph = 0; /* Null terminate */
  592. return (slapi_ch_strdup(buf));
  593. }
  594. #endif /* METAPHONE */
  595. #endif /* !SOUNDEX */