phonetic.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641
  1. /** BEGIN COPYRIGHT BLOCK
  2. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  3. * Copyright (C) 2005 Red Hat, Inc.
  4. * All rights reserved.
  5. *
  6. * License: GPL (version 3 or any later version).
  7. * See LICENSE for details.
  8. * END COPYRIGHT BLOCK **/
  9. #ifdef HAVE_CONFIG_H
  10. # include <config.h>
  11. #endif
  12. /* phonetic.c - routines to do phonetic matching */
  13. #include <stdio.h>
  14. #include <string.h>
  15. #include <ctype.h>
  16. #include <sys/types.h>
  17. #include "syntax.h"
  18. #include "portable.h"
  19. #if !defined(METAPHONE) && !defined(SOUNDEX)
  20. #define METAPHONE
  21. #endif
  22. #define iswordbreak(s) \
  23. (isascii(*(s)) \
  24. ? (isspace(*(s)) || \
  25. ispunct(*(s)) || \
  26. isdigit(*(s)) || \
  27. *(s) == '\0') \
  28. : utf8iswordbreak(s))
  29. static int
  30. utf8iswordbreak( const char* s )
  31. {
  32. switch( LDAP_UTF8GETCC( s )) {
  33. case 0x00A0: /* non-breaking space */
  34. case 0x3000: /* ideographic space */
  35. case 0xFEFF: /* zero-width non-breaking space */
  36. return 1;
  37. default: break;
  38. }
  39. return 0;
  40. }
  41. char *
  42. first_word( char *s )
  43. {
  44. if ( s == NULL ) {
  45. return( NULL );
  46. }
  47. while ( iswordbreak( s ) ) {
  48. if ( *s == '\0' ) {
  49. return( NULL );
  50. } else {
  51. LDAP_UTF8INC( s );
  52. }
  53. }
  54. return( s );
  55. }
  56. char *
  57. next_word( char *s )
  58. {
  59. if ( s == NULL ) {
  60. return( NULL );
  61. }
  62. while ( ! iswordbreak( s ) ) {
  63. LDAP_UTF8INC( s );
  64. }
  65. while ( iswordbreak( s ) ) {
  66. if ( *s == '\0' ) {
  67. return( NULL );
  68. } else {
  69. LDAP_UTF8INC( s );
  70. }
  71. }
  72. return( s );
  73. }
  74. char *
  75. word_dup( char *w )
  76. {
  77. char *s, *ret;
  78. char save;
  79. for ( s = w; !iswordbreak( s ); LDAP_UTF8INC( s ))
  80. ; /* NULL */
  81. save = *s;
  82. *s = '\0';
  83. ret = slapi_ch_strdup( w );
  84. *s = save;
  85. return( ret );
  86. }
  87. #ifndef MAXPHONEMELEN
  88. #define MAXPHONEMELEN 6
  89. #endif
  90. #if defined(SOUNDEX)
  91. /* lifted from isode-8.0 */
  92. char *
  93. phonetic( char *s )
  94. {
  95. char code, adjacent, ch;
  96. char *p;
  97. char **c;
  98. int i, cmax;
  99. char phoneme[MAXPHONEMELEN + 1];
  100. p = s;
  101. if ( p == NULL || *p == '\0' ) {
  102. return( NULL );
  103. }
  104. adjacent = '0';
  105. phoneme[0] = TOUPPER(*p);
  106. phoneme[1] = '\0';
  107. for ( i = 0; i < 99 && (! iswordbreak(p)); LDAP_UTF8INC( p )) {
  108. ch = TOUPPER (*p);
  109. code = '0';
  110. switch (ch) {
  111. case 'B':
  112. case 'F':
  113. case 'P':
  114. case 'V':
  115. code = (adjacent != '1') ? '1' : '0';
  116. break;
  117. case 'S':
  118. case 'C':
  119. case 'G':
  120. case 'J':
  121. case 'K':
  122. case 'Q':
  123. case 'X':
  124. case 'Z':
  125. code = (adjacent != '2') ? '2' : '0';
  126. break;
  127. case 'D':
  128. case 'T':
  129. code = (adjacent != '3') ? '3' : '0';
  130. break;
  131. case 'L':
  132. code = (adjacent != '4') ? '4' : '0';
  133. break;
  134. case 'M':
  135. case 'N':
  136. code = (adjacent != '5') ? '5' : '0';
  137. break;
  138. case 'R':
  139. code = (adjacent != '6') ? '6' : '0';
  140. break;
  141. default:
  142. adjacent = '0';
  143. }
  144. if ( i == 0 ) {
  145. adjacent = code;
  146. i++;
  147. } else if ( code != '0' ) {
  148. if ( i == MAXPHONEMELEN )
  149. break;
  150. adjacent = phoneme[i] = code;
  151. i++;
  152. }
  153. }
  154. if ( i > 0 )
  155. phoneme[i] = '\0';
  156. return( slapi_ch_strdup( phoneme ) );
  157. }
  158. #else
  159. #if defined(METAPHONE)
  160. /*
  161. * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
  162. * author Gary A. Parker, with changes by Bernard Tiffany of the
  163. * University of Michigan, and more changes by Tim Howes of the
  164. * University of Michigan.
  165. */
  166. /* Character coding array */
  167. static char vsvfn[26] = {
  168. 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
  169. /* A B C D E F G H I J K L M */
  170. 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
  171. /* N O P Q R S T U V W X Y Z */
  172. /* Macros to access character coding array */
  173. #define vowel(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 1) || /* AEIOU */ \
  174. (((*(x)==0xC3) && (*((x)+1))) ? ((0x80<=*((x)+1) && *((x)+1)<0x87) || \
  175. (0x88<=*((x)+1) && *((x)+1)<0x90) || (0x92<=*((x)+1) && *((x)+1)<0x97) || \
  176. (0x98<=*((x)+1) && *((x)+1)<0x9D) || (0xA0<=*((x)+1) && *((x)+1)<0xA7) || \
  177. (0xA8<=*((x)+1) && *((x)+1)<0xB0) || (0xB2<=*((x)+1) && *((x)+1)<0xB7) || \
  178. (0xB8<=*((x)+1) && *((x)+1)<0xBD)) : 0 ) /* Latin-1 characters */ )
  179. /*
  180. case 0xC3:
  181. */
  182. #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */
  183. #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */
  184. #define frontv(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 8) || /* EIY */ \
  185. (((*(x)==0xC3) && (*((x)+1))) ? ((0x88<=*((x)+1) && *((x)+1)<0x90) || \
  186. (0xA8<=*((x)+1) && *((x)+1)<0xB0)) : 0 ) /* Latin-1 E/I */ )
  187. #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
  188. char *
  189. phonetic( char *Word )
  190. {
  191. unsigned char *n_start, *n, *n_end; /* pointers to string */
  192. char *metaph_end; /* pointers to metaph */
  193. unsigned char ntrans[42]; /* word with uppercase letters */
  194. int KSflag; /* state flag for X -> KS */
  195. char buf[MAXPHONEMELEN + 2];
  196. char *Metaph;
  197. /*
  198. * Copy Word to internal buffer, dropping non-alphabetic characters
  199. * and converting to upper case
  200. */
  201. n = ntrans + 4; n_end = ntrans + 35;
  202. while (!iswordbreak( Word ) && n < n_end) {
  203. if (isascii(*Word)) {
  204. if (isalpha(*Word)) {
  205. *n++ = TOUPPER(*Word);
  206. }
  207. ++Word;
  208. } else {
  209. auto const size_t len = LDAP_UTF8COPY((char *)n, Word);
  210. n += len; Word += len;
  211. }
  212. }
  213. Metaph = buf;
  214. *Metaph = '\0';
  215. if (n == ntrans + 4) {
  216. return( slapi_ch_strdup( buf ) ); /* Return if null */
  217. }
  218. n_end = n; /* Set n_end to end of string */
  219. /* ntrans[0] will always be == 0 */
  220. ntrans[0] = '\0';
  221. ntrans[1] = '\0';
  222. ntrans[2] = '\0';
  223. ntrans[3] = '\0';
  224. *n++ = 0;
  225. *n++ = 0;
  226. *n++ = 0;
  227. *n = 0; /* Pad with nulls */
  228. n = ntrans + 4; /* Assign pointer to start */
  229. /* Check for PN, KN, GN, AE, WR, WH, and X at start */
  230. switch (*n) {
  231. case 'P':
  232. case 'K':
  233. case 'G':
  234. /* 'PN', 'KN', 'GN' becomes 'N' */
  235. if (*(n + 1) == 'N')
  236. *n++ = 0;
  237. break;
  238. case 'A':
  239. /* 'AE' becomes 'E' */
  240. if (*(n + 1) == 'E')
  241. *n++ = 0;
  242. break;
  243. case 'W':
  244. /* 'WR' becomes 'R', and 'WH' to 'H' */
  245. if (*(n + 1) == 'R')
  246. *n++ = 0;
  247. else if (*(n + 1) == 'H') {
  248. *n++ = 0;
  249. }
  250. break;
  251. case 'X':
  252. /* 'X' becomes 'S' */
  253. *n = 'S';
  254. break;
  255. case 0xC3:
  256. switch (*(n+1)) {
  257. case 0x80:
  258. case 0x81:
  259. case 0x82:
  260. case 0x83:
  261. case 0x84:
  262. case 0x85:
  263. *n++ = 0;
  264. *n = 'A';
  265. break;
  266. case 0x87:
  267. *n++ = 0;
  268. *n = 'C';
  269. break;
  270. case 0x86:
  271. case 0x88:
  272. case 0x89:
  273. case 0x8A:
  274. case 0x8B:
  275. *n++ = 0;
  276. *n = 'E';
  277. break;
  278. case 0x8C:
  279. case 0x8D:
  280. case 0x8E:
  281. case 0x8F:
  282. *n++ = 0;
  283. *n = 'I';
  284. break;
  285. case 0x90: /* eth: TH */
  286. *n++ = 0;
  287. *n = '0';
  288. break;
  289. case 0x91:
  290. *n++ = 0;
  291. *n = 'N';
  292. break;
  293. case 0x92:
  294. case 0x93:
  295. case 0x94:
  296. case 0x95:
  297. case 0x96:
  298. case 0x98:
  299. *n++ = 0;
  300. *n = 'O';
  301. break;
  302. case 0x99:
  303. case 0x9A:
  304. case 0x9B:
  305. case 0x9C:
  306. *n++ = 0;
  307. *n = 'U';
  308. break;
  309. case 0x9D:
  310. *n++ = 0;
  311. *n = 'Y';
  312. break;
  313. case 0x9E:
  314. *n++ = 0;
  315. *n = '0'; /* thorn: TH */
  316. break;
  317. case 0x9F:
  318. *n++ = 0;
  319. *n = 's';
  320. break;
  321. case 0xA0:
  322. case 0xA1:
  323. case 0xA2:
  324. case 0xA3:
  325. case 0xA4:
  326. case 0xA5:
  327. *n++ = 0;
  328. *n = 'a';
  329. break;
  330. case 0xA6:
  331. *n++ = 0;
  332. *n = 'e';
  333. break;
  334. case 0xA7:
  335. *n++ = 0;
  336. *n = 'c';
  337. break;
  338. case 0xA8:
  339. case 0xA9:
  340. case 0xAA:
  341. case 0xAB:
  342. *n++ = 0;
  343. *n = 'e';
  344. break;
  345. case 0xAC:
  346. case 0xAD:
  347. case 0xAE:
  348. case 0xAF:
  349. *n++ = 0;
  350. *n = 'i';
  351. break;
  352. case 0xB0:
  353. *n++ = 0;
  354. *n = '0'; /* eth: th */
  355. break;
  356. case 0xB1:
  357. *n++ = 0;
  358. *n = 'n';
  359. break;
  360. case 0xB2:
  361. case 0xB3:
  362. case 0xB4:
  363. case 0xB5:
  364. case 0xB6:
  365. case 0xB8:
  366. *n++ = 0;
  367. *n = 'o';
  368. break;
  369. case 0xB9:
  370. case 0xBA:
  371. case 0xBB:
  372. case 0xBC:
  373. *n++ = 0;
  374. *n = 'u';
  375. break;
  376. case 0xBD:
  377. case 0xBF:
  378. *n++ = 0;
  379. *n = 'y';
  380. break;
  381. case 0xBE:
  382. *n++ = 0;
  383. *n = '0'; /* thorn: th */
  384. break;
  385. }
  386. break;
  387. }
  388. /*
  389. * Now, loop step through string, stopping at end of string or when
  390. * the computed 'metaph' is MAXPHONEMELEN characters long
  391. */
  392. KSflag = 0; /* state flag for KS translation */
  393. for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
  394. n <= n_end && Metaph < metaph_end; n++) {
  395. if (KSflag) {
  396. KSflag = 0;
  397. *Metaph++ = 'S';
  398. } else if (!isascii(*n)) {
  399. switch (*n) {
  400. case 0xC3:
  401. if (n+1 <= n_end) {
  402. switch (*(++n)) {
  403. case 0x87: /* C with cedilla */
  404. case 0x9F: /* ess-zed */
  405. case 0xA7: /* c with cedilla */
  406. *Metaph++ = 'S';
  407. break;
  408. case 0x90: /* eth: TH */
  409. case 0x9E: /* thorn: TH */
  410. case 0xB0: /* eth: th */
  411. case 0xBE: /* thorn: th */
  412. *Metaph++ = '0';
  413. break;
  414. case 0x91:
  415. case 0xB1:
  416. *Metaph++ = 'N';
  417. break;
  418. case 0x9D:
  419. case 0xBD:
  420. case 0xBF:
  421. *Metaph++ = 'Y';
  422. break;
  423. default: /* skipping the rest */
  424. break;
  425. }
  426. }
  427. break;
  428. default:
  429. *Metaph++ = *n;
  430. }
  431. } else {
  432. /* Drop duplicates except for CC */
  433. if (*(n - 1) == *n && *n != 'C')
  434. continue;
  435. /* Check for F J L M N R or first letter vowel */
  436. if (same(*n) || (n == n_start && vowel(n))) {
  437. *Metaph++ = *n;
  438. } else {
  439. switch (*n) {
  440. case 'B':
  441. /*
  442. * B unless in -MB
  443. */
  444. if (n < (n_end - 1) && *(n - 1) != 'M') {
  445. *Metaph++ = *n;
  446. }
  447. break;
  448. case 'C':
  449. /*
  450. * X if in -CIA-, -CH- else S if in
  451. * -CI-, -CE-, -CY- else dropped if
  452. * in -SCI-, -SCE-, -SCY- else K
  453. */
  454. if (*(n - 1) != 'S' || !frontv((n + 1))) {
  455. if (*(n + 1) == 'I' && *(n + 2) == 'A') {
  456. *Metaph++ = 'X';
  457. } else if (frontv((n + 1))) {
  458. *Metaph++ = 'S';
  459. } else if (*(n + 1) == 'H') {
  460. *Metaph++ = ((n == n_start && !vowel((n + 2)))
  461. || *(n - 1) == 'S')
  462. ? (char) 'K' : (char) 'X';
  463. } else {
  464. *Metaph++ = 'K';
  465. }
  466. }
  467. break;
  468. case 'D':
  469. /*
  470. * J if in DGE or DGI or DGY else T
  471. */
  472. *Metaph++ = (*(n + 1) == 'G' && frontv((n + 2)))
  473. ? (char) 'J' : (char) 'T';
  474. break;
  475. case 'G':
  476. /*
  477. * F if in -GH and not B--GH, D--GH,
  478. * -H--GH, -H---GH else dropped if
  479. * -GNED, -GN, -DGE-, -DGI-, -DGY-
  480. * else J if in -GE-, -GI-, -GY- and
  481. * not GG else K
  482. */
  483. if ((*(n + 1) != 'J' || vowel((n + 2))) &&
  484. (*(n + 1) != 'N' || ((n + 1) < n_end &&
  485. (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
  486. (*(n - 1) != 'D' || !frontv((n + 1))))
  487. *Metaph++ = (frontv((n + 1)) &&
  488. *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
  489. else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
  490. *(n - 4) != 'H')
  491. *Metaph++ = 'F';
  492. break;
  493. case 'H':
  494. /*
  495. * H if before a vowel and not after
  496. * C, G, P, S, T else dropped
  497. */
  498. if (!varson(*(n - 1)) && (!vowel((n - 1)) ||
  499. vowel((n + 1))))
  500. *Metaph++ = 'H';
  501. break;
  502. case 'K':
  503. /*
  504. * dropped if after C else K
  505. */
  506. if (*(n - 1) != 'C')
  507. *Metaph++ = 'K';
  508. break;
  509. case 'P':
  510. /*
  511. * F if before H, else P
  512. */
  513. *Metaph++ = *(n + 1) == 'H' ?
  514. (char) 'F' : (char) 'P';
  515. break;
  516. case 'Q':
  517. /*
  518. * K
  519. */
  520. *Metaph++ = 'K';
  521. break;
  522. case 'S':
  523. /*
  524. * X in -SH-, -SIO- or -SIA- else S
  525. */
  526. *Metaph++ = (*(n + 1) == 'H' ||
  527. (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  528. *(n + 2) == 'A')))
  529. ? (char) 'X' : (char) 'S';
  530. break;
  531. case 'T':
  532. /*
  533. * X in -TIA- or -TIO- else 0 (zero)
  534. * before H else dropped if in -TCH-
  535. * else T
  536. */
  537. if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  538. *(n + 2) == 'A'))
  539. *Metaph++ = 'X';
  540. else if (*(n + 1) == 'H')
  541. *Metaph++ = '0';
  542. else if (*(n + 1) != 'C' || *(n + 2) != 'H')
  543. *Metaph++ = 'T';
  544. break;
  545. case 'V':
  546. /*
  547. * F
  548. */
  549. *Metaph++ = 'F';
  550. break;
  551. case 'W':
  552. /*
  553. * W after a vowel, else dropped
  554. */
  555. case 'Y':
  556. /*
  557. * Y unless followed by a vowel
  558. */
  559. if (vowel((n + 1)))
  560. *Metaph++ = *n;
  561. break;
  562. case 'X':
  563. /*
  564. * KS
  565. */
  566. if (n == n_start)
  567. *Metaph++ = 'S';
  568. else {
  569. *Metaph++ = 'K'; /* Insert K, then S */
  570. KSflag = 1;
  571. }
  572. break;
  573. case 'Z':
  574. /*
  575. * S
  576. */
  577. *Metaph++ = 'S';
  578. break;
  579. }
  580. }
  581. }
  582. }
  583. *Metaph = 0; /* Null terminate */
  584. return( slapi_ch_strdup( buf ) );
  585. }
  586. #endif /* METAPHONE */
  587. #endif /* !SOUNDEX */