phonetic.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670
  1. /** BEGIN COPYRIGHT BLOCK
  2. * This Program is free software; you can redistribute it and/or modify it under
  3. * the terms of the GNU General Public License as published by the Free Software
  4. * Foundation; version 2 of the License.
  5. *
  6. * This Program is distributed in the hope that it will be useful, but WITHOUT
  7. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  8. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  9. *
  10. * You should have received a copy of the GNU General Public License along with
  11. * this Program; if not, write to the Free Software Foundation, Inc., 59 Temple
  12. * Place, Suite 330, Boston, MA 02111-1307 USA.
  13. *
  14. * In addition, as a special exception, Red Hat, Inc. gives You the additional
  15. * right to link the code of this Program with code not covered under the GNU
  16. * General Public License ("Non-GPL Code") and to distribute linked combinations
  17. * including the two, subject to the limitations in this paragraph. Non-GPL Code
  18. * permitted under this exception must only link to the code of this Program
  19. * through those well defined interfaces identified in the file named EXCEPTION
  20. * found in the source code files (the "Approved Interfaces"). The files of
  21. * Non-GPL Code may instantiate templates or use macros or inline functions from
  22. * the Approved Interfaces without causing the resulting work to be covered by
  23. * the GNU General Public License. Only Red Hat, Inc. may make changes or
  24. * additions to the list of Approved Interfaces. You must obey the GNU General
  25. * Public License in all respects for all of the Program code and other code used
  26. * in conjunction with the Program except the Non-GPL Code covered by this
  27. * exception. If you modify this file, you may extend this exception to your
  28. * version of the file, but you are not obligated to do so. If you do not wish to
  29. * provide this exception without modification, you must delete this exception
  30. * statement from your version and license this file solely under the GPL without
  31. * exception.
  32. *
  33. *
  34. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  35. * Copyright (C) 2005 Red Hat, Inc.
  36. * All rights reserved.
  37. * END COPYRIGHT BLOCK **/
  38. #ifdef HAVE_CONFIG_H
  39. # include <config.h>
  40. #endif
  41. /* phonetic.c - routines to do phonetic matching */
  42. #include <stdio.h>
  43. #include <string.h>
  44. #include <ctype.h>
  45. #include <sys/types.h>
  46. #include "syntax.h"
  47. #include "portable.h"
  48. #if !defined(METAPHONE) && !defined(SOUNDEX)
  49. #define METAPHONE
  50. #endif
  51. #define iswordbreak(s) \
  52. (isascii(*(s)) \
  53. ? (isspace(*(s)) || \
  54. ispunct(*(s)) || \
  55. isdigit(*(s)) || \
  56. *(s) == '\0') \
  57. : utf8iswordbreak(s))
  58. static int
  59. utf8iswordbreak( const char* s )
  60. {
  61. switch( LDAP_UTF8GETCC( s )) {
  62. case 0x00A0: /* non-breaking space */
  63. case 0x3000: /* ideographic space */
  64. case 0xFEFF: /* zero-width non-breaking space */
  65. return 1;
  66. default: break;
  67. }
  68. return 0;
  69. }
  70. char *
  71. first_word( char *s )
  72. {
  73. if ( s == NULL ) {
  74. return( NULL );
  75. }
  76. while ( iswordbreak( s ) ) {
  77. if ( *s == '\0' ) {
  78. return( NULL );
  79. } else {
  80. LDAP_UTF8INC( s );
  81. }
  82. }
  83. return( s );
  84. }
  85. char *
  86. next_word( char *s )
  87. {
  88. if ( s == NULL ) {
  89. return( NULL );
  90. }
  91. while ( ! iswordbreak( s ) ) {
  92. LDAP_UTF8INC( s );
  93. }
  94. while ( iswordbreak( s ) ) {
  95. if ( *s == '\0' ) {
  96. return( NULL );
  97. } else {
  98. LDAP_UTF8INC( s );
  99. }
  100. }
  101. return( s );
  102. }
  103. char *
  104. word_dup( char *w )
  105. {
  106. char *s, *ret;
  107. char save;
  108. for ( s = w; !iswordbreak( s ); LDAP_UTF8INC( s ))
  109. ; /* NULL */
  110. save = *s;
  111. *s = '\0';
  112. ret = slapi_ch_strdup( w );
  113. *s = save;
  114. return( ret );
  115. }
  116. #ifndef MAXPHONEMELEN
  117. #define MAXPHONEMELEN 6
  118. #endif
  119. #if defined(SOUNDEX)
  120. /* lifted from isode-8.0 */
  121. char *
  122. phonetic( char *s )
  123. {
  124. char code, adjacent, ch;
  125. char *p;
  126. char **c;
  127. int i, cmax;
  128. char phoneme[MAXPHONEMELEN + 1];
  129. p = s;
  130. if ( p == NULL || *p == '\0' ) {
  131. return( NULL );
  132. }
  133. adjacent = '0';
  134. phoneme[0] = TOUPPER(*p);
  135. phoneme[1] = '\0';
  136. for ( i = 0; i < 99 && (! iswordbreak(p)); LDAP_UTF8INC( p )) {
  137. ch = TOUPPER (*p);
  138. code = '0';
  139. switch (ch) {
  140. case 'B':
  141. case 'F':
  142. case 'P':
  143. case 'V':
  144. code = (adjacent != '1') ? '1' : '0';
  145. break;
  146. case 'S':
  147. case 'C':
  148. case 'G':
  149. case 'J':
  150. case 'K':
  151. case 'Q':
  152. case 'X':
  153. case 'Z':
  154. code = (adjacent != '2') ? '2' : '0';
  155. break;
  156. case 'D':
  157. case 'T':
  158. code = (adjacent != '3') ? '3' : '0';
  159. break;
  160. case 'L':
  161. code = (adjacent != '4') ? '4' : '0';
  162. break;
  163. case 'M':
  164. case 'N':
  165. code = (adjacent != '5') ? '5' : '0';
  166. break;
  167. case 'R':
  168. code = (adjacent != '6') ? '6' : '0';
  169. break;
  170. default:
  171. adjacent = '0';
  172. }
  173. if ( i == 0 ) {
  174. adjacent = code;
  175. i++;
  176. } else if ( code != '0' ) {
  177. if ( i == MAXPHONEMELEN )
  178. break;
  179. adjacent = phoneme[i] = code;
  180. i++;
  181. }
  182. }
  183. if ( i > 0 )
  184. phoneme[i] = '\0';
  185. return( slapi_ch_strdup( phoneme ) );
  186. }
  187. #else
  188. #if defined(METAPHONE)
  189. /*
  190. * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
  191. * author Gary A. Parker, with changes by Bernard Tiffany of the
  192. * University of Michigan, and more changes by Tim Howes of the
  193. * University of Michigan.
  194. */
  195. /* Character coding array */
  196. static char vsvfn[26] = {
  197. 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
  198. /* A B C D E F G H I J K L M */
  199. 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
  200. /* N O P Q R S T U V W X Y Z */
  201. /* Macros to access character coding array */
  202. #define vowel(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 1) || /* AEIOU */ \
  203. (((*(x)==0xC3) && (*((x)+1))) ? ((0x80<=*((x)+1) && *((x)+1)<0x87) || \
  204. (0x88<=*((x)+1) && *((x)+1)<0x90) || (0x92<=*((x)+1) && *((x)+1)<0x97) || \
  205. (0x98<=*((x)+1) && *((x)+1)<0x9D) || (0xA0<=*((x)+1) && *((x)+1)<0xA7) || \
  206. (0xA8<=*((x)+1) && *((x)+1)<0xB0) || (0xB2<=*((x)+1) && *((x)+1)<0xB7) || \
  207. (0xB8<=*((x)+1) && *((x)+1)<0xBD)) : 0 ) /* Latin-1 characters */ )
  208. /*
  209. case 0xC3:
  210. */
  211. #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */
  212. #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */
  213. #define frontv(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 8) || /* EIY */ \
  214. (((*(x)==0xC3) && (*((x)+1))) ? ((0x88<=*((x)+1) && *((x)+1)<0x90) || \
  215. (0xA8<=*((x)+1) && *((x)+1)<0xB0)) : 0 ) /* Latin-1 E/I */ )
  216. #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
  217. char *
  218. phonetic( char *Word )
  219. {
  220. unsigned char *n_start, *n, *n_end; /* pointers to string */
  221. char *metaph_end; /* pointers to metaph */
  222. unsigned char ntrans[42]; /* word with uppercase letters */
  223. int KSflag; /* state flag for X -> KS */
  224. char buf[MAXPHONEMELEN + 2];
  225. char *Metaph;
  226. /*
  227. * Copy Word to internal buffer, dropping non-alphabetic characters
  228. * and converting to upper case
  229. */
  230. n = ntrans + 4; n_end = ntrans + 35;
  231. while (!iswordbreak( Word ) && n < n_end) {
  232. if (isascii(*Word)) {
  233. if (isalpha(*Word)) {
  234. *n++ = TOUPPER(*Word);
  235. }
  236. ++Word;
  237. } else {
  238. auto const size_t len = LDAP_UTF8COPY((char *)n, Word);
  239. n += len; Word += len;
  240. }
  241. }
  242. Metaph = buf;
  243. *Metaph = '\0';
  244. if (n == ntrans + 4) {
  245. return( slapi_ch_strdup( buf ) ); /* Return if null */
  246. }
  247. n_end = n; /* Set n_end to end of string */
  248. /* ntrans[0] will always be == 0 */
  249. ntrans[0] = '\0';
  250. ntrans[1] = '\0';
  251. ntrans[2] = '\0';
  252. ntrans[3] = '\0';
  253. *n++ = 0;
  254. *n++ = 0;
  255. *n++ = 0;
  256. *n = 0; /* Pad with nulls */
  257. n = ntrans + 4; /* Assign pointer to start */
  258. /* Check for PN, KN, GN, AE, WR, WH, and X at start */
  259. switch (*n) {
  260. case 'P':
  261. case 'K':
  262. case 'G':
  263. /* 'PN', 'KN', 'GN' becomes 'N' */
  264. if (*(n + 1) == 'N')
  265. *n++ = 0;
  266. break;
  267. case 'A':
  268. /* 'AE' becomes 'E' */
  269. if (*(n + 1) == 'E')
  270. *n++ = 0;
  271. break;
  272. case 'W':
  273. /* 'WR' becomes 'R', and 'WH' to 'H' */
  274. if (*(n + 1) == 'R')
  275. *n++ = 0;
  276. else if (*(n + 1) == 'H') {
  277. *n++ = 0;
  278. }
  279. break;
  280. case 'X':
  281. /* 'X' becomes 'S' */
  282. *n = 'S';
  283. break;
  284. case 0xC3:
  285. switch (*(n+1)) {
  286. case 0x80:
  287. case 0x81:
  288. case 0x82:
  289. case 0x83:
  290. case 0x84:
  291. case 0x85:
  292. *n++ = 0;
  293. *n = 'A';
  294. break;
  295. case 0x87:
  296. *n++ = 0;
  297. *n = 'C';
  298. break;
  299. case 0x86:
  300. case 0x88:
  301. case 0x89:
  302. case 0x8A:
  303. case 0x8B:
  304. *n++ = 0;
  305. *n = 'E';
  306. break;
  307. case 0x8C:
  308. case 0x8D:
  309. case 0x8E:
  310. case 0x8F:
  311. *n++ = 0;
  312. *n = 'I';
  313. break;
  314. case 0x90: /* eth: TH */
  315. *n++ = 0;
  316. *n = '0';
  317. break;
  318. case 0x91:
  319. *n++ = 0;
  320. *n = 'N';
  321. break;
  322. case 0x92:
  323. case 0x93:
  324. case 0x94:
  325. case 0x95:
  326. case 0x96:
  327. case 0x98:
  328. *n++ = 0;
  329. *n = 'O';
  330. break;
  331. case 0x99:
  332. case 0x9A:
  333. case 0x9B:
  334. case 0x9C:
  335. *n++ = 0;
  336. *n = 'U';
  337. break;
  338. case 0x9D:
  339. *n++ = 0;
  340. *n = 'Y';
  341. break;
  342. case 0x9E:
  343. *n++ = 0;
  344. *n = '0'; /* thorn: TH */
  345. break;
  346. case 0x9F:
  347. *n++ = 0;
  348. *n = 's';
  349. break;
  350. case 0xA0:
  351. case 0xA1:
  352. case 0xA2:
  353. case 0xA3:
  354. case 0xA4:
  355. case 0xA5:
  356. *n++ = 0;
  357. *n = 'a';
  358. break;
  359. case 0xA6:
  360. *n++ = 0;
  361. *n = 'e';
  362. break;
  363. case 0xA7:
  364. *n++ = 0;
  365. *n = 'c';
  366. break;
  367. case 0xA8:
  368. case 0xA9:
  369. case 0xAA:
  370. case 0xAB:
  371. *n++ = 0;
  372. *n = 'e';
  373. break;
  374. case 0xAC:
  375. case 0xAD:
  376. case 0xAE:
  377. case 0xAF:
  378. *n++ = 0;
  379. *n = 'i';
  380. break;
  381. case 0xB0:
  382. *n++ = 0;
  383. *n = '0'; /* eth: th */
  384. break;
  385. case 0xB1:
  386. *n++ = 0;
  387. *n = 'n';
  388. break;
  389. case 0xB2:
  390. case 0xB3:
  391. case 0xB4:
  392. case 0xB5:
  393. case 0xB6:
  394. case 0xB8:
  395. *n++ = 0;
  396. *n = 'o';
  397. break;
  398. case 0xB9:
  399. case 0xBA:
  400. case 0xBB:
  401. case 0xBC:
  402. *n++ = 0;
  403. *n = 'u';
  404. break;
  405. case 0xBD:
  406. case 0xBF:
  407. *n++ = 0;
  408. *n = 'y';
  409. break;
  410. case 0xBE:
  411. *n++ = 0;
  412. *n = '0'; /* thorn: th */
  413. break;
  414. }
  415. break;
  416. }
  417. /*
  418. * Now, loop step through string, stopping at end of string or when
  419. * the computed 'metaph' is MAXPHONEMELEN characters long
  420. */
  421. KSflag = 0; /* state flag for KS translation */
  422. for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
  423. n <= n_end && Metaph < metaph_end; n++) {
  424. if (KSflag) {
  425. KSflag = 0;
  426. *Metaph++ = 'S';
  427. } else if (!isascii(*n)) {
  428. switch (*n) {
  429. case 0xC3:
  430. if (n+1 <= n_end) {
  431. switch (*(++n)) {
  432. case 0x87: /* C with cedilla */
  433. case 0x9F: /* ess-zed */
  434. case 0xA7: /* c with cedilla */
  435. *Metaph++ = 'S';
  436. break;
  437. case 0x90: /* eth: TH */
  438. case 0x9E: /* thorn: TH */
  439. case 0xB0: /* eth: th */
  440. case 0xBE: /* thorn: th */
  441. *Metaph++ = '0';
  442. break;
  443. case 0x91:
  444. case 0xB1:
  445. *Metaph++ = 'N';
  446. break;
  447. case 0x9D:
  448. case 0xBD:
  449. case 0xBF:
  450. *Metaph++ = 'Y';
  451. break;
  452. default: /* skipping the rest */
  453. break;
  454. }
  455. }
  456. break;
  457. default:
  458. *Metaph++ = *n;
  459. }
  460. } else {
  461. /* Drop duplicates except for CC */
  462. if (*(n - 1) == *n && *n != 'C')
  463. continue;
  464. /* Check for F J L M N R or first letter vowel */
  465. if (same(*n) || (n == n_start && vowel(n))) {
  466. *Metaph++ = *n;
  467. } else {
  468. switch (*n) {
  469. case 'B':
  470. /*
  471. * B unless in -MB
  472. */
  473. if (n < (n_end - 1) && *(n - 1) != 'M') {
  474. *Metaph++ = *n;
  475. }
  476. break;
  477. case 'C':
  478. /*
  479. * X if in -CIA-, -CH- else S if in
  480. * -CI-, -CE-, -CY- else dropped if
  481. * in -SCI-, -SCE-, -SCY- else K
  482. */
  483. if (*(n - 1) != 'S' || !frontv((n + 1))) {
  484. if (*(n + 1) == 'I' && *(n + 2) == 'A') {
  485. *Metaph++ = 'X';
  486. } else if (frontv((n + 1))) {
  487. *Metaph++ = 'S';
  488. } else if (*(n + 1) == 'H') {
  489. *Metaph++ = ((n == n_start && !vowel((n + 2)))
  490. || *(n - 1) == 'S')
  491. ? (char) 'K' : (char) 'X';
  492. } else {
  493. *Metaph++ = 'K';
  494. }
  495. }
  496. break;
  497. case 'D':
  498. /*
  499. * J if in DGE or DGI or DGY else T
  500. */
  501. *Metaph++ = (*(n + 1) == 'G' && frontv((n + 2)))
  502. ? (char) 'J' : (char) 'T';
  503. break;
  504. case 'G':
  505. /*
  506. * F if in -GH and not B--GH, D--GH,
  507. * -H--GH, -H---GH else dropped if
  508. * -GNED, -GN, -DGE-, -DGI-, -DGY-
  509. * else J if in -GE-, -GI-, -GY- and
  510. * not GG else K
  511. */
  512. if ((*(n + 1) != 'J' || vowel((n + 2))) &&
  513. (*(n + 1) != 'N' || ((n + 1) < n_end &&
  514. (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
  515. (*(n - 1) != 'D' || !frontv((n + 1))))
  516. *Metaph++ = (frontv((n + 1)) &&
  517. *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
  518. else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
  519. *(n - 4) != 'H')
  520. *Metaph++ = 'F';
  521. break;
  522. case 'H':
  523. /*
  524. * H if before a vowel and not after
  525. * C, G, P, S, T else dropped
  526. */
  527. if (!varson(*(n - 1)) && (!vowel((n - 1)) ||
  528. vowel((n + 1))))
  529. *Metaph++ = 'H';
  530. break;
  531. case 'K':
  532. /*
  533. * dropped if after C else K
  534. */
  535. if (*(n - 1) != 'C')
  536. *Metaph++ = 'K';
  537. break;
  538. case 'P':
  539. /*
  540. * F if before H, else P
  541. */
  542. *Metaph++ = *(n + 1) == 'H' ?
  543. (char) 'F' : (char) 'P';
  544. break;
  545. case 'Q':
  546. /*
  547. * K
  548. */
  549. *Metaph++ = 'K';
  550. break;
  551. case 'S':
  552. /*
  553. * X in -SH-, -SIO- or -SIA- else S
  554. */
  555. *Metaph++ = (*(n + 1) == 'H' ||
  556. (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  557. *(n + 2) == 'A')))
  558. ? (char) 'X' : (char) 'S';
  559. break;
  560. case 'T':
  561. /*
  562. * X in -TIA- or -TIO- else 0 (zero)
  563. * before H else dropped if in -TCH-
  564. * else T
  565. */
  566. if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  567. *(n + 2) == 'A'))
  568. *Metaph++ = 'X';
  569. else if (*(n + 1) == 'H')
  570. *Metaph++ = '0';
  571. else if (*(n + 1) != 'C' || *(n + 2) != 'H')
  572. *Metaph++ = 'T';
  573. break;
  574. case 'V':
  575. /*
  576. * F
  577. */
  578. *Metaph++ = 'F';
  579. break;
  580. case 'W':
  581. /*
  582. * W after a vowel, else dropped
  583. */
  584. case 'Y':
  585. /*
  586. * Y unless followed by a vowel
  587. */
  588. if (vowel((n + 1)))
  589. *Metaph++ = *n;
  590. break;
  591. case 'X':
  592. /*
  593. * KS
  594. */
  595. if (n == n_start)
  596. *Metaph++ = 'S';
  597. else {
  598. *Metaph++ = 'K'; /* Insert K, then S */
  599. KSflag = 1;
  600. }
  601. break;
  602. case 'Z':
  603. /*
  604. * S
  605. */
  606. *Metaph++ = 'S';
  607. break;
  608. }
  609. }
  610. }
  611. }
  612. *Metaph = 0; /* Null terminate */
  613. return( slapi_ch_strdup( buf ) );
  614. }
  615. #endif /* METAPHONE */
  616. #endif /* !SOUNDEX */