phonetic.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. /** BEGIN COPYRIGHT BLOCK
  2. * This Program is free software; you can redistribute it and/or modify it under
  3. * the terms of the GNU General Public License as published by the Free Software
  4. * Foundation; version 2 of the License.
  5. *
  6. * This Program is distributed in the hope that it will be useful, but WITHOUT
  7. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  8. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  9. *
  10. * You should have received a copy of the GNU General Public License along with
  11. * this Program; if not, write to the Free Software Foundation, Inc., 59 Temple
  12. * Place, Suite 330, Boston, MA 02111-1307 USA.
  13. *
  14. * In addition, as a special exception, Red Hat, Inc. gives You the additional
  15. * right to link the code of this Program with code not covered under the GNU
  16. * General Public License ("Non-GPL Code") and to distribute linked combinations
  17. * including the two, subject to the limitations in this paragraph. Non-GPL Code
  18. * permitted under this exception must only link to the code of this Program
  19. * through those well defined interfaces identified in the file named EXCEPTION
  20. * found in the source code files (the "Approved Interfaces"). The files of
  21. * Non-GPL Code may instantiate templates or use macros or inline functions from
  22. * the Approved Interfaces without causing the resulting work to be covered by
  23. * the GNU General Public License. Only Red Hat, Inc. may make changes or
  24. * additions to the list of Approved Interfaces. You must obey the GNU General
  25. * Public License in all respects for all of the Program code and other code used
  26. * in conjunction with the Program except the Non-GPL Code covered by this
  27. * exception. If you modify this file, you may extend this exception to your
  28. * version of the file, but you are not obligated to do so. If you do not wish to
  29. * provide this exception without modification, you must delete this exception
  30. * statement from your version and license this file solely under the GPL without
  31. * exception.
  32. *
  33. *
  34. * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
  35. * Copyright (C) 2005 Red Hat, Inc.
  36. * All rights reserved.
  37. * END COPYRIGHT BLOCK **/
  38. /* phonetic.c - routines to do phonetic matching */
  39. #include <stdio.h>
  40. #include <string.h>
  41. #include <ctype.h>
  42. #include <sys/types.h>
  43. #include "syntax.h"
  44. #include "portable.h"
  45. #if !defined(METAPHONE) && !defined(SOUNDEX)
  46. #define METAPHONE
  47. #endif
  48. #define iswordbreak(s) \
  49. (isascii(*(s)) \
  50. ? (isspace(*(s)) || \
  51. ispunct(*(s)) || \
  52. isdigit(*(s)) || \
  53. *(s) == '\0') \
  54. : utf8iswordbreak(s))
  55. static int
  56. utf8iswordbreak( const char* s )
  57. {
  58. switch( LDAP_UTF8GETCC( s )) {
  59. case 0x00A0: /* non-breaking space */
  60. case 0x3000: /* ideographic space */
  61. case 0xFEFF: /* zero-width non-breaking space */
  62. return 1;
  63. default: break;
  64. }
  65. return 0;
  66. }
  67. char *
  68. first_word( char *s )
  69. {
  70. if ( s == NULL ) {
  71. return( NULL );
  72. }
  73. while ( iswordbreak( s ) ) {
  74. if ( *s == '\0' ) {
  75. return( NULL );
  76. } else {
  77. LDAP_UTF8INC( s );
  78. }
  79. }
  80. return( s );
  81. }
  82. char *
  83. next_word( char *s )
  84. {
  85. if ( s == NULL ) {
  86. return( NULL );
  87. }
  88. while ( ! iswordbreak( s ) ) {
  89. LDAP_UTF8INC( s );
  90. }
  91. while ( iswordbreak( s ) ) {
  92. if ( *s == '\0' ) {
  93. return( NULL );
  94. } else {
  95. LDAP_UTF8INC( s );
  96. }
  97. }
  98. return( s );
  99. }
  100. char *
  101. word_dup( char *w )
  102. {
  103. char *s, *ret;
  104. char save;
  105. for ( s = w; !iswordbreak( s ); LDAP_UTF8INC( s ))
  106. ; /* NULL */
  107. save = *s;
  108. *s = '\0';
  109. ret = slapi_ch_strdup( w );
  110. *s = save;
  111. return( ret );
  112. }
  113. #ifndef MAXPHONEMELEN
  114. #define MAXPHONEMELEN 4
  115. #endif
  116. #if defined(SOUNDEX)
  117. /* lifted from isode-8.0 */
  118. char *
  119. phonetic( char *s )
  120. {
  121. char code, adjacent, ch;
  122. char *p;
  123. char **c;
  124. int i, cmax;
  125. char phoneme[MAXPHONEMELEN + 1];
  126. p = s;
  127. if ( p == NULL || *p == '\0' ) {
  128. return( NULL );
  129. }
  130. adjacent = '0';
  131. phoneme[0] = TOUPPER(*p);
  132. phoneme[1] = '\0';
  133. for ( i = 0; i < 99 && (! iswordbreak(p)); LDAP_UTF8INC( p )) {
  134. ch = TOUPPER (*p);
  135. code = '0';
  136. switch (ch) {
  137. case 'B':
  138. case 'F':
  139. case 'P':
  140. case 'V':
  141. code = (adjacent != '1') ? '1' : '0';
  142. break;
  143. case 'S':
  144. case 'C':
  145. case 'G':
  146. case 'J':
  147. case 'K':
  148. case 'Q':
  149. case 'X':
  150. case 'Z':
  151. code = (adjacent != '2') ? '2' : '0';
  152. break;
  153. case 'D':
  154. case 'T':
  155. code = (adjacent != '3') ? '3' : '0';
  156. break;
  157. case 'L':
  158. code = (adjacent != '4') ? '4' : '0';
  159. break;
  160. case 'M':
  161. case 'N':
  162. code = (adjacent != '5') ? '5' : '0';
  163. break;
  164. case 'R':
  165. code = (adjacent != '6') ? '6' : '0';
  166. break;
  167. default:
  168. adjacent = '0';
  169. }
  170. if ( i == 0 ) {
  171. adjacent = code;
  172. i++;
  173. } else if ( code != '0' ) {
  174. if ( i == MAXPHONEMELEN )
  175. break;
  176. adjacent = phoneme[i] = code;
  177. i++;
  178. }
  179. }
  180. if ( i > 0 )
  181. phoneme[i] = '\0';
  182. return( slapi_ch_strdup( phoneme ) );
  183. }
  184. #else
  185. #if defined(METAPHONE)
  186. /*
  187. * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
  188. * author Gary A. Parker, with changes by Bernard Tiffany of the
  189. * University of Michigan, and more changes by Tim Howes of the
  190. * University of Michigan.
  191. */
  192. /* Character coding array */
  193. static char vsvfn[26] = {
  194. 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
  195. /* A B C D E F G H I J K L M */
  196. 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
  197. /* N O P Q R S T U V W X Y Z */
  198. /* Macros to access character coding array */
  199. #define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */
  200. #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */
  201. #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */
  202. #define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */
  203. #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
  204. char *
  205. phonetic( char *Word )
  206. {
  207. char *n, *n_start, *n_end; /* pointers to string */
  208. char *metaph_end; /* pointers to metaph */
  209. char ntrans[42]; /* word with uppercase letters */
  210. int KSflag; /* state flag for X -> KS */
  211. char buf[MAXPHONEMELEN + 2];
  212. char *Metaph;
  213. /*
  214. * Copy Word to internal buffer, dropping non-alphabetic characters
  215. * and converting to upper case
  216. */
  217. n = ntrans + 4; n_end = ntrans + 35;
  218. while (!iswordbreak( Word ) && n < n_end) {
  219. if (isascii(*Word)) {
  220. if (isalpha(*Word)) {
  221. *n++ = TOUPPER(*Word);
  222. }
  223. ++Word;
  224. } else {
  225. auto const size_t len = LDAP_UTF8COPY(n, Word);
  226. n += len; Word += len;
  227. }
  228. }
  229. Metaph = buf;
  230. *Metaph = '\0';
  231. if (n == ntrans + 4) {
  232. return( slapi_ch_strdup( buf ) ); /* Return if null */
  233. }
  234. n_end = n; /* Set n_end to end of string */
  235. /* ntrans[0] will always be == 0 */
  236. ntrans[0] = '\0';
  237. ntrans[1] = '\0';
  238. ntrans[2] = '\0';
  239. ntrans[3] = '\0';
  240. *n++ = 0;
  241. *n++ = 0;
  242. *n++ = 0;
  243. *n = 0; /* Pad with nulls */
  244. n = ntrans + 4; /* Assign pointer to start */
  245. /* Check for PN, KN, GN, AE, WR, WH, and X at start */
  246. switch (*n) {
  247. case 'P':
  248. case 'K':
  249. case 'G':
  250. /* 'PN', 'KN', 'GN' becomes 'N' */
  251. if (*(n + 1) == 'N')
  252. *n++ = 0;
  253. break;
  254. case 'A':
  255. /* 'AE' becomes 'E' */
  256. if (*(n + 1) == 'E')
  257. *n++ = 0;
  258. break;
  259. case 'W':
  260. /* 'WR' becomes 'R', and 'WH' to 'H' */
  261. if (*(n + 1) == 'R')
  262. *n++ = 0;
  263. else if (*(n + 1) == 'H') {
  264. *(n + 1) = *n;
  265. *n++ = 0;
  266. }
  267. break;
  268. case 'X':
  269. /* 'X' becomes 'S' */
  270. *n = 'S';
  271. break;
  272. }
  273. /*
  274. * Now, loop step through string, stopping at end of string or when
  275. * the computed 'metaph' is MAXPHONEMELEN characters long
  276. */
  277. KSflag = 0; /* state flag for KS translation */
  278. for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
  279. n <= n_end && Metaph < metaph_end; n++) {
  280. if (KSflag) {
  281. KSflag = 0;
  282. *Metaph++ = 'S';
  283. } else if (!isascii(*n)) {
  284. *Metaph++ = *n;
  285. } else {
  286. /* Drop duplicates except for CC */
  287. if (*(n - 1) == *n && *n != 'C')
  288. continue;
  289. /* Check for F J L M N R or first letter vowel */
  290. if (same(*n) || (n == n_start && vowel(*n))) {
  291. *Metaph++ = *n;
  292. } else {
  293. switch (*n) {
  294. case 'B':
  295. /*
  296. * B unless in -MB
  297. */
  298. if (n < (n_end - 1) && *(n - 1) != 'M') {
  299. *Metaph++ = *n;
  300. }
  301. break;
  302. case 'C':
  303. /*
  304. * X if in -CIA-, -CH- else S if in
  305. * -CI-, -CE-, -CY- else dropped if
  306. * in -SCI-, -SCE-, -SCY- else K
  307. */
  308. if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
  309. if (*(n + 1) == 'I' && *(n + 2) == 'A') {
  310. *Metaph++ = 'X';
  311. } else if (frontv(*(n + 1))) {
  312. *Metaph++ = 'S';
  313. } else if (*(n + 1) == 'H') {
  314. *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
  315. || *(n - 1) == 'S')
  316. ? (char) 'K' : (char) 'X';
  317. } else {
  318. *Metaph++ = 'K';
  319. }
  320. }
  321. break;
  322. case 'D':
  323. /*
  324. * J if in DGE or DGI or DGY else T
  325. */
  326. *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
  327. ? (char) 'J' : (char) 'T';
  328. break;
  329. case 'G':
  330. /*
  331. * F if in -GH and not B--GH, D--GH,
  332. * -H--GH, -H---GH else dropped if
  333. * -GNED, -GN, -DGE-, -DGI-, -DGY-
  334. * else J if in -GE-, -GI-, -GY- and
  335. * not GG else K
  336. */
  337. if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
  338. (*(n + 1) != 'N' || ((n + 1) < n_end &&
  339. (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
  340. (*(n - 1) != 'D' || !frontv(*(n + 1))))
  341. *Metaph++ = (frontv(*(n + 1)) &&
  342. *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
  343. else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
  344. *(n - 4) != 'H')
  345. *Metaph++ = 'F';
  346. break;
  347. case 'H':
  348. /*
  349. * H if before a vowel and not after
  350. * C, G, P, S, T else dropped
  351. */
  352. if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
  353. vowel(*(n + 1))))
  354. *Metaph++ = 'H';
  355. break;
  356. case 'K':
  357. /*
  358. * dropped if after C else K
  359. */
  360. if (*(n - 1) != 'C')
  361. *Metaph++ = 'K';
  362. break;
  363. case 'P':
  364. /*
  365. * F if before H, else P
  366. */
  367. *Metaph++ = *(n + 1) == 'H' ?
  368. (char) 'F' : (char) 'P';
  369. break;
  370. case 'Q':
  371. /*
  372. * K
  373. */
  374. *Metaph++ = 'K';
  375. break;
  376. case 'S':
  377. /*
  378. * X in -SH-, -SIO- or -SIA- else S
  379. */
  380. *Metaph++ = (*(n + 1) == 'H' ||
  381. (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  382. *(n + 2) == 'A')))
  383. ? (char) 'X' : (char) 'S';
  384. break;
  385. case 'T':
  386. /*
  387. * X in -TIA- or -TIO- else 0 (zero)
  388. * before H else dropped if in -TCH-
  389. * else T
  390. */
  391. if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
  392. *(n + 2) == 'A'))
  393. *Metaph++ = 'X';
  394. else if (*(n + 1) == 'H')
  395. *Metaph++ = '0';
  396. else if (*(n + 1) != 'C' || *(n + 2) != 'H')
  397. *Metaph++ = 'T';
  398. break;
  399. case 'V':
  400. /*
  401. * F
  402. */
  403. *Metaph++ = 'F';
  404. break;
  405. case 'W':
  406. /*
  407. * W after a vowel, else dropped
  408. */
  409. case 'Y':
  410. /*
  411. * Y unless followed by a vowel
  412. */
  413. if (vowel(*(n + 1)))
  414. *Metaph++ = *n;
  415. break;
  416. case 'X':
  417. /*
  418. * KS
  419. */
  420. if (n == n_start)
  421. *Metaph++ = 'S';
  422. else {
  423. *Metaph++ = 'K'; /* Insert K, then S */
  424. KSflag = 1;
  425. }
  426. break;
  427. case 'Z':
  428. /*
  429. * S
  430. */
  431. *Metaph++ = 'S';
  432. break;
  433. }
  434. }
  435. }
  436. }
  437. *Metaph = 0; /* Null terminate */
  438. return( slapi_ch_strdup( buf ) );
  439. }
  440. #endif /* METAPHONE */
  441. #endif /* !SOUNDEX */