Apq
/
389-ds-base
의 미러 https://github.com/389ds/389-ds-base.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649
							/** BEGIN COPYRIGHT BLOCK
 * Copyright (C) 2001 Sun Microsystems, Inc. Used by permission.
 * Copyright (C) 2005 Red Hat, Inc.
 * All rights reserved.
 *
 * License: GPL (version 3 or any later version).
 * See LICENSE for details.
 * END COPYRIGHT BLOCK **/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* phonetic.c - routines to do phonetic matching */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include "syntax.h"
#include "portable.h"

#if !defined(METAPHONE) && !defined(SOUNDEX)
#define METAPHONE
#endif

#define iswordbreak(s)       \
    (isascii(*(s))           \
         ? (isspace(*(s)) || \
            ispunct(*(s)) || \
            isdigit(*(s)) || \
            *(s) == '\0')    \
         : utf8iswordbreak(s))

static int
utf8iswordbreak(const char *s)
{
    switch (LDAP_UTF8GETCC(s)) {
    case 0x00A0: /* non-breaking space */
    case 0x3000: /* ideographic space */
    case 0xFEFF: /* zero-width non-breaking space */
        return 1;
    default:
        break;
    }
    return 0;
}

char *
first_word(char *s)
{
    if (s == NULL) {
        return (NULL);
    }

    while (iswordbreak(s)) {
        if (*s == '\0') {
            return (NULL);
        } else {
            LDAP_UTF8INC(s);
        }
    }

    return (s);
}

char *
next_word(char *s)
{
    if (s == NULL) {
        return (NULL);
    }

    while (!iswordbreak(s)) {
        LDAP_UTF8INC(s);
    }

    while (iswordbreak(s)) {
        if (*s == '\0') {
            return (NULL);
        } else {
            LDAP_UTF8INC(s);
        }
    }

    return (s);
}

char *
word_dup(char *w)
{
    char *s, *ret;
    char save;

    for (s = w; !iswordbreak(s); LDAP_UTF8INC(s))
        ; /* NULL */
    save = *s;
    *s = '\0';
    ret = slapi_ch_strdup(w);
    *s = save;

    return (ret);
}

#ifndef MAXPHONEMELEN
#define MAXPHONEMELEN 6
#endif

#if defined(SOUNDEX)

/* lifted from isode-8.0 */
char *
phonetic(char *s)
{
    char code, adjacent, ch;
    char *p;
    char **c;
    int i, cmax;
    char phoneme[MAXPHONEMELEN + 1];

    p = s;
    if (p == NULL || *p == '\0') {
        return (NULL);
    }

    adjacent = '0';
    phoneme[0] = TOUPPER(*p);

    phoneme[1] = '\0';
    for (i = 0; i < 99 && (!iswordbreak(p)); LDAP_UTF8INC(p)) {
        ch = TOUPPER(*p);

        code = '0';

        switch (ch) {
        case 'B':
        case 'F':
        case 'P':
        case 'V':
            code = (adjacent != '1') ? '1' : '0';
            break;
        case 'S':
        case 'C':
        case 'G':
        case 'J':
        case 'K':
        case 'Q':
        case 'X':
        case 'Z':
            code = (adjacent != '2') ? '2' : '0';
            break;
        case 'D':
        case 'T':
            code = (adjacent != '3') ? '3' : '0';
            break;
        case 'L':
            code = (adjacent != '4') ? '4' : '0';
            break;
        case 'M':
        case 'N':
            code = (adjacent != '5') ? '5' : '0';
            break;
        case 'R':
            code = (adjacent != '6') ? '6' : '0';
            break;
        default:
            adjacent = '0';
        }

        if (i == 0) {
            adjacent = code;
            i++;
        } else if (code != '0') {
            if (i == MAXPHONEMELEN)
                break;
            adjacent = phoneme[i] = code;
            i++;
        }
    }

    if (i > 0)
        phoneme[i] = '\0';

    return (slapi_ch_strdup(phoneme));
}

#else
#if defined(METAPHONE)

/*
 * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
 * author Gary A. Parker, with changes by Bernard Tiffany of the
 * University of Michigan, and more changes by Tim Howes of the
 * University of Michigan.
 */

/* Character coding array */
static char vsvfn[26] = {
    1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
    /* A   B  C   D  E  F  G   H  I  J  K  L  M  */
    2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
/* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */

/* Macros to access character coding array */
#define vowel(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 1) || /* AEIOU */                                                                       \
                  (((*(x) == 0xC3) && (*((x) + 1))) ? ((0x80 <= *((x) + 1) && *((x) + 1) < 0x87) ||                                              \
                                                       (0x88 <= *((x) + 1) && *((x) + 1) < 0x90) || (0x92 <= *((x) + 1) && *((x) + 1) < 0x97) || \
                                                       (0x98 <= *((x) + 1) && *((x) + 1) < 0x9D) || (0xA0 <= *((x) + 1) && *((x) + 1) < 0xA7) || \
                                                       (0xA8 <= *((x) + 1) && *((x) + 1) < 0xB0) || (0xB2 <= *((x) + 1) && *((x) + 1) < 0xB7) || \
                                                       (0xB8 <= *((x) + 1) && *((x) + 1) < 0xBD))                                                \
                                                    : 0) /* Latin-1 characters */)
/*
    case 0xC3:
*/
#define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2)           /* FJLMNR */
#define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4)         /* CGPST */
#define frontv(x) ((*(x) != '\0' && vsvfn[(*(x)) - 'A'] & 8) || /* EIY */                            \
                   (((*(x) == 0xC3) && (*((x) + 1))) ? ((0x88 <= *((x) + 1) && *((x) + 1) < 0x90) || \
                                                        (0xA8 <= *((x) + 1) && *((x) + 1) < 0xB0))   \
                                                     : 0) /* Latin-1 E/I */)
#define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */

char *
phonetic(char *Word)
{
    unsigned char *n_start, *n, *n_end; /* pointers to string */
    char *metaph_end;                   /* pointers to metaph */
    unsigned char ntrans[42];           /* word with uppercase letters */
    int KSflag;                         /* state flag for X -> KS */
    char buf[MAXPHONEMELEN + 2];
    char *Metaph;

    /*
     * Copy Word to internal buffer, dropping non-alphabetic characters
     * and converting to upper case
     */
    n = ntrans + 4;
    n_end = ntrans + 35;
    while (!iswordbreak(Word) && n < n_end) {
        if (isascii(*Word)) {
            if (isalpha(*Word)) {
                *n++ = TOUPPER(*Word);
            }
            ++Word;
        } else {
            auto const size_t len = LDAP_UTF8COPY((char *)n, Word);
            n += len;
            Word += len;
        }
    }
    Metaph = buf;
    *Metaph = '\0';
    if (n == ntrans + 4) {
        return (slapi_ch_strdup(buf)); /* Return if null */
    }
    n_end = n; /* Set n_end to end of string */

    /* ntrans[0] will always be == 0 */
    ntrans[0] = '\0';
    ntrans[1] = '\0';
    ntrans[2] = '\0';
    ntrans[3] = '\0';
    *n++ = 0;
    *n++ = 0;
    *n++ = 0;
    *n = 0;         /* Pad with nulls */
    n = ntrans + 4; /* Assign pointer to start */

    /* Check for PN, KN, GN, AE, WR, WH, and X at start */
    switch (*n) {
    case 'P':
    case 'K':
    case 'G':
        /* 'PN', 'KN', 'GN' becomes 'N' */
        if (*(n + 1) == 'N')
            *n++ = 0;
        break;
    case 'A':
        /* 'AE' becomes 'E' */
        if (*(n + 1) == 'E')
            *n++ = 0;
        break;
    case 'W':
        /* 'WR' becomes 'R', and 'WH' to 'H' */
        if (*(n + 1) == 'R')
            *n++ = 0;
        else if (*(n + 1) == 'H') {
            *n++ = 0;
        }
        break;
    case 'X':
        /* 'X' becomes 'S' */
        *n = 'S';
        break;
    case 0xC3:
        switch (*(n + 1)) {
        case 0x80:
        case 0x81:
        case 0x82:
        case 0x83:
        case 0x84:
        case 0x85:
            *n++ = 0;
            *n = 'A';
            break;
        case 0x87:
            *n++ = 0;
            *n = 'C';
            break;
        case 0x86:
        case 0x88:
        case 0x89:
        case 0x8A:
        case 0x8B:
            *n++ = 0;
            *n = 'E';
            break;
        case 0x8C:
        case 0x8D:
        case 0x8E:
        case 0x8F:
            *n++ = 0;
            *n = 'I';
            break;
        case 0x90: /* eth: TH */
            *n++ = 0;
            *n = '0';
            break;
        case 0x91:
            *n++ = 0;
            *n = 'N';
            break;
        case 0x92:
        case 0x93:
        case 0x94:
        case 0x95:
        case 0x96:
        case 0x98:
            *n++ = 0;
            *n = 'O';
            break;
        case 0x99:
        case 0x9A:
        case 0x9B:
        case 0x9C:
            *n++ = 0;
            *n = 'U';
            break;
        case 0x9D:
            *n++ = 0;
            *n = 'Y';
            break;
        case 0x9E:
            *n++ = 0;
            *n = '0'; /* thorn: TH */
            break;
        case 0x9F:
            *n++ = 0;
            *n = 's';
            break;
        case 0xA0:
        case 0xA1:
        case 0xA2:
        case 0xA3:
        case 0xA4:
        case 0xA5:
            *n++ = 0;
            *n = 'a';
            break;
        case 0xA6:
            *n++ = 0;
            *n = 'e';
            break;
        case 0xA7:
            *n++ = 0;
            *n = 'c';
            break;
        case 0xA8:
        case 0xA9:
        case 0xAA:
        case 0xAB:
            *n++ = 0;
            *n = 'e';
            break;
        case 0xAC:
        case 0xAD:
        case 0xAE:
        case 0xAF:
            *n++ = 0;
            *n = 'i';
            break;
        case 0xB0:
            *n++ = 0;
            *n = '0'; /* eth: th */
            break;
        case 0xB1:
            *n++ = 0;
            *n = 'n';
            break;
        case 0xB2:
        case 0xB3:
        case 0xB4:
        case 0xB5:
        case 0xB6:
        case 0xB8:
            *n++ = 0;
            *n = 'o';
            break;
        case 0xB9:
        case 0xBA:
        case 0xBB:
        case 0xBC:
            *n++ = 0;
            *n = 'u';
            break;
        case 0xBD:
        case 0xBF:
            *n++ = 0;
            *n = 'y';
            break;
        case 0xBE:
            *n++ = 0;
            *n = '0'; /* thorn: th */
            break;
        }
        break;
    }

    /*
     * Now, loop step through string, stopping at end of string or when
     * the computed 'metaph' is MAXPHONEMELEN characters long
     */

    KSflag = 0; /* state flag for KS translation */
    for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
         n <= n_end && Metaph < metaph_end; n++) {
        if (KSflag) {
            KSflag = 0;
            *Metaph++ = 'S';
        } else if (!isascii(*n)) {
            switch (*n) {
            case 0xC3:
                if (n + 1 <= n_end) {
                    switch (*(++n)) {
                    case 0x87: /* C with cedilla */
                    case 0x9F: /* ess-zed */
                    case 0xA7: /* c with cedilla */
                        *Metaph++ = 'S';
                        break;
                    case 0x90: /* eth: TH */
                    case 0x9E: /* thorn: TH */
                    case 0xB0: /* eth: th */
                    case 0xBE: /* thorn: th */
                        *Metaph++ = '0';
                        break;
                    case 0x91:
                    case 0xB1:
                        *Metaph++ = 'N';
                        break;
                    case 0x9D:
                    case 0xBD:
                    case 0xBF:
                        *Metaph++ = 'Y';
                        break;
                    default: /* skipping the rest */
                        break;
                    }
                }
                break;
            default:
                *Metaph++ = *n;
            }
        } else {
            /* Drop duplicates except for CC */
            if (*(n - 1) == *n && *n != 'C')
                continue;
            /* Check for F J L M N R or first letter vowel */
            if (same(*n) || (n == n_start && vowel(n))) {
                *Metaph++ = *n;
            } else {
                switch (*n) {
                case 'B':

                    /*
                     * B unless in -MB
                     */
                    if (n < (n_end - 1) && *(n - 1) != 'M') {
                        *Metaph++ = *n;
                    }
                    break;
                case 'C':

                    /*
                     * X if in -CIA-, -CH- else S if in
                     * -CI-, -CE-, -CY- else dropped if
                     * in -SCI-, -SCE-, -SCY- else K
                     */
                    if (*(n - 1) != 'S' || !frontv((n + 1))) {
                        if (*(n + 1) == 'I' && *(n + 2) == 'A') {
                            *Metaph++ = 'X';
                        } else if (frontv((n + 1))) {
                            *Metaph++ = 'S';
                        } else if (*(n + 1) == 'H') {
                            *Metaph++ = ((n == n_start && !vowel((n + 2))) || *(n - 1) == 'S')
                                            ? (char)'K'
                                            : (char)'X';
                        } else {
                            *Metaph++ = 'K';
                        }
                    }
                    break;
                case 'D':

                    /*
                     * J if in DGE or DGI or DGY else T
                     */
                    *Metaph++ = (*(n + 1) == 'G' && frontv((n + 2)))
                                    ? (char)'J'
                                    : (char)'T';
                    break;
                case 'G':

                    /*
                     * F if in -GH and not B--GH, D--GH,
                     * -H--GH, -H---GH else dropped if
                     * -GNED, -GN, -DGE-, -DGI-, -DGY-
                     * else J if in -GE-, -GI-, -GY- and
                     * not GG else K
                     */
                    if ((*(n + 1) != 'J' || vowel((n + 2))) &&
                        (*(n + 1) != 'N' || ((n + 1) < n_end &&
                                             (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
                        (*(n - 1) != 'D' || !frontv((n + 1))))
                        *Metaph++ = (frontv((n + 1)) &&
                                     *(n + 2) != 'G')
                                        ? (char)'G'
                                        : (char)'K';
                    else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
                             *(n - 4) != 'H')
                        *Metaph++ = 'F';
                    break;
                case 'H':

                    /*
                     * H if before a vowel and not after
                     * C, G, P, S, T else dropped
                     */
                    if (!varson(*(n - 1)) && (!vowel((n - 1)) ||
                                              vowel((n + 1))))
                        *Metaph++ = 'H';
                    break;
                case 'K':

                    /*
                     * dropped if after C else K
                     */
                    if (*(n - 1) != 'C')
                        *Metaph++ = 'K';
                    break;
                case 'P':

                    /*
                     * F if before H, else P
                     */
                    *Metaph++ = *(n + 1) == 'H' ? (char)'F' : (char)'P';
                    break;
                case 'Q':

                    /*
                     * K
                     */
                    *Metaph++ = 'K';
                    break;
                case 'S':

                    /*
                     * X in -SH-, -SIO- or -SIA- else S
                     */
                    *Metaph++ = (*(n + 1) == 'H' ||
                                 (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
                                                      *(n + 2) == 'A')))
                                    ? (char)'X'
                                    : (char)'S';
                    break;
                case 'T':

                    /*
                     * X in -TIA- or -TIO- else 0 (zero)
                     * before H else dropped if in -TCH-
                     * else T
                     */
                    if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
                                            *(n + 2) == 'A'))
                        *Metaph++ = 'X';
                    else if (*(n + 1) == 'H')
                        *Metaph++ = '0';
                    else if (*(n + 1) != 'C' || *(n + 2) != 'H')
                        *Metaph++ = 'T';
                    break;
                case 'V':

                    /*
                     * F
                     */
                    *Metaph++ = 'F';
                    break;
                case 'W':

                /*
                     * W after a vowel, else dropped
                     */
                case 'Y':

                    /*
                     * Y unless followed by a vowel
                     */
                    if (vowel((n + 1)))
                        *Metaph++ = *n;
                    break;
                case 'X':

                    /*
                     * KS
                     */
                    if (n == n_start)
                        *Metaph++ = 'S';
                    else {
                        *Metaph++ = 'K'; /* Insert K, then S */
                        KSflag = 1;
                    }
                    break;
                case 'Z':

                    /*
                     * S
                     */
                    *Metaph++ = 'S';
                    break;
                }
            }
        }
    }

    *Metaph = 0; /* Null terminate */
    return (slapi_ch_strdup(buf));
}

#endif /* METAPHONE */
#endif /* !SOUNDEX */