gnunetbib

Bibliography (BibTeX, based on AnonBib)
Log | Files | Refs | README | LICENSE

metaphone.py (4819B)


      1 #!/usr/bin/python2
      2 # Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.
      3 
      4 """metaphone.py -- Pure-python metaphone implementation.
      5 
      6    (This is not guaranteed to match the real metaphone algorithm; I
      7    haven't tested it thorougly enough.  Let me know if you find bugs.
      8 
      9    Based on the original C++ metaphone implementation.)
     10 """
     11 
     12 TRIPLES = {
     13     'dge': 'j',
     14     'dgi': 'j',
     15     'dgy': 'j',
     16     'sia': '+x',
     17     'sio': '+x',
     18     'tia': '+x',
     19     'tio': '+x',
     20     'tch': '',
     21     'tha': '0',
     22     'the': '0',
     23     'thi': '0',
     24     'tho': '0',
     25     'thu': '0',
     26     }
     27 
     28 DOUBLES = {
     29     'ph' : 'f',
     30     'sh' : 'x'
     31     }
     32 
     33 SINGLETONS = {
     34     'd': 't',
     35     'f': 'f',
     36     'j': 'j',
     37     'l': 'l',
     38     'm': 'm',
     39     'n': 'n',
     40     'r': 'r',
     41     'p': 'p',
     42     'q': 'k',
     43     'v': 'f',
     44     'x': 'ks',
     45     'z': 's',
     46 }
     47 
     48 ALLCHARS = "".join(map(chr, range(256)))
     49 NONLCCHARS = "".join([c for c in ALLCHARS if not c.islower()])
     50 def metaphone(s):
     51     """Return the metaphone equivalent of a provided string"""
     52     s = s.lower()
     53     s = s.translate(ALLCHARS, NONLCCHARS)
     54 
     55     if not s: return ""
     56 
     57     # If ae, gn, kn, pn, wr then drop the first letter.
     58     if s[:2] in ("ae", "gn", "kn", "pn", "wr"):
     59         s = s[1:]
     60 
     61     # Change "x" to "s"
     62     if s[0] == 'x':
     63         s = "s%s" % s[1:]
     64 
     65     # Get rid of "h" in "wh".
     66     if s[:2] == 'wh':
     67         s = "w%s" % s[1:]
     68 
     69     # Get rid of s from end.
     70     if s[-1] == 's':
     71         s = s[:-1]
     72 
     73     result = []
     74     prevLtr = ' '
     75     vowelBefore = 0
     76     lastChar = len(s)-1
     77     for idx in range(len(s)):
     78         curLtr = s[idx]
     79         # If first char is a vowel, keep it.
     80         if curLtr in "aeiou":
     81             if idx == 0:
     82                 result.append(curLtr)
     83             continue
     84 
     85         # Skip double letters.
     86         if idx < lastChar:
     87             if curLtr == s[idx+1]:
     88                 continue
     89 
     90         try:
     91             r = TRIPLES[s[idx:idx+3]]
     92             if r == "+x":
     93                 if idx > 1:
     94                     result.append("x")
     95                     continue
     96             else:
     97                 result.append(r)
     98                 continue
     99         except KeyError:
    100             pass
    101         try:
    102             r = DOUBLES[s[idx:idx+2]]
    103             result.append(r)
    104             continue
    105         except KeyError:
    106             pass
    107         try:
    108             r = SINGLETONS[s[idx]]
    109             result.append(r)
    110             continue
    111         except KeyError:
    112             pass
    113 
    114         if idx > 0:
    115             prevLtr = s[idx-1]
    116             vowelBefore = prevLtr in "aeiou"
    117         curLtr = s[idx]
    118 
    119         nextLtr2 = ' '
    120         if idx < lastChar:
    121             nextLtr = s[idx+1]
    122             vowelAfter = nextLtr in "aeiou"
    123             frontvAfter = nextLtr in "eiy"
    124             if idx+1 < lastChar:
    125                 nextLtr2 = s[idx+2]
    126         else:
    127             nextLtr = ' '
    128             vowelAfter = frontvAfter = 0
    129 
    130 
    131         if curLtr == 'b':
    132             if idx == lastChar and prevLtr == 'm':
    133                 pass
    134             else:
    135                 result.append(curLtr)
    136         elif curLtr == 'c':
    137             # silent 'sci', 'sce, 'scy', 'sci', etc OK.
    138             if not (prevLtr == 's' and frontvAfter):
    139                 if nextLtr in 'ia':
    140                     result.append("x")
    141                 elif frontvAfter:
    142                     result.append("s")
    143                 elif prevLtr == 's' and nextLtr == 'h':
    144                     result.append('k')
    145                 elif nextLtr == 'h':
    146                     if idx == 0 and nextLtr2 in "aeiou":
    147                         result.append('k')
    148                     else:
    149                         result.append('x')
    150                 elif prevLtr == 'c':
    151                     result.append('c')
    152                 else:
    153                     result.append('k')
    154         elif curLtr == 'g':
    155             if (idx < lastChar-1) and nextLtr == 'h':
    156                 pass
    157             elif s[idx:] == 'gned':
    158                 pass
    159             elif s[idx:] == 'gn':
    160                 pass
    161             elif prevLtr == 'd' and frontvAfter:
    162                 pass
    163             else:
    164                 hard = (prevLtr == 'g')
    165                 if frontvAfter and not hard:
    166                     result.append('j')
    167                 else:
    168                     result.append('k')
    169         elif curLtr == 'h':
    170             if prevLtr in 'csptg':
    171                 pass
    172             elif vowelBefore and not vowelAfter:
    173                 pass
    174             else:
    175                 result.append('h')
    176         elif curLtr == 'k':
    177             if prevLtr != 'c': result.append('k')
    178         elif curLtr in 'wy':
    179             if vowelAfter:
    180                 result.append(curLtr)
    181 
    182     return "".join(result)
    183 
    184 def demo(a):
    185     print a, "=>", metaphone(a)
    186 
    187 if __name__ == '__main__':
    188     demo("Nick. Mathewson")
    189 
    190     demo("joe schmidt")
    191     demo("Beethoven")
    192 
    193     demo("Because the world is round")