metaphone.py (4819B)
1 #!/usr/bin/python2 2 # Copyright 2003-2008, Nick Mathewson. See LICENSE for licensing info. 3 4 """metaphone.py -- Pure-python metaphone implementation. 5 6 (This is not guaranteed to match the real metaphone algorithm; I 7 haven't tested it thorougly enough. Let me know if you find bugs. 8 9 Based on the original C++ metaphone implementation.) 10 """ 11 12 TRIPLES = { 13 'dge': 'j', 14 'dgi': 'j', 15 'dgy': 'j', 16 'sia': '+x', 17 'sio': '+x', 18 'tia': '+x', 19 'tio': '+x', 20 'tch': '', 21 'tha': '0', 22 'the': '0', 23 'thi': '0', 24 'tho': '0', 25 'thu': '0', 26 } 27 28 DOUBLES = { 29 'ph' : 'f', 30 'sh' : 'x' 31 } 32 33 SINGLETONS = { 34 'd': 't', 35 'f': 'f', 36 'j': 'j', 37 'l': 'l', 38 'm': 'm', 39 'n': 'n', 40 'r': 'r', 41 'p': 'p', 42 'q': 'k', 43 'v': 'f', 44 'x': 'ks', 45 'z': 's', 46 } 47 48 ALLCHARS = "".join(map(chr, range(256))) 49 NONLCCHARS = "".join([c for c in ALLCHARS if not c.islower()]) 50 def metaphone(s): 51 """Return the metaphone equivalent of a provided string""" 52 s = s.lower() 53 s = s.translate(ALLCHARS, NONLCCHARS) 54 55 if not s: return "" 56 57 # If ae, gn, kn, pn, wr then drop the first letter. 58 if s[:2] in ("ae", "gn", "kn", "pn", "wr"): 59 s = s[1:] 60 61 # Change "x" to "s" 62 if s[0] == 'x': 63 s = "s%s" % s[1:] 64 65 # Get rid of "h" in "wh". 66 if s[:2] == 'wh': 67 s = "w%s" % s[1:] 68 69 # Get rid of s from end. 70 if s[-1] == 's': 71 s = s[:-1] 72 73 result = [] 74 prevLtr = ' ' 75 vowelBefore = 0 76 lastChar = len(s)-1 77 for idx in range(len(s)): 78 curLtr = s[idx] 79 # If first char is a vowel, keep it. 80 if curLtr in "aeiou": 81 if idx == 0: 82 result.append(curLtr) 83 continue 84 85 # Skip double letters. 86 if idx < lastChar: 87 if curLtr == s[idx+1]: 88 continue 89 90 try: 91 r = TRIPLES[s[idx:idx+3]] 92 if r == "+x": 93 if idx > 1: 94 result.append("x") 95 continue 96 else: 97 result.append(r) 98 continue 99 except KeyError: 100 pass 101 try: 102 r = DOUBLES[s[idx:idx+2]] 103 result.append(r) 104 continue 105 except KeyError: 106 pass 107 try: 108 r = SINGLETONS[s[idx]] 109 result.append(r) 110 continue 111 except KeyError: 112 pass 113 114 if idx > 0: 115 prevLtr = s[idx-1] 116 vowelBefore = prevLtr in "aeiou" 117 curLtr = s[idx] 118 119 nextLtr2 = ' ' 120 if idx < lastChar: 121 nextLtr = s[idx+1] 122 vowelAfter = nextLtr in "aeiou" 123 frontvAfter = nextLtr in "eiy" 124 if idx+1 < lastChar: 125 nextLtr2 = s[idx+2] 126 else: 127 nextLtr = ' ' 128 vowelAfter = frontvAfter = 0 129 130 131 if curLtr == 'b': 132 if idx == lastChar and prevLtr == 'm': 133 pass 134 else: 135 result.append(curLtr) 136 elif curLtr == 'c': 137 # silent 'sci', 'sce, 'scy', 'sci', etc OK. 138 if not (prevLtr == 's' and frontvAfter): 139 if nextLtr in 'ia': 140 result.append("x") 141 elif frontvAfter: 142 result.append("s") 143 elif prevLtr == 's' and nextLtr == 'h': 144 result.append('k') 145 elif nextLtr == 'h': 146 if idx == 0 and nextLtr2 in "aeiou": 147 result.append('k') 148 else: 149 result.append('x') 150 elif prevLtr == 'c': 151 result.append('c') 152 else: 153 result.append('k') 154 elif curLtr == 'g': 155 if (idx < lastChar-1) and nextLtr == 'h': 156 pass 157 elif s[idx:] == 'gned': 158 pass 159 elif s[idx:] == 'gn': 160 pass 161 elif prevLtr == 'd' and frontvAfter: 162 pass 163 else: 164 hard = (prevLtr == 'g') 165 if frontvAfter and not hard: 166 result.append('j') 167 else: 168 result.append('k') 169 elif curLtr == 'h': 170 if prevLtr in 'csptg': 171 pass 172 elif vowelBefore and not vowelAfter: 173 pass 174 else: 175 result.append('h') 176 elif curLtr == 'k': 177 if prevLtr != 'c': result.append('k') 178 elif curLtr in 'wy': 179 if vowelAfter: 180 result.append(curLtr) 181 182 return "".join(result) 183 184 def demo(a): 185 print a, "=>", metaphone(a) 186 187 if __name__ == '__main__': 188 demo("Nick. Mathewson") 189 190 demo("joe schmidt") 191 demo("Beethoven") 192 193 demo("Because the world is round")