diff options
Diffstat (limited to 'metaphone.py')
| -rw-r--r-- | metaphone.py | 190 |
1 files changed, 190 insertions, 0 deletions
#!/usr/bin/python2
"""Simplified Metaphone-style phonetic encoding of words.

The public entry point is metaphone(s), which maps a string to a short
code made of lowercase letters and the digit "0" (used for "th").  The
encoding is table driven (TRIPLES, DOUBLES, SINGLETONS) with a handful
of context-sensitive rules for b, c, g, h, k, w and y.
"""

import string

# Three-letter sequences and the code emitted at their first letter.
# The marker "+x" means: emit "x", but only when the sequence starts at
# index 2 or later; otherwise fall through to the other tables.
# 'tch' maps to the empty string, i.e. the "t" is silent.
TRIPLES = {
    'dge': 'j',
    'dgi': 'j',
    'dgy': 'j',
    'sia': '+x',
    'sio': '+x',
    'tia': '+x',
    'tio': '+x',
    'tch': '',
    'tha': '0',
    'the': '0',
    'thi': '0',
    'tho': '0',
    'thu': '0',
    }

# Two-letter sequences and the code emitted at their first letter.
DOUBLES = {
    'ph': 'f',
    'sh': 'x',
    }

# Letters whose code does not depend on their neighbours.
# 's' and 't' are listed so that they produce a code when none of the
# longer sequences above matched; without them they were silently lost.
SINGLETONS = {
    'd': 't',
    'f': 'f',
    'j': 'j',
    'l': 'l',
    'm': 'm',
    'n': 'n',
    'r': 'r',
    'p': 'p',
    'q': 'k',
    's': 's',
    't': 't',
    'v': 'f',
    'x': 'ks',
    'z': 's',
    }

# Retained for backward compatibility with code importing them;
# metaphone() itself no longer relies on these tables.
ALLCHARS = "".join(map(chr, range(256)))
NONLCCHARS = "".join([c for c in ALLCHARS if not c.islower()])

_LOWERCASE = frozenset(string.ascii_lowercase)
_VOWELS = "aeiou"
_FRONT_VOWELS = "eiy"


def _normalize(s):
    """Lowercase s, keep only a-z, and apply the word-boundary rules."""
    # Filtering by membership works for both byte and text strings,
    # unlike the two-argument str.translate() form.
    s = "".join([c for c in s.lower() if c in _LOWERCASE])
    if not s:
        return ""

    # If ae, gn, kn, pn, wr then drop the first letter.
    if s[:2] in ("ae", "gn", "kn", "pn", "wr"):
        s = s[1:]

    # Change a leading "x" to "s".
    if s[0] == 'x':
        s = "s" + s[1:]

    # Get rid of "h" in a leading "wh".
    if s[:2] == 'wh':
        s = "w" + s[2:]

    # Get rid of "s" from the end.
    if s[-1] == 's':
        s = s[:-1]

    return s


def _table_code(s, idx):
    """Return the table-driven code for s[idx], or None if no table applies."""
    code = TRIPLES.get(s[idx:idx + 3])
    if code == "+x":
        if idx > 1:
            return "x"
        # Too close to the start of the word: try the shorter tables.
    elif code is not None:
        return code

    code = DOUBLES.get(s[idx:idx + 2])
    if code is not None:
        return code

    return SINGLETONS.get(s[idx])


def _context_code(s, idx):
    """Return the code for a context-sensitive letter at s[idx] ('' if silent)."""
    last = len(s) - 1
    cur = s[idx]
    # A blank stands for "no such letter" at either edge of the word.
    prev = s[idx - 1] if idx > 0 else ' '
    nxt = s[idx + 1] if idx < last else ' '
    nxt2 = s[idx + 2] if idx + 1 < last else ' '
    vowel_before = prev in _VOWELS
    vowel_after = nxt in _VOWELS
    front_after = nxt in _FRONT_VOWELS

    if cur == 'b':
        # Silent in a word-final "mb".
        if idx == last and prev == 'm':
            return ''
        return 'b'

    if cur == 'c':
        # Silent in 'sci', 'sce', 'scy'.
        if prev == 's' and front_after:
            return ''
        # "cia" is a two-letter lookahead, not "c followed by i or a".
        if nxt == 'i' and nxt2 == 'a':
            return 'x'
        if front_after:
            return 's'
        if nxt == 'h':
            if prev == 's':
                return 'k'
            # NOTE(review): classic Metaphone maps an initial "ch" to k
            # before a NON-vowel; this test may be inverted -- confirm.
            if idx == 0 and nxt2 in _VOWELS:
                return 'k'
            return 'x'
        if prev == 'c':
            # NOTE(review): emits a literal 'c' for the second letter of
            # "cc"; 'k' may have been intended -- confirm.
            return 'c'
        return 'k'

    if cur == 'g':
        silent = ((idx < last - 1 and nxt == 'h')
                  or s[idx:] in ('gned', 'gn')
                  or (prev == 'd' and front_after))
        if silent:
            return ''
        if front_after and prev != 'g':
            return 'j'
        return 'k'

    if cur == 'h':
        if prev in 'csptg':
            return ''
        if vowel_before and not vowel_after:
            return ''
        return 'h'

    if cur == 'k':
        # "ck" was already encoded at the 'c'.
        if prev == 'c':
            return ''
        return 'k'

    if cur in 'wy':
        if vowel_after:
            return cur
        return ''

    return ''


def metaphone(s):
    """Return a phonetic code for the string s.

    s is lowercased and every character other than a-z is dropped
    before encoding, so "Nick. Mathewson" is encoded as the single word
    "nickmathewson".  Returns "" when nothing is left to encode.

    A vowel is kept only as the first letter of the word.  The first
    letter of a doubled pair is skipped.  Each remaining letter is
    encoded through TRIPLES, DOUBLES and SINGLETONS (in that order) or,
    failing those, through the context rules for b, c, g, h, k, w, y.

    NOTE(review): "th" is encoded as "0" only when followed by a vowel
    (see TRIPLES); elsewhere the "t" is encoded on its own.
    """
    s = _normalize(s)
    last = len(s) - 1
    result = []
    for idx, cur in enumerate(s):
        # If the first char is a vowel, keep it; drop all other vowels.
        if cur in _VOWELS:
            if idx == 0:
                result.append(cur)
            continue

        # Skip the first letter of a doubled pair.
        if idx < last and cur == s[idx + 1]:
            continue

        code = _table_code(s, idx)
        if code is None:
            code = _context_code(s, idx)
        result.append(code)

    return "".join(result)


def demo(a):
    """Print a and its metaphone code as "a => code"."""
    print("%s => %s" % (a, metaphone(a)))


# Kept from the original module: prints the module name on import/run.
print(__name__)
if __name__ == '__main__':
    demo("Nick. Mathewson")

    demo("joe schmidt")
    demo("Beethoven")

    demo("Andrea Plaid")
