1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# -*- coding: utf-8 -*-
def levenshtein(word1, word2):
"""Return triplet of number of (substitutions, insertions, deletions) to
transform word1 into word2.
Dynamic programming implementation storing only two rows of the full matrix
at a time.
TODO: write this in a Cython module.
"""
s, t = len(word1), len(word2)
if word1 == word2:
return 0
if not min(s, t):
return max(s, t)
v0 = [i for i in xrange(t + 1)] # v0[i] is d(0, i)
v1 = [0] * (t + 1)
for i in xrange(s):
# v0[j] = d(i, j) for all j and we compute v1[j] = d(i+1, j) for all j
v1[0] = i + 1
for j in xrange(t):
diff = int(word1[i] != word2[j])
v1[j+1] = min(v1[j] + 1, v0[j+1] + 1, v0[j] + diff)
v0 = list(v1) # copy v1 into v0 for the next iteration
return v1[t]
def cut(word, left, right):
"""Return pair of strings (p + "-", s) such that p+s == word and
L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
distance.
Implementation is suboptimal since the computation of the Levenshtein
distances will involve comparing the same segments repeatedly.
TODO: handle the case when word contains an hyphen (e.g. c'est-à-dire)
"""
def aux(i):
leftw, rightw = word[:i] + "-", word[i:]
return (leftw, rightw,
levenshtein(leftw, left) + levenshtein(rightw, right))
l = [aux(i) for i in xrange(len(word) + 1)]
return min(l, key=lambda x: x[2])[:2]
def LCS(X, Y):
m = len(X)
n = len(Y)
# An (m+1) times (n+1) matrix
C = [[0] * (n+1) for i in range(m+1)]
for i in range(1, m+1):
for j in range(1, n+1):
if X[i-1] == Y[j-1]:
C[i][j] = C[i-1][j-1] + 1
else:
C[i][j] = max(C[i][j-1], C[i-1][j])
return C
def printDiff(C, X, Y, i, j):
if i > 0 and j > 0 and X[i-1] == Y[j-1]:
printDiff(C, X, Y, i-1, j-1)
print " " + X[i-1]
else:
if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
printDiff(C, X, Y, i, j-1)
print "+ " + Y[j-1]
elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
printDiff(C, X, Y, i-1, j)
print "- " + X[i-1]
|