# string_utils.py -- path: root/string_utils.py, blob 8b7a3a3a10b36eb6924637046901c8bb59ad708d
# -*- coding: utf-8 -*-

def levenshtein(word1, word2):
    """Return the Levenshtein (edit) distance between word1 and word2.

    The result is a single integer: the minimal total number of
    substitutions, insertions and deletions needed to transform word1 into
    word2.  (It is NOT broken down per operation type.)

    Dynamic programming implementation storing only two rows of the full matrix
    at a time.

    word1 and word2 may be any sequences supporting len(), indexing and
    equality comparison (typically strings).

    TODO: write this in a Cython module.
    """

    s, t = len(word1), len(word2)
    if word1 == word2:
        return 0
    if not min(s, t):
        # One of the words is empty: the distance is the length of the other.
        return max(s, t)

    # previous[j] is d(i, j); initially i == 0, so d(0, j) == j.
    previous = list(range(t + 1))
    current = [0] * (t + 1)
    for i in range(s):
        # Compute current[j] = d(i+1, j) for all j from previous[j] = d(i, j).
        current[0] = i + 1

        for j in range(t):
            diff = int(word1[i] != word2[j])
            current[j+1] = min(current[j] + 1,      # insertion
                               previous[j+1] + 1,   # deletion
                               previous[j] + diff)  # substitution or match

        # Swap the rows instead of copying: the old "previous" row is fully
        # overwritten during the next iteration.
        previous, current = current, previous

    # After the final swap the last computed row lives in "previous".
    return previous[t]

def cut(word, left, right):
    """Return pair of strings (p + "-", s) such that p+s == word and
    L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
    distance.

    Every split position from 0 to len(word) inclusive is tried; when several
    positions reach the same minimal cost, the smallest one wins.

    Implementation is suboptimal since the computation of the Levenshtein
    distances will involve comparing the same segments repeatedly.

    TODO: handle the case when word contains a hyphen (e.g. c'est-à-dire)
    """

    def cost(i):
        # Total edit distance obtained when splitting word at position i.
        return (levenshtein(word[:i] + "-", left)
                + levenshtein(word[i:], right))

    # min() returns the first minimal item, so ties go to the lowest index.
    best = min(range(len(word) + 1), key=cost)
    return word[:best] + "-", word[best:]

def LCS(X, Y):
    """Return the longest-common-subsequence length table for X and Y.

    The result is a list of len(X)+1 rows, each holding len(Y)+1 integers.
    Entry [i][j] is the length of the longest common subsequence of the
    prefixes X[:i] and Y[:j]; row 0 and column 0 are therefore all zeros.
    """
    width = len(Y) + 1
    table = [[0] * width for _ in range(len(X) + 1)]
    for row, x_item in enumerate(X, 1):
        above = table[row - 1]
        here = table[row]
        for col, y_item in enumerate(Y, 1):
            if x_item == y_item:
                # Matching items extend the best result of both shorter prefixes.
                here[col] = above[col - 1] + 1
            else:
                # Otherwise keep the better of dropping one item from Y or X.
                here[col] = max(here[col - 1], above[col])
    return table

def printDiff(C, X, Y, i, j):
    """Print a line-by-line diff of X[:i] against Y[:j].

    C is indexed as C[i][j] and is expected to be the table computed by
    LCS(X, Y); X and Y are sequences of strings (each item is concatenated
    to a two-character marker).  To diff the full sequences, pass
    i = len(X) and j = len(Y).

    One line is printed per item, in sequence order:
      "  item"  item common to both sequences
      "+ item"  item present only in Y
      "- item"  item present only in X

    The backtracking is done iteratively, so long inputs do not hit the
    interpreter's recursion limit.
    """
    # Walk from (i, j) back to (0, 0); lines are collected last-to-first.
    lines = []
    while i > 0 or j > 0:
        if i > 0 and j > 0 and X[i-1] == Y[j-1]:
            lines.append("  " + X[i-1])
            i -= 1
            j -= 1
        elif j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
            lines.append("+ " + Y[j-1])
            j -= 1
        else:
            # Here i > 0, and either j == 0 or C[i][j-1] < C[i-1][j].
            lines.append("- " + X[i-1])
            i -= 1

    for line in reversed(lines):
        # Parenthesised single-argument form prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(line)