lev.pyx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

from libc.stdlib cimport malloc, free
from Levenshtein import distance as levenshtein

cdef unicode join_words(list l):
    if len(l) >= 2 and l[-2][-1] == "-":
        l[-2] = l[-2][:-1]

    return "".join(l)

def align(list l1, list l2):
    """Compute the optimal alignment between two list of words
    à la Needleman-Wunsch.

    The function returns a (score, alignment) pair. An alignment is simply
    a list of size len(l1) giving for each word in l1, the index of the word in
    l2 it maps to (or -1 if the word maps to nothing).

    Note that we also allow the index to be a tuple when a word in l1 maps to
    a sequence of words in l2. Conversly, consecutive words in l1 can map to
    the same word in l2.
    """

    # Throughout the function, l1 is to be thought of as the proofread text,
    # and l2 as the OCR text. The deletion costs are not symmetric: removing
    # junk from the OCR is frequent while removing a word from the proofread
    # text should be rare.
    cdef int del_cost1, del_cost2, n, m, i, j, k, l, s, d, min_s, w
    cdef list b, temp
    del_cost1 = 30
    del_cost2 = 2
    w = 1 # multiplicative cost factor for the Levenshtein distance

    n, m = len(l1), len(l2)
    # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
    # pair of the first i words of l1 to the first j words of l2
    a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]

    for j in xrange(1, m + 1):
        a[0][j] = 0, []

    for i in xrange(1, n + 1):
        a[i][0] = del_cost1 * i, [-1] * i

        for j in xrange(1, m + 1):

            s, b = a[i-1][j-1]
            d = levenshtein(l1[i-1], l2[j-1])
            min_s = s + w * d
            min_b  = b + [j-1]

            s, b = a[i-1][j]
            if s + del_cost1 < min_s:
                min_s = s + del_cost1
                min_b = b + [-1]

            s, b = a[i][j-1]
            if s + del_cost2 < min_s:
                min_s = s + del_cost2
                min_b = b

            for k in xrange(1, 5):
                for l in xrange(1, 5):
                    if k + l <= 2:
                        continue
                    if k+l > 6:
                        break
                    if j < l or i < k:
                        break
                    s, b = a[i-k][j-l]
                    d = levenshtein(join_words(l1[i-k:i]),
                                    join_words(l2[j-l:j]))
                    if s + w * d < min_s:
                        temp = [j-1] if l == 1 else [tuple(range(j-l, j))]
                        min_s = s + w *d
                        min_b =  b + temp*k

            a[i][j] = min_s, min_b

    return a[n][m]