From 76054347cffacd7a4a6759f2187717e185d8082c Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Thu, 27 Feb 2014 11:29:59 -0500 Subject: Cython implementation of SW algorithm --- lev.pyx | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 lev.pyx (limited to 'lev.pyx') diff --git a/lev.pyx b/lev.pyx new file mode 100644 index 0000000..5e4dd52 --- /dev/null +++ b/lev.pyx @@ -0,0 +1,79 @@ +from libc.stdlib cimport malloc, free +from Levenshtein import distance as levenshtein + +cdef unicode join_words(list l): + if len(l) >= 2 and l[-2][-1] == "-": + l[-2] = l[-2][:-1] + + return "".join(l) + +def align(list l1, list l2): + """Compute the optimal alignment between two list of words + à la Needleman-Wunsch. + + The function returns a (score, alignment) pair. An alignment is simply + a list of size len(l1) giving for each word in l1, the index of the word in + l2 it maps to (or -1 if the word maps to nothing). + + Note that we also allow the index to be a tuple when a word in l1 maps to + a sequence of words in l2. Conversly, consecutive words in l1 can map to + the same word in l2. + """ + + # Throughout the function, l1 is to be thought of as the proofread text, + # and l2 as the OCR text. The deletion costs are not symmetric: removing + # junk from the OCR is frequent while removing a word from the proofread + # text should be rare. + cdef int del_cost1, del_cost2, n, m, i, j, k, l, s, d, min_s, w + cdef list b, temp + del_cost1 = 30 + del_cost2 = 2 + w = 1 # multiplicative cost factor for the Levenshtein distance + + n, m = len(l1), len(l2) + # a is the (score, alignment) matrix. a[i][j] is the (score, alignment) + # pair of the first i words of l1 to the first j words of l2 + a = [[(0, [])] * (m + 1) for i in xrange(n + 1)] + + for j in xrange(1, m + 1): + a[0][j] = 0, [] + + for i in xrange(1, n + 1): + a[i][0] = del_cost1 * i, [-1] * i + + for j in xrange(1, m + 1): + + s, b = a[i-1][j-1] + d = levenshtein(l1[i-1], l2[j-1]) + min_s = s + w * d + min_b = b + [j-1] + + s, b = a[i-1][j] + if s + del_cost1 < min_s: + min_s = s + del_cost1 + min_b = b + [-1] + + s, b = a[i][j-1] + if s + del_cost2 < min_s: + min_s = s + del_cost2 + min_b = b + + for k in xrange(1, 5): + for l in xrange(1, 5): + if k + l <= 2: + continue + if k+l > 6: + break + if j < l or i < k: + break + s, b = a[i-k][j-l] + d = levenshtein(join_words(l1[i-k:i]), + join_words(l2[j-l:j])) + if s + w * d < min_s: + temp = [j-1] if l == 1 else [tuple(range(j-l, j))] + min_s = s + w *d + min_b = b + temp*k + + a[i][j] = min_s, min_b + + return a[n][m] -- cgit v1.2.3-70-g09d2