aboutsummaryrefslogtreecommitdiffstats
path: root/lev.pyx
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2014-02-27 11:29:59 -0500
committerThibaut Horel <thibaut.horel@gmail.com>2014-02-27 11:29:59 -0500
commit76054347cffacd7a4a6759f2187717e185d8082c (patch)
treec5820b92e6d67019f9732813f4e4cf7f56412f65 /lev.pyx
parent31ca595e875dd4e2cb07b2d3610b0d4b4f590abe (diff)
downloadocr-layer-curation-76054347cffacd7a4a6759f2187717e185d8082c.tar.gz
Cython implementation of SW algorithm
Diffstat (limited to 'lev.pyx')
-rw-r--r--lev.pyx79
1 files changed, 79 insertions, 0 deletions
diff --git a/lev.pyx b/lev.pyx
new file mode 100644
index 0000000..5e4dd52
--- /dev/null
+++ b/lev.pyx
@@ -0,0 +1,79 @@
+from libc.stdlib cimport malloc, free
+from Levenshtein import distance as levenshtein
+
+cdef unicode join_words(list l):
+ if len(l) >= 2 and l[-2][-1] == "-":
+ l[-2] = l[-2][:-1]
+
+ return "".join(l)
+
+def align(list l1, list l2):
+ """Compute the optimal alignment between two list of words
+ à la Needleman-Wunsch.
+
+ The function returns a (score, alignment) pair. An alignment is simply
+ a list of size len(l1) giving for each word in l1, the index of the word in
+ l2 it maps to (or -1 if the word maps to nothing).
+
+ Note that we also allow the index to be a tuple when a word in l1 maps to
+ a sequence of words in l2. Conversly, consecutive words in l1 can map to
+ the same word in l2.
+ """
+
+ # Throughout the function, l1 is to be thought of as the proofread text,
+ # and l2 as the OCR text. The deletion costs are not symmetric: removing
+ # junk from the OCR is frequent while removing a word from the proofread
+ # text should be rare.
+ cdef int del_cost1, del_cost2, n, m, i, j, k, l, s, d, min_s, w
+ cdef list b, temp
+ del_cost1 = 30
+ del_cost2 = 2
+ w = 1 # multiplicative cost factor for the Levenshtein distance
+
+ n, m = len(l1), len(l2)
+ # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
+ # pair of the first i words of l1 to the first j words of l2
+ a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]
+
+ for j in xrange(1, m + 1):
+ a[0][j] = 0, []
+
+ for i in xrange(1, n + 1):
+ a[i][0] = del_cost1 * i, [-1] * i
+
+ for j in xrange(1, m + 1):
+
+ s, b = a[i-1][j-1]
+ d = levenshtein(l1[i-1], l2[j-1])
+ min_s = s + w * d
+ min_b = b + [j-1]
+
+ s, b = a[i-1][j]
+ if s + del_cost1 < min_s:
+ min_s = s + del_cost1
+ min_b = b + [-1]
+
+ s, b = a[i][j-1]
+ if s + del_cost2 < min_s:
+ min_s = s + del_cost2
+ min_b = b
+
+ for k in xrange(1, 5):
+ for l in xrange(1, 5):
+ if k + l <= 2:
+ continue
+ if k+l > 6:
+ break
+ if j < l or i < k:
+ break
+ s, b = a[i-k][j-l]
+ d = levenshtein(join_words(l1[i-k:i]),
+ join_words(l2[j-l:j]))
+ if s + w * d < min_s:
+ temp = [j-1] if l == 1 else [tuple(range(j-l, j))]
+ min_s = s + w *d
+ min_b = b + temp*k
+
+ a[i][j] = min_s, min_b
+
+ return a[n][m]