aboutsummaryrefslogtreecommitdiffstats
path: root/string_utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'string_utils.py')
-rw-r--r--string_utils.py27
1 files changed, 13 insertions, 14 deletions
diff --git a/string_utils.py b/string_utils.py
index 186f2eb..55e0428 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -65,7 +65,7 @@ def join_ocr_words(l, c):
def join_words(l):
return "".join(l)
-def align(l1, l2, c2):
+def align(l1, l2, c2, del1, del2, join1, join2, backtrack1, bactrack2):
"""Compute the optimal alignment between two list of words
à la Needleman-Wunsch.
@@ -82,9 +82,6 @@ def align(l1, l2, c2):
# and l2 as the OCR text. The deletion costs are not symmetric: removing
# junk from the OCR is frequent while removing a word from the proofread
# text should be rare.
- del_cost1 = 50
- def del_cost2(w):
- return 1+3*len([c for c in w if c.isalnum()])
w = 3 # multiplicative cost factor for the Levenshtein distance
n, m = len(l1), len(l2)
@@ -94,9 +91,11 @@ def align(l1, l2, c2):
for j in xrange(1, m + 1):
a[0][j] = j, []
-
+
+ delcost =
for i in xrange(1, n + 1):
- a[i][0] = i * del_cost1, [[]] * i
+ delcost += del_cost1(l1[i], 0)
+ a[i][0] = delcost, [[]] * i
for j in xrange(1, m + 1):
@@ -105,24 +104,24 @@ def align(l1, l2, c2):
min_s, min_b = s + w * d, b + [[j-1]]
s, b = a[i-1][j]
- if s + del_cost1 < min_s:
+ if s + del1(l1[i-1], j) < min_s:
min_s, min_b = s + del_cost1, b + [[]]
s, b = a[i][j-1]
- if s + del_cost2(l2[j-1]) < min_s:
- min_s, min_b = s + del_cost2(l2[j-1]), b
+ if s + del2(l2[j-1], i) < min_s:
+ min_s, min_b = s + del2(l2[j-1], i), b
- for k in xrange(1, 8):
- for l in xrange(1, 5):
+ for k in xrange(1, backtrack1):
+ for l in xrange(1, backtrack2):
if k + l <= 2:
continue
- if k+l > 7:
+ if k+l > (backtrack1+backtrack2)/2.:
break
if j < l or i < k:
break
s, b = a[i-k][j-l]
- d = levenshtein(join_words(l1[i-k:i]),
- join_ocr_words(l2[j-l:j], c2[j-l:j]))
+ d = levenshtein(join1(l1[i-k:i]),
+ join2(l2[j-l:j], c2[j-l:j]))
if s + w * d < min_s:
temp = [[j-1]] if l == 1 else [range(j-l, j)]
min_s, min_b = s + w * d, b + temp * k