From 8dd400ab39e84afc13afba3acd15aa5f6918f03f Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Wed, 21 Aug 2013 17:35:51 -0400 Subject: begin refactoring --- string_utils.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'string_utils.py') diff --git a/string_utils.py b/string_utils.py index 186f2eb..55e0428 100644 --- a/string_utils.py +++ b/string_utils.py @@ -65,7 +65,7 @@ def join_ocr_words(l, c): def join_words(l): return "".join(l) -def align(l1, l2, c2): +def align(l1, l2, c2, del1, del2, join1, join2, backtrack1, bactrack2): """Compute the optimal alignment between two list of words à la Needleman-Wunsch. @@ -82,9 +82,6 @@ def align(l1, l2, c2): # and l2 as the OCR text. The deletion costs are not symmetric: removing # junk from the OCR is frequent while removing a word from the proofread # text should be rare. - del_cost1 = 50 - def del_cost2(w): - return 1+3*len([c for c in w if c.isalnum()]) w = 3 # multiplicative cost factor for the Levenshtein distance n, m = len(l1), len(l2) @@ -94,9 +91,11 @@ def align(l1, l2, c2): for j in xrange(1, m + 1): a[0][j] = j, [] - + + delcost = for i in xrange(1, n + 1): - a[i][0] = i * del_cost1, [[]] * i + delcost += del_cost1(l1[i], 0) + a[i][0] = delcost, [[]] * i for j in xrange(1, m + 1): @@ -105,24 +104,24 @@ def align(l1, l2, c2): min_s, min_b = s + w * d, b + [[j-1]] s, b = a[i-1][j] - if s + del_cost1 < min_s: + if s + del1(l1[i-1], j) < min_s: min_s, min_b = s + del_cost1, b + [[]] s, b = a[i][j-1] - if s + del_cost2(l2[j-1]) < min_s: - min_s, min_b = s + del_cost2(l2[j-1]), b + if s + del2(l2[j-1], i) < min_s: + min_s, min_b = s + del2(l2[j-1], i), b - for k in xrange(1, 8): - for l in xrange(1, 5): + for k in xrange(1, backtrack1): + for l in xrange(1, backtrack2): if k + l <= 2: continue - if k+l > 7: + if k+l > (backtrack1+backtrack2)/2.: break if j < l or i < k: break s, b = a[i-k][j-l] - d = levenshtein(join_words(l1[i-k:i]), - join_ocr_words(l2[j-l:j], c2[j-l:j])) + d = levenshtein(join1(l1[i-k:i]), + join2(l2[j-l:j], c2[j-l:j])) if s + w * d < min_s: temp = [[j-1]] if l == 1 else [range(j-l, j)] min_s, min_b = s + w * d, b + temp * k -- cgit v1.2.3-70-g09d2