aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Guillaume Horel <guillaume.horel@gmail.com> 2013-12-28 10:13:29 -0500
committer Guillaume Horel <guillaume.horel@gmail.com> 2013-12-28 10:13:29 -0500
commit 5ad22522df6b4b725fa7fdb46ff6c78d627775a5 (patch)
tree eea4db1155925a05925505c8c6437fd7635b9c96
parent 5dc1322e4c7e78ca98e3ad910f816ad45ac7bfd8 (diff)
parent 8dd400ab39e84afc13afba3acd15aa5f6918f03f (diff)
download ocr-layer-curation-5ad22522df6b4b725fa7fdb46ff6c78d627775a5.tar.gz
Merge branch 'refactor_align' of horel.org:thibaut/ocr-layer-curation into refactor_align
-rw-r--r-- compare.py 11
-rw-r--r-- string_utils.py 27
2 files changed, 24 insertions, 14 deletions
diff --git a/compare.py b/compare.py
index b6a4c8a..daec25c 100644
--- a/compare.py
+++ b/compare.py
@@ -14,8 +14,19 @@ l1, c1 = ocrpage['words'], ocrpage["coords"]
l2 = get_page(wikibook, n)
print len(l2.split())
l3 = su.simplify(l2)
+
+def del_cost1(w, pos):
+ return 50
+
+def del_cost2(w, pos):
+ return 1+3*len([c for c in w if c.isalnum()])
+bactrack1 = 8
+backtrack2 = 5
+
C = su.align(l3.split(), l1, c1)
pdb.set_trace()
+su.print_alignment(l2.split(), l1, c1, C[1])
+
sexp = page_sexp(wikibook, n)
su.alignment_to_sexp(C[1], sexp, l2.split())
su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/string_utils.py b/string_utils.py
index 186f2eb..55e0428 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -65,7 +65,7 @@ def join_ocr_words(l, c):
def join_words(l):
return "".join(l)
-def align(l1, l2, c2):
+def align(l1, l2, c2, del1, del2, join1, join2, backtrack1, bactrack2):
"""Compute the optimal alignment between two list of words
à la Needleman-Wunsch.
@@ -82,9 +82,6 @@ def align(l1, l2, c2):
# and l2 as the OCR text. The deletion costs are not symmetric: removing
# junk from the OCR is frequent while removing a word from the proofread
# text should be rare.
- del_cost1 = 50
- def del_cost2(w):
- return 1+3*len([c for c in w if c.isalnum()])
w = 3 # multiplicative cost factor for the Levenshtein distance
n, m = len(l1), len(l2)
@@ -94,9 +91,11 @@ def align(l1, l2, c2):
for j in xrange(1, m + 1):
a[0][j] = j, []
-
+
+ delcost =
for i in xrange(1, n + 1):
- a[i][0] = i * del_cost1, [[]] * i
+ delcost += del_cost1(l1[i], 0)
+ a[i][0] = delcost, [[]] * i
for j in xrange(1, m + 1):
@@ -105,24 +104,24 @@ def align(l1, l2, c2):
min_s, min_b = s + w * d, b + [[j-1]]
s, b = a[i-1][j]
- if s + del_cost1 < min_s:
+ if s + del1(l1[i-1], j) < min_s:
min_s, min_b = s + del_cost1, b + [[]]
s, b = a[i][j-1]
- if s + del_cost2(l2[j-1]) < min_s:
- min_s, min_b = s + del_cost2(l2[j-1]), b
+ if s + del2(l2[j-1], i) < min_s:
+ min_s, min_b = s + del2(l2[j-1], i), b
- for k in xrange(1, 8):
- for l in xrange(1, 5):
+ for k in xrange(1, backtrack1):
+ for l in xrange(1, backtrack2):
if k + l <= 2:
continue
- if k+l > 7:
+ if k+l > (backtrack1+backtrack2)/2.:
break
if j < l or i < k:
break
s, b = a[i-k][j-l]
- d = levenshtein(join_words(l1[i-k:i]),
- join_ocr_words(l2[j-l:j], c2[j-l:j]))
+ d = levenshtein(join1(l1[i-k:i]),
+ join2(l2[j-l:j], c2[j-l:j]))
if s + w * d < min_s:
temp = [[j-1]] if l == 1 else [range(j-l, j)]
min_s, min_b = s + w * d, b + temp * k