Merge branch 'refactor_align' of horel.org:thibaut/ocr-layer-curation into refactor_align

author: Guillaume Horel <guillaume.horel@gmail.com> 2013-12-28 10:13:29 -0500
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-12-28 10:13:29 -0500
commit: 5ad22522df6b4b725fa7fdb46ff6c78d627775a5 (patch)
tree: eea4db1155925a05925505c8c6437fd7635b9c96
parent: 5dc1322e4c7e78ca98e3ad910f816ad45ac7bfd8 (diff)
parent: 8dd400ab39e84afc13afba3acd15aa5f6918f03f (diff)
download: ocr-layer-curation-5ad22522df6b4b725fa7fdb46ff6c78d627775a5.tar.gz
2 files changed, 24 insertions, 14 deletions
diff --git a/compare.py b/compare.py
index b6a4c8a..daec25c 100644
--- a/compare.py
+++ b/compare.py
@@ -14,8 +14,19 @@ l1, c1 = ocrpage['words'], ocrpage["coords"]
 l2 = get_page(wikibook, n)
 print len(l2.split())
 l3 = su.simplify(l2)
+
+def del_cost1(w, pos):
+    return 50
+
+def del_cost2(w, pos):
+    return 1+3*len([c for c in w if c.isalnum()])
+bactrack1 = 8
+backtrack2 = 5
+
 C = su.align(l3.split(), l1, c1)
 pdb.set_trace()
+su.print_alignment(l2.split(), l1, c1, C[1])
+
 sexp = page_sexp(wikibook, n)
 su.alignment_to_sexp(C[1], sexp, l2.split())
 su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/string_utils.py b/string_utils.py
index 186f2eb..55e0428 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -65,7 +65,7 @@ def join_ocr_words(l, c):
 def join_words(l):
     return "".join(l)
 
-def align(l1, l2, c2):
+def align(l1, l2, c2, del1, del2, join1, join2, backtrack1, bactrack2):
     """Compute the optimal alignment between two list of words
     à la Needleman-Wunsch.
 
@@ -82,9 +82,6 @@ def align(l1, l2, c2):
     # and l2 as the OCR text. The deletion costs are not symmetric: removing
     # junk from the OCR is frequent while removing a word from the proofread
     # text should be rare.
-    del_cost1 = 50
-    def del_cost2(w):
-        return 1+3*len([c for c in w if c.isalnum()])
     w = 3 # multiplicative cost factor for the Levenshtein distance
 
     n, m = len(l1), len(l2)
@@ -94,9 +91,11 @@ def align(l1, l2, c2):
 
     for j in xrange(1, m + 1):
         a[0][j] = j, []
-
+    
+    delcost = 
     for i in xrange(1, n + 1):
-        a[i][0] = i * del_cost1, [[]] * i
+        delcost += del_cost1(l1[i], 0)
+        a[i][0] = delcost, [[]] * i
 
         for j in xrange(1, m + 1):
 
@@ -105,24 +104,24 @@ def align(l1, l2, c2):
             min_s, min_b  = s + w * d, b + [[j-1]]
 
             s, b = a[i-1][j]
-            if s + del_cost1 < min_s:
+            if s + del1(l1[i-1], j) < min_s:
                 min_s, min_b = s + del_cost1, b + [[]]
 
             s, b = a[i][j-1]
-            if s + del_cost2(l2[j-1]) < min_s:
-                min_s, min_b = s + del_cost2(l2[j-1]), b
+            if s + del2(l2[j-1], i) < min_s:
+                min_s, min_b = s + del2(l2[j-1], i), b
 
-            for k in xrange(1, 8):
-                for l in xrange(1, 5):
+            for k in xrange(1, backtrack1):
+                for l in xrange(1, backtrack2):
                     if k + l <= 2:
                         continue
-                    if k+l > 7:
+                    if k+l > (backtrack1+backtrack2)/2.:
                         break
                     if j < l or i < k:
                         break
                     s, b = a[i-k][j-l]
-                    d = levenshtein(join_words(l1[i-k:i]),
-                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
+                    d = levenshtein(join1(l1[i-k:i]),
+                                    join2(l2[j-l:j], c2[j-l:j]))
                     if s + w * d < min_s:
                         temp = [[j-1]] if l == 1 else [range(j-l, j)]
                         min_s, min_b = s + w * d, b + temp * k
author	Guillaume Horel <guillaume.horel@gmail.com>	2013-12-28 10:13:29 -0500
committer	Guillaume Horel <guillaume.horel@gmail.com>	2013-12-28 10:13:29 -0500
commit	5ad22522df6b4b725fa7fdb46ff6c78d627775a5 (patch)
tree	eea4db1155925a05925505c8c6437fd7635b9c96
parent	5dc1322e4c7e78ca98e3ad910f816ad45ac7bfd8 (diff)
parent	8dd400ab39e84afc13afba3acd15aa5f6918f03f (diff)
download	ocr-layer-curation-5ad22522df6b4b725fa7fdb46ff6c78d627775a5.tar.gz