Take line jumps into account when grouping words

author: Thibaut Horel <thibaut.horel@gmail.com> 2013-08-17 18:25:04 +0200
committer: Thibaut Horel <thibaut.horel@gmail.com> 2013-08-17 18:25:04 +0200
commit: f633ea807ef9a83f84f41767e71fad71656a4439 (patch)
tree: 5dca000d4892c4f23c61f46cd90603ea0ae7091e
parent: b5a00a5e914da988dcd81c6d276f7bb22a46aa20 (diff)
download: ocr-layer-curation-f633ea807ef9a83f84f41767e71fad71656a4439.tar.gz
2 files changed, 21 insertions, 18 deletions
diff --git a/compare.py b/compare.py
index a458924..7305517 100644
--- a/compare.py
+++ b/compare.py
@@ -7,10 +7,10 @@ import string_utils as su
 # wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
 wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 
-n = 79
+n = 88
 ocrpage = parse_page_sexp(wikibook, n)
-l1 = ocrpage['words']
+l1, c1 = ocrpage['words'], ocrpage["coords"]
 l2 = get_page(wikibook, n)
 l3 = su.simplify(l2)
-C = su.align(l2.split(), l1)
-su.print_alignment(l3.split(), l1, C[1])
+C = su.align(l3.split(), l1, c1)
+su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/string_utils.py b/string_utils.py
index 2293186..1b94ce3 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -3,7 +3,7 @@ from Levenshtein import distance as levenshtein
 
 def simplify(text):
     mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
-            (u"À", u"A"), (u"Ô", u"O")]
+            (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")]
 
     for a, b in mapp:
         text = text.replace(a, b)
@@ -54,13 +54,16 @@ def printDiff(C, X, Y, i, j):
             printDiff(C, X, Y, i-1, j)
             print "- " + X[i-1]
 
-def join_words(l):
-    if len(l) >= 2 and l[-2][-1] == "-":
+def join_ocr_words(l, c):
+    m = list(l)
+    if len(l) >= 2 and c[-2][2] > c[-1][0] and (not l[-2][-1].isalnum()):
         l[-2] = l[-2][:-1]
+    return "".join(l)
 
+def join_words(l):
     return "".join(l)
 
-def align(l1, l2):
+def align(l1, l2, c2):
     """Compute the optimal alignment between two list of words
     à la Needleman-Wunsch.
 
@@ -77,10 +80,10 @@ def align(l1, l2):
     # and l2 as the OCR text. The deletion costs are not symmetric: removing
     # junk from the OCR is frequent while removing a word from the proofread
     # text should be rare.
-    del_cost1 = 30
+    del_cost1 = 50
     def del_cost2(w):
-        return 1 + len([c for c in w if c.isalnum()])
-    w = 4 # multiplicative cost factor for the Levenshtein distance
+        return 1+3*len([c for c in w if c.isalnum()])
+    w = 3 # multiplicative cost factor for the Levenshtein distance
 
     n, m = len(l1), len(l2)
     # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
@@ -107,26 +110,26 @@ def align(l1, l2):
             if s + del_cost2(l2[j-1]) < min_s:
                 min_s, min_b = s + del_cost2(l2[j-1]), b
 
-            for k in xrange(1, 5):
+            for k in xrange(1, 8):
                 for l in xrange(1, 5):
                     if k + l <= 2:
                         continue
-                    if k+l > 6:
+                    if k+l > 7:
                         break
                     if j < l or i < k:
                         break
                     s, b = a[i-k][j-l]
                     d = levenshtein(join_words(l1[i-k:i]),
-                                    join_words(l2[j-l:j]))
-                    if s + w * d + k < min_s:
+                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
+                    if s + w * d < min_s:
                         temp = [j-1] if l == 1 else [tuple(range(j-l, j))]
-                        min_s, min_b = s + w * d + k, b + temp * k
+                        min_s, min_b = s + w * d, b + temp * k
 
             a[i][j] = min_s, min_b
 
     return a[n][m]
 
-def print_alignment(l1, l2, alignment):
+def print_alignment(l1, l2, c2, alignment):
     """Given two list of words and an alignment (as defined in :func:`align`)
     print the two list of words side-by-side and aligned.
     """
@@ -160,7 +163,7 @@ def print_alignment(l1, l2, alignment):
 
             if end > begin:
                 print u"{0:>25} | {1:<25} (M)".format(word,
-                                                      join_words(l2[begin:end+1]))
+                                                      join_ocr_words(l2[begin:end+1], c2[begin:end+1]))
             else:
                 print u"{0:>25} | {1:<25}".format(word, l2[begin])
author	Thibaut Horel <thibaut.horel@gmail.com>	2013-08-17 18:25:04 +0200
committer	Thibaut Horel <thibaut.horel@gmail.com>	2013-08-17 18:25:04 +0200
commit	f633ea807ef9a83f84f41767e71fad71656a4439 (patch)
tree	5dca000d4892c4f23c61f46cd90603ea0ae7091e
parent	b5a00a5e914da988dcd81c6d276f7bb22a46aa20 (diff)
download	ocr-layer-curation-f633ea807ef9a83f84f41767e71fad71656a4439.tar.gz