about | summary | refs | log | tree | commit | diff | stats
path: root/compare.py
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2013-08-05 22:29:24 +0200
committer Thibaut Horel <thibaut.horel@gmail.com> 2013-08-05 22:45:45 +0200
commit b51137379fab2d5579e0caeada5389c682ec3cc5 (patch)
tree 49e1c6bab1a69143e9aa290e48e5849e7b2b6756 /compare.py
parent 80133c8d2d0a7334e8f53a11eaa48a8d7b70c1da (diff)
download ocr-layer-curation-b51137379fab2d5579e0caeada5389c682ec3cc5.tar.gz
Use a Needleman-Wunsch type algorithm for text alignment
Diffstat (limited to 'compare.py')
-rw-r--r-- compare.py 15
1 files changed, 9 insertions, 6 deletions
diff --git a/compare.py b/compare.py
index aeb9092..c37cde9 100644
--- a/compare.py
+++ b/compare.py
@@ -1,12 +1,15 @@
+# -*- coding: utf-8 -*-
+
from wikisource import get_page
from parsedjvutext import parse_page_sexp
-from string_utils import LCS, printDiff
+import string_utils as su
-wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu"
+wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+# wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
-n = 42
+n = 79
ocrpage = parse_page_sexp(wikibook, n)
l1 = ocrpage['words']
-l2 = get_page(wikibook, n).split()
-C = LCS(l1, l2)
-printDiff(C, l1, l2, len(l1), len(l2))
+l2 = get_page(wikibook, n).replace(u"’", u"'").split()
+C = su.align(l2, l1)
+su.print_alignment(l2, l1, C[1])