From 17c23c5d2b6680f90117a7804e65dd7fe541848f Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Sun, 18 Aug 2013 16:54:53 -0400 Subject: try to fix the alignment_to_sexp function --- compare.py | 11 ++++++++--- string_utils.py | 15 ++++++++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/compare.py b/compare.py index 7305517..b6a4c8a 100644 --- a/compare.py +++ b/compare.py @@ -1,16 +1,21 @@ # -*- coding: utf-8 -*- from wikisource import get_page -from parsedjvutext import parse_page_sexp +from parsedjvutext import page_sexp, parse_page_sexp import string_utils as su +import pdb -# wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_") -wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_") +wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_") +#wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_") n = 88 ocrpage = parse_page_sexp(wikibook, n) l1, c1 = ocrpage['words'], ocrpage["coords"] l2 = get_page(wikibook, n) +print len(l2.split()) l3 = su.simplify(l2) C = su.align(l3.split(), l1, c1) +pdb.set_trace() +sexp = page_sexp(wikibook, n) +su.alignment_to_sexp(C[1], sexp, l2.split()) su.print_alignment(l2.split(), l1, c1, C[1]) diff --git a/string_utils.py b/string_utils.py index c4439da..12d22b8 100644 --- a/string_utils.py +++ b/string_utils.py @@ -172,9 +172,14 @@ def alignment_to_sexp(alignment, sexp, l2): if index == -1: break else: - re.sub("(?P\d+ \d+ \d+ \d+\s) \w+(?P\)+$)", - "\g{0}\g".format( - " ".join([l2[i] for i in list(index)])), - line) - line.encode('string-escape') + if type(index) == tuple: + word = " ".join([l2[i] for i in list(index)]) + else: + try: + word = l2[index] + except IndexError: + print index + word = word.encode("utf-8").encode("string-escape") + re.sub("(?P\d+ \d+ \d+ \d+\s) \w+(?P\)+$)", + "\g{0}\g".format(word), line) print line -- cgit v1.2.3-70-g09d2