diff options
| author | Guillaume Horel <guillaume.horel@serenitascapital.com> | 2014-02-28 17:31:19 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@serenitascapital.com> | 2014-02-28 17:31:54 -0500 |
| commit | 6e694d555e1004da58ec3425d33043b2f1b5f715 (patch) | |
| tree | 01fc9f907401f50cf6c869c7e1c057287b2dc405 /compare.py | |
| parent | aaa42a8efcd53576ced9bf2311e84d8ff2a5c8cf (diff) | |
| download | ocr-layer-curation-6e694d555e1004da58ec3425d33043b2f1b5f715.tar.gz | |
update with the new functions
Diffstat (limited to 'compare.py')
| -rw-r--r-- | compare.py | 13 |
1 files changed, 5 insertions, 8 deletions
```diff
@@ -1,21 +1,18 @@
 # -*- coding: utf-8 -*-
 from wikisource import get_page
-from parsedjvutext import page_sexp, parse_page_sexp
+from parsedjvutext import parse_book
 import string_utils as su
-import pdb
 wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
 #wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 n = 88
-ocrpage = parse_page_sexp(wikibook, n)
-l1, c1 = ocrpage['words'], ocrpage["coords"]
+ocrpage = parse_book(wikibook, n)
+l1, c1 = zip(*ocrpage[0])
+l1 = list(l1)
+c1 = list(c1)
 l2 = get_page(wikibook, n)
-print len(l2.split())
 l3 = su.simplify(l2)
 C = su.align(l3.split(), l1, c1)
-#pdb.set_trace()
-#sexp = page_sexp(wikibook, n)
-#su.alignment_to_sexp(C[1], sexp, l2.split())
 su.print_alignment(l2.split(), l1, c1, C[1])
```
