diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-03-01 15:26:45 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-03-01 15:30:27 -0500 |
| commit | bf74cd2294598c3dc1d73edd74ca88c87b7d6cd6 (patch) | |
| tree | fe74e894aabdc5ca6a8ee08fa3a0c1f7d0cb4094 /web/utils.py | |
| parent | c5734b6b776727959f1b485651f1ddc7c8121a85 (diff) | |
| download | ocr-layer-curation-bf74cd2294598c3dc1d73edd74ca88c87b7d6cd6.tar.gz | |
Preliminary support for corrected text
* It's slow; need to figure out how to load it in the background, maybe
* The bounding boxes could be improved
Diffstat (limited to 'web/utils.py')
| -rw-r--r-- | web/utils.py | 21 |
1 files changed, 15 insertions, 6 deletions
diff --git a/web/utils.py b/web/utils.py index bb9a4fe..8522841 100644 --- a/web/utils.py +++ b/web/utils.py @@ -1,12 +1,21 @@ -from parsedjvutext import parse_book +import djvu_utils as du import sys - +import string_utils as su +from wikisource import get_page def gen_html(book, page_number): - d = parse_book(book, page=int(page_number), html=True) - if d[0]: - words, coords = zip(*d[0]) - return (list(enumerate(coords)), list(enumerate(words))) + doc = du.get_document("../" + book) + page = doc.pages[int(page_number)-1] + d = du.parse_page(page) + corrected_text = get_page(book, int(page_number)) + corrected_words = su.simplify(corrected_text).split() + if d: + words, coords = zip(*d) + C = su.align(corrected_words, list(words), list(coords)) + r = su.alignment_to_sexp(corrected_text.split(), words, coords, C[1]) + corrected_words, coords = zip(*r) + coords_html = du.convert_to_htmlcoord(coords, page.size[1]) + return (list(enumerate(coords_html)), list(enumerate(corrected_words))) if __name__ == "__main__": gen_html(*sys.argv[1:3]) |
