diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-02-27 12:03:28 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-02-27 12:03:28 -0500 |
| commit | 6d386d892ffde28d051cf5ba066391c8834dc3c6 (patch) | |
| tree | 20fda9cca00f363a0cc44b141fbf9646c34044d9 /parsedjvutext.py | |
| parent | ef764c648172a4ebd011bce43ff56bc9533659ca (diff) | |
| download | ocr-layer-curation-6d386d892ffde28d051cf5ba066391c8834dc3c6.tar.gz | |
Simplify parse_book a bit, also making it more natural to use
Diffstat (limited to 'parsedjvutext.py')
| -rw-r--r-- | parsedjvutext.py | 22 |
1 files changed, 10 insertions, 12 deletions
```diff
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 11be498..598151f 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -65,19 +65,17 @@ def parse_book(djvubook, page=None, html=False):
         toparse = [document.pages[page - 1]]
     else:
         toparse = document.pages
-    words = [[]] * len(toparse)
-    coords = [[]] * len(toparse)
-    page_size = None
-    for i, page in enumerate(toparse):
-        if page.text.sexpr:
-            if html:
-                page_size = page.size[1]
-            gen = parse_page_sexp(page.text.sexpr, page_size)
-            word_coords = zip(*gen)
-            words[i] = word_coords[0]
-            coords[i] = word_coords[1]
 
-    return {"words": words, "coords": coords}
+    def gen_pages():
+        page_size = None
+        for i, page in enumerate(toparse):
+            if page.text.sexpr:
+                if html:
+                    page_size = page.size[1]
+                gen = parse_page_sexp(page.text.sexpr, page_size)
+                yield zip(*gen)
+
+    return list(gen_pages())
 
 if __name__ == "__main__":
     book = parse_book(sys.argv[1])
```
