diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-02-25 09:23:51 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-02-25 09:23:51 -0500 |
| commit | c2e34cd284f5fd043c26f110fe5e85411ab9051e (patch) | |
| tree | d67f0797a8dfcc87e96e98ec5a1046115ccc8bb2 /parsedjvutext.py | |
| parent | c40ad3ecef221e3f3d6a8633687c896603d493a0 (diff) | |
| download | ocr-layer-curation-c2e34cd284f5fd043c26f110fe5e85411ab9051e.tar.gz | |
using the djvu library for parsing djvu documents
Diffstat (limited to 'parsedjvutext.py')
| -rw-r--r-- | parsedjvutext.py | 53 |
1 files changed, 35 insertions, 18 deletions
```diff
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 773a1d4..696601d 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -1,6 +1,9 @@
 import sys
 from bs4 import BeautifulSoup
 import subprocess
+import djvu
+from djvu.decode import Context
+from itertools import chain
 
 def parse_book_xml(djvubook):
     args = ["djvutoxml", djvubook]
@@ -43,25 +46,39 @@ def parse_page_sexp(djvubook, pagenumber):
             if "word" in line]
     return {"words": [a for a, b in page], "coords": [b for a, b in page]}
 
-def parse_book_sexp(djvubook):
+def parse_sexp(s):
+    if type(s) is djvu.sexpr.ListExpression:
+        if len(s) == 0:
+            return []
+        if str(s[0].value) == "word":
+            coords = [s[i].value for i in xrange(1, 5)]
+            word = s[5].value
+            return [(word, coords)]
+        else:
+            gen = chain.from_iterable(parse_sexp(child) for child in s[5:])
+            return list(gen)
+    else:
+        return []
+
+def parse_book_sexp(djvubook, page=None, html=False):
+    """
+    returns the list of words and coordinates from a djvu book.
+    if page is None, returns the whole book.
+    if html is True, coordinates are computed from the bottom of the page
+    """
     book = {"words": [], "coords": []}
-    page_coords = []
-    page_words = []
-    firstpage = True
-    args = ["djvused", "-e", "print-txt", djvubook]
-    for line in subprocess.check_output(args).split("\n"):
-        if "page" in line:
-            if firstpage:
-                firstpage = False
-            else:
-                book["words"].append(page_words)
-                book["coords"].append(page_coords)
-                page_coords = []
-                page_words = []
-        if "word" in line:
-            word, coords = parse_wordline(line)
-            page_words.append(word)
-            page_coords.append(coords)
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvubook))
+    document.decoding_job.wait()
+    if page:
+        toparse = [document.pages[page-1]]
+    else:
+        toparse = document.pages
+    for page in toparse:
+        gen = parse_sexp(page.text.sexpr)
+        word_coords = zip(*gen)
+        book["words"].append(word_coords[0])
+        book["coords"].append(word_coords[1])
     return book
 
 if __name__=="__main__":
```
