diff options
| author | Guillaume Horel <guillaume.horel@serenitascapital.com> | 2014-02-26 17:52:14 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@serenitascapital.com> | 2014-02-26 17:52:14 -0500 |
| commit | b620744bbc7554c8e7a320019a8601f7ca2a5188 (patch) | |
| tree | 8ab1d91b6d0b9a26efc9f88b1c25c9c488b7ab4c | |
| parent | c2e34cd284f5fd043c26f110fe5e85411ab9051e (diff) | |
| download | ocr-layer-curation-b620744bbc7554c8e7a320019a8601f7ca2a5188.tar.gz | |
fix the djvu parsing and add html coordinates
| -rw-r--r-- | parsedjvutext.py | 56 |
1 files changed, 24 insertions, 32 deletions
```diff
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 696601d..06cecb9 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -29,44 +29,29 @@ def parse_page_xml(djvubook, pagenumber):
               for word in all_words]
     return {"words": words, "coords": coords}
 
-def parse_wordline(line):
-    line = line.lstrip(" (").rstrip(")").split(" ")
-    word = line[5]
-    word = word[1:-1].decode("string_escape").decode("utf-8")
-    coords = map(int, line[1:5])
-    return word, coords
-
-def page_sexp(djvubook, pagenumber):
-    args = ["djvused", "-e", "select {0};print-txt".format(pagenumber),
-            djvubook]
-    return subprocess.check_output(args).split("\n")
-
-def parse_page_sexp(djvubook, pagenumber):
-    page = [parse_wordline(line) for line in page_sexp(djvubook, pagenumber) \
-            if "word" in line]
-    return {"words": [a for a, b in page], "coords": [b for a, b in page]}
-
-def parse_sexp(s):
+def parse_page_sexp(s, page_size=None):
     if type(s) is djvu.sexpr.ListExpression:
         if len(s) == 0:
-            return []
+            pass
         if str(s[0].value) == "word":
             coords = [s[i].value for i in xrange(1, 5)]
+            if page_size:
+                coords[1]=page_size-coords[1]
+                coords[3]=page_size-coords[3]
             word = s[5].value
-            return [(word, coords)]
+            yield (word, coords)
         else:
-            gen = chain.from_iterable(parse_sexp(child) for child in s[5:])
-            return list(gen)
+            for c in chain.from_iterable(parse_page_sexp(child, page_size) for child in s[5:]):
+                yield c
     else:
-        return []
+        pass
 
-def parse_book_sexp(djvubook, page=None, html=False):
+def parse_book(djvubook, page=None, html=False):
     """
     returns the list of words and coordinates from a djvu book.
     if page is None, returns the whole book.
     if html is True, coordinates are computed from the bottom of the page
     """
-    book = {"words": [], "coords": []}
     c = Context()
     document = c.new_document(djvu.decode.FileURI(djvubook))
     document.decoding_job.wait()
@@ -74,12 +59,19 @@ def parse_book_sexp(djvubook, page=None, html=False):
         toparse = [document.pages[page-1]]
     else:
         toparse = document.pages
-    for page in toparse:
-        gen = parse_sexp(page.text.sexpr)
-        word_coords = zip(*gen)
-        book["words"].append(word_coords[0])
-        book["coords"].append(word_coords[1])
-    return book
+    words = [[]] * len(toparse)
+    coords = [[]] * len(toparse)
+    page_size = None
+    for i, page in enumerate(toparse):
+        if page.text.sexpr:
+            if html:
+                page_size= p.size[1]
+            gen = parse_page_sexp(page.text.sexpr, page_size)
+            word_coords = zip(*gen)
+            words[i] = word_coords[0]
+            coords[i] = word_coords[1]
+
+    return {"words": words, "coords": coords}
 
 if __name__=="__main__":
-    book_sexp = parse_book_sexp(sys.argv[1])
+    book = parse_book(sys.argv[1])
```
