diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-02-27 11:27:51 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-02-27 11:27:51 -0500 |
| commit | b8013d90c16cf4e1225205fc309f24f7c06ea6c5 (patch) | |
| tree | 9b8ca439ddc2897b1375362c73037b35be9dd167 | |
| parent | 473ef7e26fc8d2c6b26e66b80d50e49c18fa24f8 (diff) | |
| parent | b620744bbc7554c8e7a320019a8601f7ca2a5188 (diff) | |
| download | ocr-layer-curation-b8013d90c16cf4e1225205fc309f24f7c06ea6c5.tar.gz | |
Merge branch 'master' of horel.org:thibaut/ocr-layer-curation
cessary,
| -rw-r--r-- | parsedjvutext.py | 81 |
1 files changed, 45 insertions, 36 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py index 773a1d4..06cecb9 100644 --- a/parsedjvutext.py +++ b/parsedjvutext.py @@ -1,6 +1,9 @@ import sys from bs4 import BeautifulSoup import subprocess +import djvu +from djvu.decode import Context +from itertools import chain def parse_book_xml(djvubook): args = ["djvutoxml", djvubook] @@ -26,43 +29,49 @@ def parse_page_xml(djvubook, pagenumber): for word in all_words] return {"words": words, "coords": coords} -def parse_wordline(line): - line = line.lstrip(" (").rstrip(")").split(" ") - word = line[5] - word = word[1:-1].decode("string_escape").decode("utf-8") - coords = map(int, line[1:5]) - return word, coords +def parse_page_sexp(s, page_size=None): + if type(s) is djvu.sexpr.ListExpression: + if len(s) == 0: + pass + if str(s[0].value) == "word": + coords = [s[i].value for i in xrange(1, 5)] + if page_size: + coords[1]=page_size-coords[1] + coords[3]=page_size-coords[3] + word = s[5].value + yield (word, coords) + else: + for c in chain.from_iterable(parse_page_sexp(child, page_size) for child in s[5:]): + yield c + else: + pass -def page_sexp(djvubook, pagenumber): - args = ["djvused", "-e", "select {0};print-txt".format(pagenumber), - djvubook] - return subprocess.check_output(args).split("\n") +def parse_book(djvubook, page=None, html=False): + """ + returns the list of words and coordinates from a djvu book. + if page is None, returns the whole book. + if html is True, coordinates are computed from the bottom of the page + """ + c = Context() + document = c.new_document(djvu.decode.FileURI(djvubook)) + document.decoding_job.wait() + if page: + toparse = [document.pages[page-1]] + else: + toparse = document.pages + words = [[]] * len(toparse) + coords = [[]] * len(toparse) + page_size = None + for i, page in enumerate(toparse): + if page.text.sexpr: + if html: + page_size= p.size[1] + gen = parse_page_sexp(page.text.sexpr, page_size) + word_coords = zip(*gen) + words[i] = word_coords[0] + coords[i] = word_coords[1] -def parse_page_sexp(djvubook, pagenumber): - page = [parse_wordline(line) for line in page_sexp(djvubook, pagenumber) \ - if "word" in line] - return {"words": [a for a, b in page], "coords": [b for a, b in page]} - -def parse_book_sexp(djvubook): - book = {"words": [], "coords": []} - page_coords = [] - page_words = [] - firstpage = True - args = ["djvused", "-e", "print-txt", djvubook] - for line in subprocess.check_output(args).split("\n"): - if "page" in line: - if firstpage: - firstpage = False - else: - book["words"].append(page_words) - book["coords"].append(page_coords) - page_coords = [] - page_words = [] - if "word" in line: - word, coords = parse_wordline(line) - page_words.append(word) - page_coords.append(coords) - return book + return {"words": words, "coords": coords} if __name__=="__main__": - book_sexp = parse_book_sexp(sys.argv[1]) + book = parse_book(sys.argv[1]) |
