diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-02-27 11:46:30 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-02-27 11:46:30 -0500 |
| commit | 38a7cef65e167a4004f37f6651777e99ab5ba6d3 (patch) | |
| tree | 2a102ad1f25c465471cdec5e8f7ce89a681a2265 | |
| parent | 76054347cffacd7a4a6759f2187717e185d8082c (diff) | |
| download | ocr-layer-curation-38a7cef65e167a4004f37f6651777e99ab5ba6d3.tar.gz | |
Adapting the web client code to the new behavior of parsedjvutext
| -rw-r--r-- | parsedjvutext.py | 26 | ||||
| -rw-r--r-- | web/utils.py | 7 |
2 files changed, 19 insertions, 14 deletions
```diff
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 06cecb9..2ded9d2 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -5,6 +5,7 @@ import djvu
 from djvu.decode import Context
 from itertools import chain
 
+
 def parse_book_xml(djvubook):
     args = ["djvutoxml", djvubook]
     soup = BeautifulSoup(subprocess.check_output(args), "lxml")
@@ -12,23 +13,26 @@ def parse_book_xml(djvubook):
     coords = []
     for page in soup.find_all("hiddentext"):
         words.append([word.text for word in page.find_all("word")])
-        coords.append([tuple(map(int, word["coords"].split(","))) \
-                        for word in page.find_all("word")])
+        coords.append([tuple(map(int, word["coords"].split(",")))
+                       for word in page.find_all("word")])
     return {"words": words, "coords": coords}
 
+
 def get_npages(djvubook):
     args = ["djvused", "-e", "n", djvubook]
     return int(subprocess.check_output(args))
 
+
 def parse_page_xml(djvubook, pagenumber):
     args = ["djvutoxml", "--page", str(pagenumber), djvubook]
     soup = BeautifulSoup(subprocess.check_output(args), "lxml")
     all_words = soup.find_all("word")
     words = [word.text for word in all_words]
-    coords = [tuple(map(int, word["coords"].split(","))) \
-                for word in all_words]
+    coords = [tuple(map(int, word["coords"].split(",")))
+              for word in all_words]
     return {"words": words, "coords": coords}
 
+
 def parse_page_sexp(s, page_size=None):
     if type(s) is djvu.sexpr.ListExpression:
         if len(s) == 0:
@@ -36,16 +40,18 @@ def parse_page_sexp(s, page_size=None):
         if str(s[0].value) == "word":
             coords = [s[i].value for i in xrange(1, 5)]
             if page_size:
-                coords[1]=page_size-coords[1]
-                coords[3]=page_size-coords[3]
+                coords[1] = page_size - coords[1]
+                coords[3] = page_size - coords[3]
             word = s[5].value
             yield (word, coords)
         else:
-            for c in chain.from_iterable(parse_page_sexp(child, page_size) for child in s[5:]):
+            for c in chain.from_iterable(parse_page_sexp(child, page_size)
+                                         for child in s[5:]):
                 yield c
     else:
         pass
 
+
 def parse_book(djvubook, page=None, html=False):
     """ returns the list of words and
     coordinates from a djvu book.
@@ -56,7 +62,7 @@ def parse_book(djvubook, page=None, html=False):
     document = c.new_document(djvu.decode.FileURI(djvubook))
     document.decoding_job.wait()
     if page:
-        toparse = [document.pages[page-1]]
+        toparse = [document.pages[page - 1]]
     else:
         toparse = document.pages
     words = [[]] * len(toparse)
@@ -65,7 +71,7 @@ def parse_book(djvubook, page=None, html=False):
     for i, page in enumerate(toparse):
         if page.text.sexpr:
             if html:
-                page_size= p.size[1]
+                page_size = int(page.size[1])
             gen = parse_page_sexp(page.text.sexpr, page_size)
             word_coords = zip(*gen)
             words[i] = word_coords[0]
@@ -73,5 +79,5 @@ def parse_book(djvubook, page=None, html=False):
 
     return {"words": words, "coords": coords}
 
-if __name__=="__main__":
+if __name__ == "__main__":
     book = parse_book(sys.argv[1])
diff --git a/web/utils.py b/web/utils.py
index 72d05dd..583cd1c 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -1,15 +1,14 @@
-from parsedjvutext import parse_page_sexp
+from parsedjvutext import parse_book
 import sys
 
 
 def gen_html(book, page_number):
     book = "../Villiers_de_L\'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu"
-    d = parse_page_sexp(book, page_number)
-    coords, words = d["coords"], d["words"]
+    d = parse_book(book, page=int(page_number), html=True)
+    coords, words = d["coords"][0], d["words"][0]
 
     def get_areas():
         for i, coord in enumerate(coords):
-            coord[1], coord[3] = 2764 - coord[3], 2764 - coord[1]
             coord_str = ",".join(map(str, coord))
             yield i, coord_str
 
```
