From 424aa29f600bc17c8391a3802206385962648519 Mon Sep 17 00:00:00 2001
From: Guillaume Horel
Date: Sat, 1 Mar 2014 15:24:31 -0500
Subject: reorganize djvu_utils a bit

---
Review notes (this section is ignored when the patch is applied):
- parsedjvutext.py is renamed to djvu_utils.py; the html flag is removed
  from parse_page()/parse_book() and the coordinate flip is moved into the
  new convert_to_htmlcoord() helper.
- The __main__ block no longer passes html=True, since parse_book() does
  not accept that keyword after this change.
- aux() now returns on an empty list expression instead of falling
  through to s[0].

 djvu_utils.py        | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
 parsedjvutext.py     | 69 ----------------------------------------------------
 web/djvu_utils.py    |  1 +
 web/parsedjvutext.py |  1 -
 4 files changed, 68 insertions(+), 70 deletions(-)
 create mode 100644 djvu_utils.py
 delete mode 100644 parsedjvutext.py
 create mode 120000 web/djvu_utils.py
 delete mode 120000 web/parsedjvutext.py

diff --git a/djvu_utils.py b/djvu_utils.py
new file mode 100644
index 0000000..21692a1
--- /dev/null
+++ b/djvu_utils.py
@@ -0,0 +1,67 @@
+import sys
+from bs4 import BeautifulSoup
+import djvu
+from djvu.decode import Context
+from itertools import chain
+import collections
+from PIL import Image
+
+def parse_page(page):
+    s = page.text.sexpr
+
+    def aux(s):
+        if type(s) is djvu.sexpr.ListExpression:
+            if len(s) == 0:
+                return
+            if str(s[0].value) == "word":
+                coords = [s[i].value for i in xrange(1, 5)]
+                word = s[5].value
+                yield (word.decode("utf-8"), coords)
+            else:
+                for c in chain.from_iterable(aux(child) for child in s[5:]):
+                    yield c
+        else:
+            pass
+    return aux(s) if s else None
+
+def convert_to_htmlcoord(coords, page_size):
+    return [",".join(map(str, [c[0], page_size - c[3],
+                               c[2], page_size - c[1]])) for c in coords]
+
+def get_document(djvufile):
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvufile))
+    document.decoding_job.wait()
+    return document
+
+def parse_book(djvubook, page=None):
+    """
+    returns the list of words and coordinates from a djvu book.
+    if page is None, returns the whole book.
+    """
+    document = get_document(djvubook)
+
+    if type(page) is int:
+        toparse = [document.pages[page - 1]]
+    elif isinstance(page, collections.Iterable):
+        toparse = [document.pages[p - 1] for p in page]
+    else:
+        toparse = document.pages
+
+    return [parse_page(page) for page in toparse]
+
+def image_from_book(djvubook, page):
+    document = get_document(djvubook)
+    mode = djvu.decode.RENDER_COLOR
+    djvu_pixel_format = djvu.decode.PixelFormatRgb()
+    page = document.pages[page-1]
+    page_job = page.decode(wait=True)
+    width, height = page_job.size
+    rect = (0, 0, width, height)
+    buf = page_job.render(mode, rect, rect, djvu_pixel_format)
+    return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1)
+
+if __name__ == "__main__":
+    book = parse_book(sys.argv[1], page=[10,11])
+    im = image_from_book(sys.argv[1], 11)
+    im.save("test.jpeg")
diff --git a/parsedjvutext.py b/parsedjvutext.py
deleted file mode 100644
index 6bd9950..0000000
--- a/parsedjvutext.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import sys
-from bs4 import BeautifulSoup
-import djvu
-from djvu.decode import Context
-from itertools import chain
-import collections
-from PIL import Image
-
-def parse_page(page, html=False):
-    s, page_size = page.text.sexpr, page.size[1]
-
-    def aux(s, html):
-        if type(s) is djvu.sexpr.ListExpression:
-            if len(s) == 0:
-                pass
-            if str(s[0].value) == "word":
-                if html:
-                    coords = (s[1].value, page_size - s[4].value,
-                              s[3].value, page_size - s[2].value)
-                    coords = ",".join(map(str,coords))
-                else:
-                    coords = [s[i].value for i in xrange(1, 5)]
-                word = s[5].value
-                yield (word.decode("utf-8"), coords)
-            else:
-                for c in chain.from_iterable(aux(child, html) for child in s[5:]):
-                    yield c
-        else:
-            pass
-    return aux(s, html) if s else None
-
-def get_document(djvufile):
-    c = Context()
-    document = c.new_document(djvu.decode.FileURI(djvufile))
-    document.decoding_job.wait()
-    return document
-
-def parse_book(djvubook, page=None, html=False):
-    """
-    returns the list of words and coordinates from a djvu book.
-    if page is None, returns the whole book.
-    if html is True, coordinates are computed from the bottom of the page
-    """
-    document = get_document(djvubook)
-
-    if type(page) is int:
-        toparse = [document.pages[page - 1]]
-    elif isinstance(page, collections.Iterable):
-        toparse = [document.pages[p - 1] for p in page]
-    else:
-        toparse = document.pages
-
-    return [parse_page(page, html=html) for page in toparse]
-
-def image_from_book(djvubook, page):
-    document = get_document(djvubook)
-    mode = djvu.decode.RENDER_COLOR
-    djvu_pixel_format = djvu.decode.PixelFormatRgb()
-    page = document.pages[page-1]
-    page_job = page.decode(wait=True)
-    width, height = page_job.size
-    rect = (0, 0, width, height)
-    buf = page_job.render(mode, rect, rect, djvu_pixel_format)
-    return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1)
-
-if __name__ == "__main__":
-    book = parse_book(sys.argv[1], page=[10,11], html=True)
-    im = image_from_book(sys.argv[1], 11)
-    im.save("test.webp")
diff --git a/web/djvu_utils.py b/web/djvu_utils.py
new file mode 120000
index 0000000..0742170
--- /dev/null
+++ b/web/djvu_utils.py
@@ -0,0 +1 @@
+../djvu_utils.py
\ No newline at end of file
diff --git a/web/parsedjvutext.py b/web/parsedjvutext.py
deleted file mode 120000
index e07ccf8..0000000
--- a/web/parsedjvutext.py
+++ /dev/null
@@ -1 +0,0 @@
-../parsedjvutext.py
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2