add function to return image from a book

author: Guillaume Horel <guillaume.horel@gmail.com> 2014-02-27 23:40:21 -0500
committer: Guillaume Horel <guillaume.horel@gmail.com> 2014-02-27 23:40:21 -0500
commit: 5b47c960ff350556fa4bc02e62ad1d2e62695765 (patch)
tree: 7dadbb34a686d0f82267b732d452e9ea4dae9fa3 /parsedjvutext.py
parent: 3e343111fc525f093f8bd98707000eb44f08a97f (diff)
download: ocr-layer-curation-5b47c960ff350556fa4bc02e62ad1d2e62695765.tar.gz
1 files changed, 23 insertions, 6 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 301b449..3183d7e 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -4,6 +4,7 @@ import djvu
 from djvu.decode import Context
 from itertools import chain
 import collections
+from PIL import Image
 
 def parse_page(page, html=False):
     s, page_size = page.text.sexpr, page.size[1]
@@ -26,8 +27,13 @@ def parse_page(page, html=False):
                     yield c
         else:
             pass
-    return aux(s, html)
+    return aux(s, html) if s else None
 
+def get_document(djvufile):
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvufile))
+    document.decoding_job.wait()
+    return document
 
 def parse_book(djvubook, page=None, html=False):
     """
@@ -35,9 +41,8 @@ def parse_book(djvubook, page=None, html=False):
     if page is None, returns the whole book.
     if html is True, coordinates are computed from the bottom of the page
     """
-    c = Context()
-    document = c.new_document(djvu.decode.FileURI(djvubook))
-    document.decoding_job.wait()
+    document = get_document(djvubook)
+
     if type(page) is int:
         toparse = [document.pages[page - 1]]
     elif isinstance(page, collections.Iterable):
@@ -45,8 +50,20 @@ def parse_book(djvubook, page=None, html=False):
     else:
         toparse = document.pages
 
-    return list(zip(*parse_page(page, html=html)) for page in toparse
-                if page.text.sexpr)
+    return [parse_page(page, html=html) for page in toparse]
+
+def image_from_book(djvubook, page):
+    document = get_document(djvubook)
+    mode = djvu.decode.RENDER_COLOR
+    djvu_pixel_format = djvu.decode.PixelFormatRgb()
+    page = document.pages[page-1]
+    page_job = page.decode(wait=True)
+    width, height = page_job.size
+    rect = (0, 0, width, height)
+    buf = page_job.render(mode, rect, rect, djvu_pixel_format)
+    return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, 1)
 
 if __name__ == "__main__":
     book = parse_book(sys.argv[1], page=[10,11], html=True)
+    im = image_from_book(sys.argv[1], 11)
+    im.save("test.webp")
author	Guillaume Horel <guillaume.horel@gmail.com>	2014-02-27 23:40:21 -0500
committer	Guillaume Horel <guillaume.horel@gmail.com>	2014-02-27 23:40:21 -0500
commit	5b47c960ff350556fa4bc02e62ad1d2e62695765 (patch)
tree	7dadbb34a686d0f82267b732d452e9ea4dae9fa3 /parsedjvutext.py
parent	3e343111fc525f093f8bd98707000eb44f08a97f (diff)
download	ocr-layer-curation-5b47c960ff350556fa4bc02e62ad1d2e62695765.tar.gz