using the djvu library for parsing djvu documents

author: Guillaume Horel <guillaume.horel@gmail.com> 2014-02-25 09:23:51 -0500
committer: Guillaume Horel <guillaume.horel@gmail.com> 2014-02-25 09:23:51 -0500
commit: c2e34cd284f5fd043c26f110fe5e85411ab9051e (patch)
tree: d67f0797a8dfcc87e96e98ec5a1046115ccc8bb2 /parsedjvutext.py
parent: c40ad3ecef221e3f3d6a8633687c896603d493a0 (diff)
download: ocr-layer-curation-c2e34cd284f5fd043c26f110fe5e85411ab9051e.tar.gz
1 files changed, 35 insertions, 18 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 773a1d4..696601d 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -1,6 +1,9 @@
 import sys
 from bs4 import BeautifulSoup
 import subprocess
+import djvu
+from djvu.decode import Context
+from itertools import chain
 
 def parse_book_xml(djvubook):
     args = ["djvutoxml", djvubook]
@@ -43,25 +46,39 @@ def parse_page_sexp(djvubook, pagenumber):
             if "word" in line]
     return {"words": [a for a, b in page], "coords": [b for a, b in page]}
 
-def parse_book_sexp(djvubook):
+def parse_sexp(s):
+    if type(s) is djvu.sexpr.ListExpression:
+        if len(s) == 0:
+            return []
+        if str(s[0].value) == "word":
+            coords = [s[i].value for i in xrange(1, 5)]
+            word = s[5].value
+            return [(word, coords)]
+        else:
+            gen = chain.from_iterable(parse_sexp(child) for child in s[5:])
+            return list(gen)
+    else:
+        return []
+
+def parse_book_sexp(djvubook, page=None, html=False):
+    """
+    Returns the words and their coordinates for each parsed page of a djvu book.
+    If page is None, the whole book is parsed; otherwise only that 1-based page.
+    html (coordinates computed from the bottom of the page) is not used yet.
+    """
     book = {"words": [], "coords": []}
-    page_coords = []
-    page_words = []
-    firstpage = True
-    args = ["djvused", "-e", "print-txt", djvubook]
-    for line in subprocess.check_output(args).split("\n"):
-        if "page" in line:
-            if firstpage:
-                firstpage = False
-            else:
-                book["words"].append(page_words)
-                book["coords"].append(page_coords)
-                page_coords = []
-                page_words = []
-        if "word" in line:
-            word, coords = parse_wordline(line)
-            page_words.append(word)
-            page_coords.append(coords)
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvubook))
+    document.decoding_job.wait()
+    if page:
+        toparse = [document.pages[page-1]]
+    else:
+        toparse = document.pages
+    for page in toparse:
+        gen = parse_sexp(page.text.sexpr)
+        word_coords = zip(*gen)
+        book["words"].append(word_coords[0])
+        book["coords"].append(word_coords[1])
     return book
 
 if __name__=="__main__":
author	Guillaume Horel <guillaume.horel@gmail.com>	2014-02-25 09:23:51 -0500
committer	Guillaume Horel <guillaume.horel@gmail.com>	2014-02-25 09:23:51 -0500
commit	c2e34cd284f5fd043c26f110fe5e85411ab9051e (patch)
tree	d67f0797a8dfcc87e96e98ec5a1046115ccc8bb2 /parsedjvutext.py
parent	c40ad3ecef221e3f3d6a8633687c896603d493a0 (diff)
download	ocr-layer-curation-c2e34cd284f5fd043c26f110fe5e85411ab9051e.tar.gz