Merge branch 'master' of horel.org:thibaut/ocr-layer-curation

cessary,
author: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-27 11:27:51 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-27 11:27:51 -0500
commit: b8013d90c16cf4e1225205fc309f24f7c06ea6c5 (patch)
tree: 9b8ca439ddc2897b1375362c73037b35be9dd167
parent: 473ef7e26fc8d2c6b26e66b80d50e49c18fa24f8 (diff)
parent: b620744bbc7554c8e7a320019a8601f7ca2a5188 (diff)
download: ocr-layer-curation-b8013d90c16cf4e1225205fc309f24f7c06ea6c5.tar.gz
1 file changed, 45 insertions, 36 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 773a1d4..06cecb9 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -1,6 +1,9 @@
 import sys
 from bs4 import BeautifulSoup
 import subprocess
+import djvu
+from djvu.decode import Context
+from itertools import chain
 
 def parse_book_xml(djvubook):
     args = ["djvutoxml", djvubook]
@@ -26,43 +29,49 @@ def parse_page_xml(djvubook, pagenumber):
                 for word in all_words]
     return {"words": words, "coords": coords}
 
-def parse_wordline(line):
-    line = line.lstrip(" (").rstrip(")").split(" ")
-    word = line[5]
-    word = word[1:-1].decode("string_escape").decode("utf-8")
-    coords = map(int, line[1:5])
-    return word, coords
+def parse_page_sexp(s, page_size=None):
+    if type(s) is djvu.sexpr.ListExpression:
+        if len(s) == 0:
+            pass
+        if str(s[0].value) == "word":
+            coords = [s[i].value for i in xrange(1, 5)]
+            if page_size:
+                coords[1]=page_size-coords[1]
+                coords[3]=page_size-coords[3]
+            word = s[5].value
+            yield (word, coords)
+        else:
+            for c in chain.from_iterable(parse_page_sexp(child, page_size) for child in s[5:]):
+                yield c
+    else:
+        pass
 
-def page_sexp(djvubook, pagenumber):
-    args = ["djvused", "-e", "select {0};print-txt".format(pagenumber),
-            djvubook]
-    return subprocess.check_output(args).split("\n")
+def parse_book(djvubook, page=None, html=False):
+    """
+    returns the list of words and coordinates from a djvu book.
+    if page is None, returns the whole book.
+    if html is True, coordinates are computed from the bottom of the page
+    """
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvubook))
+    document.decoding_job.wait()
+    if page:
+        toparse = [document.pages[page-1]]
+    else:
+        toparse = document.pages
+    words = [[]] * len(toparse)
+    coords = [[]] * len(toparse)
+    page_size = None
+    for i, page in enumerate(toparse):
+        if page.text.sexpr:
+            if html:
+                page_size= p.size[1]
+            gen = parse_page_sexp(page.text.sexpr, page_size)
+            word_coords = zip(*gen)
+            words[i] = word_coords[0]
+            coords[i] = word_coords[1]
 
-def parse_page_sexp(djvubook, pagenumber):
-    page = [parse_wordline(line) for line in page_sexp(djvubook, pagenumber) \
-            if "word" in line]
-    return {"words": [a for a, b in page], "coords": [b for a, b in page]}
-
-def parse_book_sexp(djvubook):
-    book = {"words": [], "coords": []}
-    page_coords = []
-    page_words = []
-    firstpage = True
-    args = ["djvused", "-e", "print-txt", djvubook]
-    for line in subprocess.check_output(args).split("\n"):
-        if "page" in line:
-            if firstpage:
-                firstpage = False
-            else:
-                book["words"].append(page_words)
-                book["coords"].append(page_coords)
-                page_coords = []
-                page_words = []
-        if "word" in line:
-            word, coords = parse_wordline(line)
-            page_words.append(word)
-            page_coords.append(coords)
-    return book
+    return {"words": words, "coords": coords}
 
 if __name__=="__main__":
-    book_sexp = parse_book_sexp(sys.argv[1])
+    book = parse_book(sys.argv[1])
author	Thibaut Horel <thibaut.horel@gmail.com>	2014-02-27 11:27:51 -0500
committer	Thibaut Horel <thibaut.horel@gmail.com>	2014-02-27 11:27:51 -0500
commit	b8013d90c16cf4e1225205fc309f24f7c06ea6c5 (patch)
tree	9b8ca439ddc2897b1375362c73037b35be9dd167
parent	473ef7e26fc8d2c6b26e66b80d50e49c18fa24f8 (diff)
parent	b620744bbc7554c8e7a320019a8601f7ca2a5188 (diff)
download	ocr-layer-curation-b8013d90c16cf4e1225205fc309f24f7c06ea6c5.tar.gz