Last simplification

author: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-27 12:21:45 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-27 12:21:45 -0500
commit: 0ceb87ba43bc52a7f73b62ce0795b7e78f57ea04 (patch)
tree: 143647fc134facec83f4cb65fdf8387213632d1c /parsedjvutext.py
parent: 6c39c85b41d31dec1054833e80b566127103d889 (diff)
download: ocr-layer-curation-0ceb87ba43bc52a7f73b62ce0795b7e78f57ea04.tar.gz
1 file changed, 21 insertions, 24 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 2c7c46b..44340cc 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -33,23 +33,26 @@ def parse_page_xml(djvubook, pagenumber):
     return {"words": words, "coords": coords}
 
 
-def parse_page_sexp(s, page_size=None):
-    if type(s) is djvu.sexpr.ListExpression:
-        if len(s) == 0:
-            pass
-        if str(s[0].value) == "word":
-            coords = [s[i].value for i in xrange(1, 5)]
-            if page_size:
-                coords[1] = page_size - coords[1]
-                coords[3] = page_size - coords[3]
-            word = s[5].value
-            yield (word, coords)
+def parse_page(page, html=False):
+    s, page_size = page.text.sexpr, page.size[1]
+
+    def aux(s):
+        if type(s) is djvu.sexpr.ListExpression:
+            if len(s) == 0:
+                pass
+            if str(s[0].value) == "word":
+                coords = [s[i].value for i in xrange(1, 5)]
+                if html:
+                    coords[1] = page_size - coords[1]
+                    coords[3] = page_size - coords[3]
+                word = s[5].value
+                yield (word, coords)
+            else:
+                for c in chain.from_iterable(aux(child) for child in s[5:]):
+                    yield c
         else:
-            for c in chain.from_iterable(parse_page_sexp(child, page_size)
-                                         for child in s[5:]):
-                yield c
-    else:
-        pass
+            pass
+    return aux(s)
 
 
 def parse_book(djvubook, page=None, html=False):
@@ -66,14 +69,8 @@ def parse_book(djvubook, page=None, html=False):
     else:
         toparse = document.pages
 
-    def gen_pages():
-        for i, page in enumerate(toparse):
-            if page.text.sexpr:
-                page_size = page.size[1] if html else None
-                gen = parse_page_sexp(page.text.sexpr, page_size)
-                yield zip(*gen)
-
-    return list(gen_pages())
+    return list(zip(*parse_page(page, html=html)) for page in toparse
+                if page.text.sexpr)
 
 if __name__ == "__main__":
     book = parse_book(sys.argv[1])
author	Thibaut Horel <thibaut.horel@gmail.com>	2014-02-27 12:21:45 -0500
committer	Thibaut Horel <thibaut.horel@gmail.com>	2014-02-27 12:21:45 -0500
commit	0ceb87ba43bc52a7f73b62ce0795b7e78f57ea04 (patch)
tree	143647fc134facec83f4cb65fdf8387213632d1c /parsedjvutext.py
parent	6c39c85b41d31dec1054833e80b566127103d889 (diff)
download	ocr-layer-curation-0ceb87ba43bc52a7f73b62ce0795b7e78f57ea04.tar.gz