Adapting the web client code to the new behavior of parsedjvutext

author: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-27 11:46:30 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-27 11:46:30 -0500
commit: 38a7cef65e167a4004f37f6651777e99ab5ba6d3 (patch)
tree: 2a102ad1f25c465471cdec5e8f7ce89a681a2265
parent: 76054347cffacd7a4a6759f2187717e185d8082c (diff)
download: ocr-layer-curation-38a7cef65e167a4004f37f6651777e99ab5ba6d3.tar.gz
2 files changed, 19 insertions, 14 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 06cecb9..2ded9d2 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -5,6 +5,7 @@ import djvu
 from djvu.decode import Context
 from itertools import chain
 
+
 def parse_book_xml(djvubook):
     args = ["djvutoxml", djvubook]
     soup = BeautifulSoup(subprocess.check_output(args), "lxml")
@@ -12,23 +13,26 @@ def parse_book_xml(djvubook):
     coords = []
     for page in soup.find_all("hiddentext"):
         words.append([word.text for word in page.find_all("word")])
-        coords.append([tuple(map(int, word["coords"].split(","))) \
-                        for word in page.find_all("word")])
+        coords.append([tuple(map(int, word["coords"].split(",")))
+                       for word in page.find_all("word")])
     return {"words": words, "coords": coords}
 
+
 def get_npages(djvubook):
     args = ["djvused", "-e", "n", djvubook]
     return int(subprocess.check_output(args))
 
+
 def parse_page_xml(djvubook, pagenumber):
     args = ["djvutoxml", "--page", str(pagenumber), djvubook]
     soup = BeautifulSoup(subprocess.check_output(args), "lxml")
     all_words = soup.find_all("word")
     words = [word.text for word in all_words]
-    coords = [tuple(map(int, word["coords"].split(","))) \
-                for word in all_words]
+    coords = [tuple(map(int, word["coords"].split(",")))
+              for word in all_words]
     return {"words": words, "coords": coords}
 
+
 def parse_page_sexp(s, page_size=None):
     if type(s) is djvu.sexpr.ListExpression:
         if len(s) == 0:
@@ -36,16 +40,18 @@ def parse_page_sexp(s, page_size=None):
         if str(s[0].value) == "word":
             coords = [s[i].value for i in xrange(1, 5)]
             if page_size:
-                coords[1]=page_size-coords[1]
-                coords[3]=page_size-coords[3]
+                coords[1] = page_size - coords[1]
+                coords[3] = page_size - coords[3]
             word = s[5].value
             yield (word, coords)
         else:
-            for c in chain.from_iterable(parse_page_sexp(child, page_size) for child in s[5:]):
+            for c in chain.from_iterable(parse_page_sexp(child, page_size)
+                                         for child in s[5:]):
                 yield c
     else:
         pass
 
+
 def parse_book(djvubook, page=None, html=False):
     """
     returns the list of words and coordinates from a djvu book.
@@ -56,7 +62,7 @@ def parse_book(djvubook, page=None, html=False):
     document = c.new_document(djvu.decode.FileURI(djvubook))
     document.decoding_job.wait()
     if page:
-        toparse = [document.pages[page-1]]
+        toparse = [document.pages[page - 1]]
     else:
         toparse = document.pages
     words = [[]] * len(toparse)
@@ -65,7 +71,7 @@ def parse_book(djvubook, page=None, html=False):
     for i, page in enumerate(toparse):
         if page.text.sexpr:
             if html:
-                page_size= p.size[1]
+                page_size = int(page.size[1])
             gen = parse_page_sexp(page.text.sexpr, page_size)
             word_coords = zip(*gen)
             words[i] = word_coords[0]
@@ -73,5 +79,5 @@ def parse_book(djvubook, page=None, html=False):
 
     return {"words": words, "coords": coords}
 
-if __name__=="__main__":
+if __name__ == "__main__":
     book = parse_book(sys.argv[1])
diff --git a/web/utils.py b/web/utils.py
index 72d05dd..583cd1c 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -1,15 +1,14 @@
-from parsedjvutext import parse_page_sexp
+from parsedjvutext import parse_book
 import sys
 
 
 def gen_html(book, page_number):
     book = "../Villiers_de_L\'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu"
-    d = parse_page_sexp(book, page_number)
-    coords, words = d["coords"], d["words"]
+    d = parse_book(book, page=int(page_number), html=True)
+    coords, words = d["coords"][0], d["words"][0]
 
     def get_areas():
         for i, coord in enumerate(coords):
-            coord[1], coord[3] = 2764 - coord[3], 2764 - coord[1]
             coord_str = ",".join(map(str, coord))
             yield i, coord_str
author	Thibaut Horel <thibaut.horel@gmail.com>	2014-02-27 11:46:30 -0500
committer	Thibaut Horel <thibaut.horel@gmail.com>	2014-02-27 11:46:30 -0500
commit	38a7cef65e167a4004f37f6651777e99ab5ba6d3 (patch)
tree	2a102ad1f25c465471cdec5e8f7ce89a681a2265
parent	76054347cffacd7a4a6759f2187717e185d8082c (diff)
download	ocr-layer-curation-38a7cef65e167a4004f37f6651777e99ab5ba6d3.tar.gz