aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGuillaume Horel <guillaume.horel@gmail.com>2014-03-01 15:26:45 -0500
committerGuillaume Horel <guillaume.horel@gmail.com>2014-03-01 15:30:27 -0500
commitbf74cd2294598c3dc1d73edd74ca88c87b7d6cd6 (patch)
treefe74e894aabdc5ca6a8ee08fa3a0c1f7d0cb4094
parentc5734b6b776727959f1b485651f1ddc7c8121a85 (diff)
downloadocr-layer-curation-bf74cd2294598c3dc1d73edd74ca88c87b7d6cd6.tar.gz
Preliminary support for corrected text
* It's slow; we may need to figure out how to load it in the background * The bounding boxes could be improved
-rw-r--r--web/main.py4
-rw-r--r--web/settings.py3
-rw-r--r--web/utils.py21
3 files changed, 19 insertions, 9 deletions
diff --git a/web/main.py b/web/main.py
index 582d33c..206eb86 100644
--- a/web/main.py
+++ b/web/main.py
@@ -3,7 +3,7 @@ from tornado.web import RequestHandler, Application
import tornado.ioloop
from settings import settings
import utils
-from parsedjvutext import image_from_book
+from djvu_utils import image_from_book
import io
class MainHandler(RequestHandler):
@@ -16,7 +16,7 @@ class MainHandler(RequestHandler):
class ImageHandler(RequestHandler):
def get(self, page_number):
- im = image_from_book(self.settings["book"], int(page_number))
+ im = image_from_book("../" + self.settings["book"], int(page_number))
self.set_header('Content-Type', 'image/jpg')
img_buff = io.BytesIO()
im.save(img_buff, format="JPEG")
diff --git a/web/settings.py b/web/settings.py
index 98c490b..5a8c9aa 100644
--- a/web/settings.py
+++ b/web/settings.py
@@ -4,5 +4,6 @@ settings = {
"static_path": "static",
"cookie_secret": "toto",
"login_url": "/login",
- "book": "../Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu"
+ #"book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu"
+ "book": "Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu"
}
diff --git a/web/utils.py b/web/utils.py
index bb9a4fe..8522841 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -1,12 +1,21 @@
-from parsedjvutext import parse_book
+import djvu_utils as du
import sys
-
+import string_utils as su
+from wikisource import get_page
def gen_html(book, page_number):
- d = parse_book(book, page=int(page_number), html=True)
- if d[0]:
- words, coords = zip(*d[0])
- return (list(enumerate(coords)), list(enumerate(words)))
+ doc = du.get_document("../" + book)
+ page = doc.pages[int(page_number)-1]
+ d = du.parse_page(page)
+ corrected_text = get_page(book, int(page_number))
+ corrected_words = su.simplify(corrected_text).split()
+ if d:
+ words, coords = zip(*d)
+ C = su.align(corrected_words, list(words), list(coords))
+ r = su.alignment_to_sexp(corrected_text.split(), words, coords, C[1])
+ corrected_words, coords = zip(*r)
+ coords_html = du.convert_to_htmlcoord(coords, page.size[1])
+ return (list(enumerate(coords_html)), list(enumerate(corrected_words)))
if __name__ == "__main__":
gen_html(*sys.argv[1:3])