diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:21:37 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:24:08 -0400 |
| commit | 0e8b0c88a4d3009cbbea695f606e49faef27f373 (patch) | |
| tree | 85a14a7aef3ee36e73544382c6fdec8aa6bd375c | |
| parent | 74604d7b8ae98b125f1c800da753f8ab67474eb5 (diff) | |
| download | ocr-layer-curation-0e8b0c88a4d3009cbbea695f606e49faef27f373.tar.gz | |
Reorganize the code
I hope I did it right. We now have two packages: one for the server
and one for the actual library.
| -rw-r--r-- | utils/__init__.py | 0 | ||||
| -rw-r--r-- | utils/djvu_utils.py (renamed from djvu_utils.py) | 0 | ||||
| -rw-r--r-- | utils/string_utils.py (renamed from string_utils.py) | 0 | ||||
| -rw-r--r-- | utils/wikisource.py (renamed from wikisource.py) | 17 | ||||
| -rw-r--r-- | web/__init__.py | 2 | ||||
| l--------- | web/djvu_utils.py | 1 | ||||
| -rw-r--r-- | web/server.py (renamed from web/main.py) | 18 | ||||
| -rw-r--r-- | web/settings.py | 8 | ||||
| -rw-r--r-- | web/utils.py | 20 |
9 files changed, 29 insertions, 37 deletions
diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/utils/__init__.py diff --git a/djvu_utils.py b/utils/djvu_utils.py index 21692a1..21692a1 100644 --- a/djvu_utils.py +++ b/utils/djvu_utils.py diff --git a/string_utils.py b/utils/string_utils.py index b6c8ce0..b6c8ce0 100644 --- a/string_utils.py +++ b/utils/string_utils.py diff --git a/wikisource.py b/utils/wikisource.py index af72d34..589c88e 100644 --- a/wikisource.py +++ b/utils/wikisource.py @@ -4,7 +4,8 @@ import sys from bs4 import BeautifulSoup, NavigableString from itertools import takewhile, count from types import SliceType -from string_utils import align +import string_utils as su +import djvu_utils as du URL = "http://fr.wikisource.org/w/index.php" @@ -71,10 +72,22 @@ def get_pages(title, begin=1, end=None): return takewhile(lambda x: x is not None, (get_page(title, i) for i in count(begin))) +def gen_html(book, page_number): + doc = du.get_document(book) + page = doc.pages[int(page_number)-1] + d = du.parse_page(page) + corrected_text = get_page(book, int(page_number)) + corrected_words = su.simplify(corrected_text).split() + if d: + orig_words, orig_coords = zip(*d) + C = su.align(corrected_words, list(orig_words), list(orig_coords)) + corr_words = corrected_text.split() + orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1]) + return orig_coords_html, orig_words, corr_words, C[1] if __name__ == "__main__": b = BeautifulSoup("<a>asd</a>") c = HtmlText(b) print type(c[0]) - print align(c, [u"asd"], None) + print su.align(c, [u"asd"], None) print c[0:1] diff --git a/web/__init__.py b/web/__init__.py new file mode 100644 index 0000000..139597f --- /dev/null +++ b/web/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/web/djvu_utils.py b/web/djvu_utils.py deleted file mode 120000 index 0742170..0000000 --- a/web/djvu_utils.py +++ /dev/null @@ -1 +0,0 @@ -../djvu_utils.py
\ No newline at end of file diff --git a/web/main.py b/web/server.py index a6826c1..1e67ad4 100644 --- a/web/main.py +++ b/web/server.py @@ -2,22 +2,22 @@ import tornado.httpserver from tornado.web import RequestHandler, Application import tornado.ioloop from settings import settings -import utils -from djvu_utils import image_from_book +from utils.djvu_utils import image_from_book +from utils.wikisource import gen_html import io class MainHandler(RequestHandler): def get(self, page_number): orig_coords, orig_words, corr_words, align = \ - utils.gen_html(self.settings["book"], page_number) + gen_html(self.settings["book"], page_number) self.render("index.html", page_number=page_number, orig_coords=orig_coords, orig_words=orig_words, corr_words=corr_words, align=align) class ImageHandler(RequestHandler): def get(self, page_number): - im = image_from_book("../" + self.settings["book"], int(page_number)) + im = image_from_book(self.settings["book"], int(page_number)) self.set_header('Content-Type', 'image/jpg') img_buff = io.BytesIO() im.save(img_buff, format="JPEG") @@ -25,12 +25,10 @@ class ImageHandler(RequestHandler): self.write(img_buff.read()) self.finish() -application = Application([ - (r'/(\d+)/?', MainHandler), - (r'/(\d+)\.jpg/?', ImageHandler)] - , **settings) - -if __name__ == '__main__': +def run(): + application = Application([ + (r'/(\d+)/?', MainHandler), + (r'/(\d+)\.jpg/?', ImageHandler)], **settings) http_server = tornado.httpserver.HTTPServer(application) http_server.listen(8888) print "Listening on 8888" diff --git a/web/settings.py b/web/settings.py index 5a8c9aa..32693b8 100644 --- a/web/settings.py +++ b/web/settings.py @@ -1,9 +1,9 @@ settings = { "debug": True, - "template_path": "templates", - "static_path": "static", + "template_path": "web/templates", + "static_path": "web/static", "cookie_secret": "toto", "login_url": "/login", - #"book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu" - "book": 
"Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu" + "book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu" + #"book": "Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu" } diff --git a/web/utils.py b/web/utils.py deleted file mode 100644 index 7e20858..0000000 --- a/web/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import djvu_utils as du -import sys -import string_utils as su -from wikisource import get_page - -def gen_html(book, page_number): - doc = du.get_document("../" + book) - page = doc.pages[int(page_number)-1] - d = du.parse_page(page) - corrected_text = get_page(book, int(page_number)) - corrected_words = su.simplify(corrected_text).split() - if d: - orig_words, orig_coords = zip(*d) - C = su.align(corrected_words, list(orig_words), list(orig_coords)) - corr_words = corrected_text.split() - orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1]) - return orig_coords_html, orig_words, corr_words, C[1] - -if __name__ == "__main__": - gen_html(*sys.argv[1:3]) |
