about summary refs log tree commit diff stats
path: root/web/utils.py
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2014-09-07 15:55:27 -0400
committer Thibaut Horel <thibaut.horel@gmail.com> 2014-09-07 15:55:27 -0400
commit d28394833d54a68f5ca13d2edaa261128f6c5170 (patch)
tree ab00b100b99066b80979613b06fca2c9b7087701 /web/utils.py
parent dfcd65c8f10aa94f19fe40940565681ab9a73e44 (diff)
download ocr-layer-curation-d28394833d54a68f5ca13d2edaa261128f6c5170.tar.gz
Compute alignment on the final (html formatted) text
Diffstat (limited to 'web/utils.py')
-rw-r--r-- web/utils.py 15
1 files changed, 7 insertions, 8 deletions
diff --git a/web/utils.py b/web/utils.py
index 8522841..1947f8b 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -1,21 +1,20 @@
import djvu_utils as du
import sys
import string_utils as su
-from wikisource import get_page
+from wikisource import get_page2
+
def gen_html(book, page_number):
doc = du.get_document("../" + book)
- page = doc.pages[int(page_number)-1]
+ page = doc.pages[int(page_number) - 1]
d = du.parse_page(page)
- corrected_text = get_page(book, int(page_number))
- corrected_words = su.simplify(corrected_text).split()
+ elem, corrected_text = get_page2(open("test.txt").read())
if d:
words, coords = zip(*d)
- C = su.align(corrected_words, list(words), list(coords))
- r = su.alignment_to_sexp(corrected_text.split(), words, coords, C[1])
- corrected_words, coords = zip(*r)
+ C = su.align(corrected_text.split(), list(words), list(coords))
+ coords = [coords[e[0]] for e in C[1]]
coords_html = du.convert_to_htmlcoord(coords, page.size[1])
- return (list(enumerate(coords_html)), list(enumerate(corrected_words)))
+ return (list(enumerate(coords_html)), str(elem))
if __name__ == "__main__":
gen_html(*sys.argv[1:3])