aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2014-09-07 15:55:27 -0400
committerThibaut Horel <thibaut.horel@gmail.com>2014-09-07 15:55:27 -0400
commitd28394833d54a68f5ca13d2edaa261128f6c5170 (patch)
treeab00b100b99066b80979613b06fca2c9b7087701
parentdfcd65c8f10aa94f19fe40940565681ab9a73e44 (diff)
downloadocr-layer-curation-d28394833d54a68f5ca13d2edaa261128f6c5170.tar.gz
Compute alignment on the final (html formatted) text
-rw-r--r--web/static/css/style.css18
-rw-r--r--web/templates/index.html5
-rw-r--r--web/templates/layout.html2
-rw-r--r--web/utils.py15
-rw-r--r--wikisource.py57
5 files changed, 81 insertions, 16 deletions
diff --git a/web/static/css/style.css b/web/static/css/style.css
index e42975d..ec73fd0 100644
--- a/web/static/css/style.css
+++ b/web/static/css/style.css
@@ -10,3 +10,21 @@ span:hover {
float: left;
margin-right: 1em;
}
+
+#text {
+ width: 460px;
+ float: left;
+}
+
+.pagetext p {
+ text-align: justify;
+ -moz-hyphens: auto;
+ margin: 0;
+ text-indent: 1.5em;
+}
+
+.pagetext {
+ padding: 3.5em 2em;
+ font-size: 18px;
+ line-height: 180%;
+}
diff --git a/web/templates/index.html b/web/templates/index.html
index b3b5fe0..dc286aa 100644
--- a/web/templates/index.html
+++ b/web/templates/index.html
@@ -5,10 +5,9 @@
<img id="page" src="{{page_number}}.jpg" usemap="#wordmap" />
</div>
<map name="wordmap">{% for id, coords in areas %}
- <area href="#" shape="rect" coords="{{coords}}" data-id="{{id}}" />{% end %}
+ <area href="#" shape="rect" coords="{{coords}}" data-id="word-{{id}}" />{% end %}
</map>
<div id="text">
- {% for id, word in words %}
- <span id="{{id}}">{{word}}</span> {% end %}
+ {% raw words %}
</div>
{% end %}
diff --git a/web/templates/layout.html b/web/templates/layout.html
index ff4077d..2fef4d3 100644
--- a/web/templates/layout.html
+++ b/web/templates/layout.html
@@ -1,5 +1,5 @@
<!doctype html>
-<html lang="en">
+<html lang="fr">
<head>
<meta charset="utf-8">
<script src="static/js/jquery.js"></script>
diff --git a/web/utils.py b/web/utils.py
index 8522841..1947f8b 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -1,21 +1,20 @@
import djvu_utils as du
import sys
import string_utils as su
-from wikisource import get_page
+from wikisource import get_page2
+
def gen_html(book, page_number):
doc = du.get_document("../" + book)
- page = doc.pages[int(page_number)-1]
+ page = doc.pages[int(page_number) - 1]
d = du.parse_page(page)
- corrected_text = get_page(book, int(page_number))
- corrected_words = su.simplify(corrected_text).split()
+ elem, corrected_text = get_page2(open("test.txt").read())
if d:
words, coords = zip(*d)
- C = su.align(corrected_words, list(words), list(coords))
- r = su.alignment_to_sexp(corrected_text.split(), words, coords, C[1])
- corrected_words, coords = zip(*r)
+ C = su.align(corrected_text.split(), list(words), list(coords))
+ coords = [coords[e[0]] for e in C[1]]
coords_html = du.convert_to_htmlcoord(coords, page.size[1])
- return (list(enumerate(coords_html)), list(enumerate(corrected_words)))
+ return (list(enumerate(coords_html)), str(elem))
if __name__ == "__main__":
gen_html(*sys.argv[1:3])
diff --git a/wikisource.py b/wikisource.py
index 1459468..af72d34 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -1,12 +1,53 @@
# -*- coding: utf-8 -*-
import requests
import sys
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
from itertools import takewhile, count
+from types import SliceType
+from string_utils import align
URL = "http://fr.wikisource.org/w/index.php"
def spanify(string, start=0):
    """Wrap every word of a text node in its own ``<span id="word-N">``.

    ``string`` must support ``split()``, ``insert_before()`` and
    ``replace_with()`` (presumably a BeautifulSoup ``NavigableString`` that
    is attached to a tree -- confirm against callers).  For each
    whitespace-separated word a ``<span>`` holding that word, followed by a
    single space, is inserted before the node; the node itself is then
    replaced by an empty string.

    Words are numbered consecutively from ``start``.

    Returns the id number to use for the next word, i.e. ``start`` plus the
    number of words found in ``string``.
    """
    words = string.split()
    if not words:
        # Nothing to wrap: leave the node as it is and do not consume any
        # id.  (Previously this path crashed because the loop variable was
        # never bound.)
        return start

    soup = BeautifulSoup()  # only used as a factory for the new tags
    for offset, word in enumerate(words):
        span = soup.new_tag("span", id="word-" + str(start + offset))
        span.string = word
        string.insert_before(span)
        string.insert_before(" ")
    # The words now live in the spans inserted above; blank the original
    # node so the text is not duplicated.
    string.replace_with("")
    return start + len(words)
+
+
class HtmlText(object):
    """Sequence-like view over the words of an HTML element.

    On construction every non-blank text node found under ``elem`` is passed
    to ``spanify``, which wraps each word in ``<span id="word-N">`` with
    ``N`` counting up from 0 across the whole element.  Afterwards:

    * ``len(obj)`` is the total number of words,
    * ``obj[i]`` is the text of the span with id ``word-i`` (negative
      indices count from the end),
    * ``obj[a:b:c]`` is a list of the corresponding words,
    * ``str(obj)`` is ``str(elem)``, i.e. the element including the spans.
    """

    def __init__(self, elem):
        self.elem = elem
        # Materialise the text nodes first: spanify() modifies the tree, so
        # it must not run while ``elem.strings`` is still being iterated.
        strings = [string for string in self.elem.strings if string.strip()]

        start = 0
        for string in strings:
            start = spanify(string, start)
        self.length = start

    def __len__(self):
        return self.length

    def __getitem__(self, key):
        """Return the word at index ``key``, or a list of words for a slice.

        Raises ``IndexError`` when an integer index is out of range.
        """
        if isinstance(key, slice):
            return [self[w] for w in range(*key.indices(self.length))]
        if key < 0:
            # Standard sequence semantics: -1 is the last word.
            key += self.length
        if key < 0 or key >= self.length:
            raise IndexError("HtmlText index out of range")
        return self.elem.find("span", {"id": "word-" + str(key)}).text

    def __str__(self):
        return str(self.elem)
+
+
def get_page(title, page):
params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
r = requests.get(URL, params=params)
@@ -17,6 +58,12 @@ def get_page(title, page):
return None
def get_page2(text):
    """Parse ``text`` as HTML and wrap its first ``div.pagetext`` element.

    Returns a ``(HtmlText, text)`` pair: the ``HtmlText`` built from that
    element, and the element's ``.text`` read *after* ``HtmlText`` has been
    constructed.

    Raises ``IndexError`` when the document has no ``div.pagetext`` element.
    """
    page_div = BeautifulSoup(text, "lxml").select("div.pagetext")[0]
    # Build the wrapper before reading ``.text`` so the returned string is
    # taken from the element in the state HtmlText leaves it in.
    wrapped = HtmlText(page_div)
    return wrapped, page_div.text
+
+
def get_pages(title, begin=1, end=None):
if end:
return (get_page(title, i) for i in xrange(begin, end + 1))
@@ -26,6 +73,8 @@ def get_pages(title, begin=1, end=None):
if __name__ == "__main__":
- title = sys.argv[1]
- for page in get_pages(title):
- print page
+ b = BeautifulSoup("<a>asd</a>")
+ c = HtmlText(b)
+ print type(c[0])
+ print align(c, [u"asd"], None)
+ print c[0:1]