1 files changed, 37 insertions, 19 deletions
diff --git a/utils/wikisource.py b/utils/wikisource.py
index 283d5fe..536a82c 100644
--- a/utils/wikisource.py
+++ b/utils/wikisource.py
@@ -13,7 +13,8 @@ URL = "http://fr.wikisource.org/w/index.php"
 def spanify(string, start=0):
     soup = BeautifulSoup()
     for i, word in enumerate(string.split()):
-        span = soup.new_tag("span", id="word-" + str(start + i))
+        span = soup.new_tag("span")
+        span["data-id"]=start + i
         span.string = word
         string.insert_before(span)
         string.insert_before(" ")
@@ -21,33 +22,53 @@ def spanify(string, start=0):
     return start + i + 1
 
 
-class HtmlText():
-
+class HtmlText(object):
+    ## This class takes the corrected html from a wikisource page
+    ## and adds extra information to facilitate the mapping.
+    ## At initialization, it wraps each word into a <span>
+    ## with attribute data-id=i where i is the index of the corrected word.
+    ## Once we set align, it adds to each span an id attribute
+    ## of the form id="corr-x,y,z" where x, y, z are ids in
+    ## the image map.
     def __init__(self, elem):
-        self.elem = elem
+        self._elem = elem
         start = 0
-        strings = list(string for string in self.elem.strings
+        strings = list(string for string in self._elem.strings
                        if string.strip())
 
         for string in strings:
             start = spanify(string, start)
-        self.length = start
+        self._length = start
+        self._align = None
 
     def __len__(self):
-        return self.length
+        return self._length
 
     def __getitem__(self, key):
         if type(key) is SliceType:
-            return [unicode(self[w]) for w in range(*key.indices(self.length))]
-        if key >= self.length:
+            return [unicode(self[w]) for w in range(*key.indices(len(self)))]
+        if not -len(self) <= key < len(self):  # out of range on either side
             raise IndexError
         if key < 0:
-            key = self.length - key
-        return self.elem.find("span", {"id": "word-" + str(key)}).text
+            key += len(self)  # map a negative index onto 0..len-1
+        return self._elem.find("span", {"data-id": key}).text
+
+    def __unicode__(self):
+        return self._elem.text
 
     def __str__(self):
-        return str(self.elem)
+        return unicode(self).encode("utf-8")
+
+    @property
+    def align(self):
+        return self._align
 
+    @align.setter
+    def align(self, val):
+        self._align = val
+        for i in range(len(self)):
+            self._elem.find("span", {"data-id": i})['id']="corr-" + \
+            ",".join(map(str, val[i]))
 
 def get_page(title, page):
     params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
@@ -66,21 +87,18 @@ def get_pages(title, begin=1, end=None):
                          (get_page(title, i) for i in count(begin)))
 
 def gen_html(book, page_number):
-    doc = du.get_document(book)
-    page = doc.pages[int(page_number)-1]
-    d = du.parse_page(page)
+    d = du.parse_book(book, page_number)[0]
     corrected_text = get_page(book, int(page_number))
-    corrected_words = su.simplify(corrected_text.elem.text).split()
+    corrected_words = su.simplify(unicode(corrected_text)).split()
     if d:
         orig_words, orig_coords = zip(*d)
         C = su.align(corrected_words, list(orig_words), list(orig_coords))
-        corr_words = corrected_text
-        orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
-    return orig_coords_html, orig_words, corr_words, C[1]
+        corrected_text.align = C[1]
+    return orig_coords, orig_words, corrected_text
 
 if __name__ == "__main__":
     wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
-    test = get_page(wikibook, 28)
+    test = gen_html(wikibook, 28)
     # print type(c[0])
     # print su.align(c, [u"asd"], None)
     # print c[0:1]