path: root/wikisource.py
author      Thibaut Horel <thibaut.horel@gmail.com>    2014-09-07 15:55:27 -0400
committer   Thibaut Horel <thibaut.horel@gmail.com>    2014-09-07 15:55:27 -0400
commit      d28394833d54a68f5ca13d2edaa261128f6c5170 (patch)
tree        ab00b100b99066b80979613b06fca2c9b7087701 /wikisource.py
parent      dfcd65c8f10aa94f19fe40940565681ab9a73e44 (diff)
download    ocr-layer-curation-d28394833d54a68f5ca13d2edaa261128f6c5170.tar.gz
Compute alignment on the final (html formatted) text
Diffstat (limited to 'wikisource.py')
-rw-r--r--    wikisource.py    57
1 file changed, 53 insertions, 4 deletions
diff --git a/wikisource.py b/wikisource.py
index 1459468..af72d34 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -1,12 +1,53 @@
 # -*- coding: utf-8 -*-
 import requests
 import sys
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 from itertools import takewhile, count
+from types import SliceType
+from string_utils import align
 URL = "http://fr.wikisource.org/w/index.php"
+def spanify(string, start=0):
+    soup = BeautifulSoup()
+    for i, word in enumerate(string.split()):
+        span = soup.new_tag("span", id="word-" + str(start + i))
+        span.string = word
+        string.insert_before(span)
+        string.insert_before(" ")
+    string.replace_with("")
+    return start + i + 1
+
+
+class HtmlText():
+
+    def __init__(self, elem):
+        self.elem = elem
+        start = 0
+        strings = list(string for string in self.elem.strings
+                       if string.strip())
+
+        for string in strings:
+            start = spanify(string, start)
+        self.length = start
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, key):
+        if type(key) is SliceType:
+            return [self[w] for w in range(*key.indices(self.length))]
+        if key >= self.length:
+            raise IndexError
+        if key < 0:
+            key = self.length - key
+        return self.elem.find("span", {"id": "word-" + str(key)}).text
+
+    def __str__(self):
+        return str(self.elem)
+
+
 def get_page(title, page):
     params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
     r = requests.get(URL, params=params)
@@ -17,6 +58,12 @@ def get_page(title, page):
return None
+def get_page2(text):
+    soup = BeautifulSoup(text, "lxml")
+    elem = soup.select("div.pagetext")[0]
+    return HtmlText(elem), elem.text
+
+
 def get_pages(title, begin=1, end=None):
     if end:
         return (get_page(title, i) for i in xrange(begin, end + 1))
@@ -26,6 +73,8 @@ def get_pages(title, begin=1, end=None):
 if __name__ == "__main__":
-    title = sys.argv[1]
-    for page in get_pages(title):
-        print page
+    b = BeautifulSoup("<a>asd</a>")
+    c = HtmlText(b)
+    print type(c[0])
+    print align(c, [u"asd"], None)
+    print c[0:1]
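
For context, a minimal sketch (not part of the commit) of the markup that the new spanify()/HtmlText machinery builds: every whitespace-separated word of a text node is wrapped in a <span id="word-N"> tag, so the HTML-formatted text can afterwards be addressed word by word, which is what the alignment call to string_utils.align appears to rely on. The snippet uses only BeautifulSoup; the sample <div class="pagetext"> document and its sentence are made up for illustration.

from bs4 import BeautifulSoup

# A tiny stand-in for a rendered Wikisource page.
soup = BeautifulSoup('<div class="pagetext">Le chat dort sur le tapis</div>',
                     "html.parser")
div = soup.find("div")
text_node = div.string  # the single NavigableString inside the div

# Same idea as spanify(): wrap each word in <span id="word-N">.
for i, word in enumerate(text_node.split()):
    span = soup.new_tag("span", id="word-" + str(i))
    span.string = word
    text_node.insert_before(span)
    text_node.insert_before(" ")
text_node.replace_with("")  # the original text node is now redundant

print(div)
# <div class="pagetext"><span id="word-0">Le</span> <span id="word-1">chat</span> ...

# Word-level lookup, the way HtmlText.__getitem__ resolves an index:
print(div.find("span", {"id": "word-2"}).text)  # -> dort

Giving each word a stable, numbered id is what lets the alignment be computed on the final (HTML-formatted) text, as the commit message says, while still being able to point back to individual words in the markup afterwards.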