clean up get_pages function

author: Guillaume Horel <guillaume.horel@gmail.com> 2013-12-28 19:00:31 -0500
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-12-28 19:00:31 -0500
commit: d56a449c4f745ba661bc952aa36746a1ef06e7d4 (patch)
tree: 5b47031156521caab58d4e4254592b6189c5ef35 /wikisource.py
parent: 6f1f274e260b5ba9df98d0869f5277d39588c9a7 (diff)
download: ocr-layer-curation-d56a449c4f745ba661bc952aa36746a1ef06e7d4.tar.gz
1 files changed, 17 insertions, 6 deletions
diff --git a/wikisource.py b/wikisource.py
index 5227721..f0d230f 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -3,22 +3,33 @@ import requests
 import lxml
 import sys
 from bs4 import BeautifulSoup
+from itertools import takewhile, count
 
 URL = "http://fr.wikisource.org/w/index.php"
 
 def get_page(title, page):
     params = { "action": "render", "title": "Page:" + title + "/" + str(page) }
     r = requests.get(URL, params=params)
-    soup = BeautifulSoup(r.text, "lxml")
-    return soup.select("div.pagetext")[0].text
+    if r.status_code == requests.codes.ok:
+        soup = BeautifulSoup(r.text, "lxml")
+        return soup.select("div.pagetext")[0].text
+    else:
+        return None
 
 def get_pages(title, begin=1, end=None):
-    if not end:
-        end = 100
-    for page in xrange(begin, end+1):
-        yield get_page(title, page)
+    if end:
+        return (get_page(title, i) for i in xrange(begin, end+1))
+    else:
+        return takewhile(lambda x: x is not None,
+                         (get_page(title, i) for i in count(begin)))
+
 
 if __name__ == "__main__":
     title = sys.argv[1]
     for page in get_pages(title):
         print page
+
+
+def f(i):
+    if i <=10:
+        return i**2
author	Guillaume Horel <guillaume.horel@gmail.com>	2013-12-28 19:00:31 -0500
committer	Guillaume Horel <guillaume.horel@gmail.com>	2013-12-28 19:00:31 -0500
commit	d56a449c4f745ba661bc952aa36746a1ef06e7d4 (patch)
tree	5b47031156521caab58d4e4254592b6189c5ef35 /wikisource.py
parent	6f1f274e260b5ba9df98d0869f5277d39588c9a7 (diff)
download	ocr-layer-curation-d56a449c4f745ba661bc952aa36746a1ef06e7d4.tar.gz