diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-12-28 19:00:31 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-12-28 19:00:31 -0500 |
| commit | d56a449c4f745ba661bc952aa36746a1ef06e7d4 (patch) | |
| tree | 5b47031156521caab58d4e4254592b6189c5ef35 /wikisource.py | |
| parent | 6f1f274e260b5ba9df98d0869f5277d39588c9a7 (diff) | |
| download | ocr-layer-curation-d56a449c4f745ba661bc952aa36746a1ef06e7d4.tar.gz | |
clean up get_pages function
Diffstat (limited to 'wikisource.py')
| -rw-r--r-- | wikisource.py | 23 |
1 files changed, 17 insertions, 6 deletions
diff --git a/wikisource.py b/wikisource.py
index 5227721..f0d230f 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -3,22 +3,33 @@ import requests
 import lxml
 import sys
 from bs4 import BeautifulSoup
+from itertools import takewhile, count
 
 URL = "http://fr.wikisource.org/w/index.php"
 
 def get_page(title, page):
     params = { "action": "render", "title": "Page:" + title + "/" + str(page) }
     r = requests.get(URL, params=params)
-    soup = BeautifulSoup(r.text, "lxml")
-    return soup.select("div.pagetext")[0].text
+    if r.status_code == requests.codes.ok:
+        soup = BeautifulSoup(r.text, "lxml")
+        return soup.select("div.pagetext")[0].text
+    else:
+        return None
 
 def get_pages(title, begin=1, end=None):
-    if not end:
-        end = 100
-    for page in xrange(begin, end+1):
-        yield get_page(title, page)
+    if end:
+        return (get_page(title, i) for i in xrange(begin, end+1))
+    else:
+        return takewhile(lambda x: x is not None,
+                         (get_page(title, i) for i in count(begin)))
+
 
 if __name__ == "__main__":
     title = sys.argv[1]
     for page in get_pages(title):
         print page
+
+
+def f(i):
+    if i <=10:
+        return i**2
