diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-03 17:20:35 +0200 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-03 17:20:50 +0200 |
| commit | bbb455d1487ecf9def1b179f80334287729c6c17 (patch) | |
| tree | 178997b4f1a38a33898895f869459c1d43a5acd3 | |
| parent | e024beea82f5bcf8df4e0177b118b3db12e9c2cb (diff) | |
| download | ocr-layer-curation-bbb455d1487ecf9def1b179f80334287729c6c17.tar.gz | |
Improve wikisource.py script
| -rw-r--r-- | wikisource.py | 13 |
1 file changed, 8 insertions, 5 deletions
```diff
diff --git a/wikisource.py b/wikisource.py
index 2163483..7b48eb0 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -10,12 +10,15 @@ def get_page(title, page):
     params = { "action": "render", "title": "Page:" + title + "/" + str(page) }
     r = requests.get(URL, params=params)
     soup = BeautifulSoup(r.text, "lxml")
-    return soup.select("div.pagetext")[0]
+    return " ".join(soup.select("div.pagetext")[0].findAll(text=True))
 
-def get_book(title):
-    n_pages = 10
-    return [get_page(title, page) for page in xrange(1, n_pages)]
+def get_pages(title, begin=1, end=None):
+    if not end:
+        end = 10
+    for page in xrange(begin, end+1):
+        yield get_page(title, page)
 
 if __name__ == "__main__":
     title = sys.argv[1]
-    print get_book(title)
+    for page in get_pages(title):
+        print page
```
