From bbb455d1487ecf9def1b179f80334287729c6c17 Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Sat, 3 Aug 2013 17:20:35 +0200 Subject: Improve wikisource.py script --- wikisource.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'wikisource.py') diff --git a/wikisource.py b/wikisource.py index 2163483..7b48eb0 100644 --- a/wikisource.py +++ b/wikisource.py @@ -10,12 +10,15 @@ def get_page(title, page): params = { "action": "render", "title": "Page:" + title + "/" + str(page) } r = requests.get(URL, params=params) soup = BeautifulSoup(r.text, "lxml") - return soup.select("div.pagetext")[0] + return " ".join(soup.select("div.pagetext")[0].findAll(text=True)) -def get_book(title): - n_pages = 10 - return [get_page(title, page) for page in xrange(1, n_pages)] +def get_pages(title, begin=1, end=None): + if not end: + end = 10 + for page in xrange(begin, end+1): + yield get_page(title, page) if __name__ == "__main__": title = sys.argv[1] - print get_book(title) + for page in get_pages(title): + print page -- cgit v1.2.3-70-g09d2