about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2013-08-03 17:20:35 +0200
committer Thibaut Horel <thibaut.horel@gmail.com> 2013-08-03 17:20:50 +0200
commit bbb455d1487ecf9def1b179f80334287729c6c17 (patch)
tree 178997b4f1a38a33898895f869459c1d43a5acd3
parent e024beea82f5bcf8df4e0177b118b3db12e9c2cb (diff)
download ocr-layer-curation-bbb455d1487ecf9def1b179f80334287729c6c17.tar.gz
Improve wikisource.py script
-rw-r--r-- wikisource.py 13
1 files changed, 8 insertions, 5 deletions
diff --git a/wikisource.py b/wikisource.py
index 2163483..7b48eb0 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -10,12 +10,15 @@ def get_page(title, page):
params = { "action": "render", "title": "Page:" + title + "/" + str(page) }
r = requests.get(URL, params=params)
soup = BeautifulSoup(r.text, "lxml")
- return soup.select("div.pagetext")[0]
+ return " ".join(soup.select("div.pagetext")[0].findAll(text=True))
-def get_book(title):
- n_pages = 10
- return [get_page(title, page) for page in xrange(1, n_pages)]
+def get_pages(title, begin=1, end=None):
+ if not end:
+ end = 10
+ for page in xrange(begin, end+1):
+ yield get_page(title, page)
if __name__ == "__main__":
title = sys.argv[1]
- print get_book(title)
+ for page in get_pages(title):
+ print page