From 5d15aa5a5111b1dac8961cff35dce3acc57e56ef Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Sat, 3 Aug 2013 17:01:32 +0200 Subject: Add simple script to download text from Wikisource --- wikisource.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 wikisource.py diff --git a/wikisource.py b/wikisource.py new file mode 100644 index 0000000..2163483 --- /dev/null +++ b/wikisource.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +import requests +import lxml +import sys +from bs4 import BeautifulSoup + +URL="http://fr.wikisource.org/w/index.php" + +def get_page(title, page): + params = { "action": "render", "title": "Page:" + title + "/" + str(page) } + r = requests.get(URL, params=params) + soup = BeautifulSoup(r.text, "lxml") + return soup.select("div.pagetext")[0] + +def get_book(title): + n_pages = 10 + return [get_page(title, page) for page in xrange(1, n_pages)] + +if __name__ == "__main__": + title = sys.argv[1] + print get_book(title) -- cgit v1.2.3-70-g09d2