diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-03 17:01:32 +0200 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-03 17:01:32 +0200 |
| commit | 5d15aa5a5111b1dac8961cff35dce3acc57e56ef (patch) | |
| tree | 6c0e535b7657ea9d6070fba9c1c262eb7c54ef0f /wikisource.py | |
| parent | 21e329d4380c29825f36877e5f7c7fabc8ec067b (diff) | |
| download | ocr-layer-curation-5d15aa5a5111b1dac8961cff35dce3acc57e56ef.tar.gz | |
Add simple script to download text from Wikisource
Diffstat (limited to 'wikisource.py')
| -rw-r--r-- | wikisource.py | 21 |
1 files changed, 21 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Download the text of a book's pages from French Wikisource.

Usage: python wikisource.py TITLE

Each page is requested from ``URL`` as ``Page:<TITLE>/<page number>`` with
``action=render`` and the first ``div.pagetext`` element of the response is
extracted.
"""
import sys

# Not referenced directly below. Presumably imported so that a missing lxml
# install fails at start-up rather than inside BeautifulSoup(..., "lxml").
import lxml  # noqa: F401
import requests
from bs4 import BeautifulSoup

URL = "http://fr.wikisource.org/w/index.php"

# Seconds to wait on the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
TIMEOUT = 30


def get_page(title, page):
    """Fetch one page of a book and return its ``div.pagetext`` element.

    Args:
        title: Title of the book; it is inserted into the requested page
            name as ``Page:<title>/<page>``.
        page: Page number within the book (converted with ``str``).

    Returns:
        The first element matching the CSS selector ``div.pagetext`` in the
        parsed response.

    Raises:
        IndexError: If the response contains no ``div.pagetext`` element.
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    params = {
        "action": "render",
        "title": "Page:" + title + "/" + str(page),
    }
    response = requests.get(URL, params=params, timeout=TIMEOUT)
    soup = BeautifulSoup(response.text, "lxml")
    matches = soup.select("div.pagetext")
    if not matches:
        # Same exception type that indexing the empty result would raise,
        # but with enough context to tell which page was missing.
        raise IndexError(
            "no div.pagetext element found for page %s of %r" % (page, title)
        )
    return matches[0]


def get_book(title, n_pages=10):
    """Fetch pages 1 through ``n_pages`` (inclusive) of a book.

    Args:
        title: Title of the book, passed through to ``get_page``.
        n_pages: Number of pages to download, starting at page 1.

    Returns:
        A list with one ``get_page`` result per page, in page order.
    """
    # range() excludes its upper bound, hence the + 1 so that exactly
    # n_pages pages are fetched.
    return [get_page(title, page) for page in range(1, n_pages + 1)]


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s TITLE" % sys.argv[0])
    title = sys.argv[1]
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(get_book(title))
