aboutsummaryrefslogtreecommitdiffstats
path: root/wikisource.py
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2013-08-03 17:01:32 +0200
committer Thibaut Horel <thibaut.horel@gmail.com> 2013-08-03 17:01:32 +0200
commit 5d15aa5a5111b1dac8961cff35dce3acc57e56ef (patch)
tree 6c0e535b7657ea9d6070fba9c1c262eb7c54ef0f /wikisource.py
parent 21e329d4380c29825f36877e5f7c7fabc8ec067b (diff)
download ocr-layer-curation-5d15aa5a5111b1dac8961cff35dce3acc57e56ef.tar.gz
Add simple script to download text from Wikisource
Diffstat (limited to 'wikisource.py')
-rw-r--r-- wikisource.py 21
1 files changed, 21 insertions, 0 deletions
diff --git a/wikisource.py b/wikisource.py
new file mode 100644
index 0000000..2163483
--- /dev/null
+++ b/wikisource.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+import requests
+import lxml
+import sys
+from bs4 import BeautifulSoup
+
+# Base URL (index.php endpoint on fr.wikisource.org) that get_page queries.
+URL="http://fr.wikisource.org/w/index.php"
+
+def get_page(title, page, timeout=30):
+    """Fetch the text block of one page of a book from Wikisource.
+
+    Requests ``Page:<title>/<page>`` from URL with ``action=render`` and
+    parses the response body with BeautifulSoup's lxml parser.
+
+    Args:
+        title: name of the book, without the ``Page:`` prefix.
+        page: page number within the book; converted with ``str()``.
+        timeout: seconds to wait for the server before giving up. Without
+            a timeout a stalled connection would block the script forever.
+
+    Returns:
+        The first element matching ``div.pagetext`` in the response.
+
+    Raises:
+        IndexError: if the response contains no ``div.pagetext`` element.
+        requests.exceptions.RequestException: on connection errors or
+            when the timeout expires.
+    """
+    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
+    r = requests.get(URL, params=params, timeout=timeout)
+    soup = BeautifulSoup(r.text, "lxml")
+    matches = soup.select("div.pagetext")
+    if not matches:
+        # Keep IndexError (what indexing the empty result raised before) so
+        # existing callers still work, but say which page failed and what
+        # the server answered.
+        raise IndexError("no div.pagetext in %r (HTTP %d)"
+                         % (params["title"], r.status_code))
+    return matches[0]
+
+def get_book(title, n_pages=10):
+    """Download the first ``n_pages`` pages of a book.
+
+    Args:
+        title: name of the book, passed unchanged to get_page.
+        n_pages: number of pages to fetch; pages are numbered from 1.
+
+    Returns:
+        A list holding the result of get_page for pages 1..n_pages, in
+        order.
+    """
+    # range() excludes its end, so "+ 1" is needed to include page n_pages.
+    # range (rather than xrange) works on both Python 2 and Python 3.
+    return [get_page(title, page) for page in range(1, n_pages + 1)]
+
+if __name__ == "__main__":
+    # The book title is the first command-line argument; without this check
+    # a missing argument ends in a bare IndexError traceback.
+    if len(sys.argv) < 2:
+        sys.exit("usage: %s TITLE" % sys.argv[0])
+    title = sys.argv[1]
+    # The parenthesised form prints the same thing under the Python 2 print
+    # statement and the Python 3 print function.
+    print(get_book(title))