about | summary | refs | log | tree | commit | diff | stats
path: root/wikisource.py
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2013-08-03 23:43:45 +0200
committer Thibaut Horel <thibaut.horel@gmail.com> 2013-08-03 23:43:45 +0200
commit 331499d08be1ef32e5bb6200963ebc63500eb826 (patch)
tree a6edb878547b2e947131a993da9587834eac4065 /wikisource.py
parent 8b9977bc8cbf4b0c2bc90eb32ec3c78c91c5395c (diff)
download ocr-layer-curation-331499d08be1ef32e5bb6200963ebc63500eb826.tar.gz
Fix html stripping
Diffstat (limited to 'wikisource.py')
-rw-r--r-- wikisource.py 4
1 file changed, 2 insertions, 2 deletions
diff --git a/wikisource.py b/wikisource.py
index 7b48eb0..38bd90d 100644
--- a/wikisource.py
+++ b/wikisource.py
@@ -10,11 +10,11 @@ def get_page(title, page):
params = { "action": "render", "title": "Page:" + title + "/" + str(page) }
r = requests.get(URL, params=params)
soup = BeautifulSoup(r.text, "lxml")
- return " ".join(soup.select("div.pagetext")[0].findAll(text=True))
+ return "".join(soup.select("div.pagetext")[0].findAll(text=True))
def get_pages(title, begin=1, end=None):
if not end:
- end = 10
+ end = 100
for page in xrange(begin, end+1):
yield get_page(title, page)