blob: 7b48eb0fde60e5cddedc0045fcfd2ec5f7bbea9c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
# -*- coding: utf-8 -*-
import requests
import lxml
import sys
from bs4 import BeautifulSoup
# MediaWiki index.php endpoint of French Wikisource; get_page() sends its GET
# requests here.  HTTPS is used directly so that requests do not depend on
# (and pay a round trip for) a plaintext-HTTP redirect.
URL = "https://fr.wikisource.org/w/index.php"
def get_page(title, page, timeout=30):
    """Fetch one ``Page:<title>/<page>`` wiki page and return its text.

    The page is requested from ``URL`` with ``action=render``, parsed with
    the lxml parser, and every text node found inside the first
    ``div.pagetext`` element is joined with single spaces.

    Args:
        title: Title of the work, without the ``Page:`` prefix.
        page: Page number (anything ``str()`` can format).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The space-joined text nodes of the first ``div.pagetext`` element.

    Raises:
        IndexError: If the response contains no ``div.pagetext`` element.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(URL, params=params, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")
    containers = soup.select("div.pagetext")
    if not containers:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # enough context to tell which page failed and why.
        raise IndexError(
            "no div.pagetext element in response for %r (HTTP %s)"
            % (params["title"], response.status_code)
        )
    return " ".join(containers[0].find_all(text=True))
def get_pages(title, begin=1, end=None):
    """Lazily yield the text of pages ``begin`` through ``end`` of a work.

    Each page is fetched with ``get_page`` only when the generator is
    advanced, so no request is made until the caller starts iterating.

    Args:
        title: Title of the work, passed through to ``get_page``.
        begin: First page number to fetch (inclusive).
        end: Last page number to fetch (inclusive).  ``None`` means the
            default of 10; an explicit ``0`` is honoured as given.

    Yields:
        The value returned by ``get_page`` for each page, in ascending order.
    """
    # Compare against None explicitly: a plain truthiness test would treat an
    # explicit end=0 as "not given" and silently fetch up to page 10.
    if end is None:
        end = 10
    # range (rather than xrange) keeps this working on both Python 2 and 3.
    for page in range(begin, end + 1):
        yield get_page(title, page)
if __name__ == "__main__":
    # Command-line entry point: print each fetched page of the work whose
    # title is given as the first argument.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: %s TITLE" % sys.argv[0])
    title = sys.argv[1]
    for page in get_pages(title):
        # Parenthesised form is valid on both Python 2 and Python 3.
        print(page)
|