From 939ea00056d5cc8817f00b8c293efa04d36bf6d5 Mon Sep 17 00:00:00 2001
From: Guillaume Horel
Date: Sat, 7 Nov 2015 18:49:08 -0500
Subject: add script to download all the explanations

---
Review notes (ignored by git-am):
  * Line structure of the patch restored; indentation of the Python
    payload is assumed to be 4 spaces.
  * Output files are opened in binary mode ("wb") because the explanation
    text is encoded to UTF-8 bytes before it is written.

 xkcd.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/xkcd.py b/xkcd.py
index 3bbac65..97fe15e 100644
--- a/xkcd.py
+++ b/xkcd.py
@@ -1,5 +1,6 @@
 import requests
 import bs4
+import os
 
 def get_xkcd(comicid):
     r = requests.get('http://www.xkcd.org/{0}/'.format(comicid))
@@ -8,5 +9,23 @@ def get_xkcd(comicid):
     img = soup.find("div", {'id':"comic"}).find("img")
     return img['title'], img['src']
 
+def get_explanation(comic_id):
+    """download explanation from explainxkcd
+
+    ignore transcript for now"""
+    r = requests.get('http://www.explainxkcd.com/wiki/index.php/{0}'.format(comic_id))
+    soup = bs4.BeautifulSoup(r.content)
+    firstp = soup.find('div', {'id':"content"}).find('p')
+    allp = [firstp]+firstp.find_next_siblings('p')
+    return "".join([t.text for t in allp])
+
+def main():
+    # last = sorted([int(f.split("_")[1]) for f in os.listdir("explanations")],
+    #               reverse=True)[0]
+    # print(last)
+    for cid in range(1, 1601):
+        with open("explanations/comic_{0}".format(cid), "wb") as fh:
+            fh.write(get_explanation(cid).encode("utf-8"))
+
 if __name__=="__main__":
-    print(get_xkcd(1600))
+    main()
-- 
cgit v1.2.3-70-g09d2