"""Scrape xkcd comic metadata and explainxkcd explanations."""
import os

import bs4
import requests

# Seconds to wait on a single HTTP request before giving up; without a
# timeout, requests may block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Name the parser explicitly so results do not depend on which optional
# parsers (lxml, html5lib) happen to be installed alongside bs4.
HTML_PARSER = "html.parser"


def get_xkcd(comicid):
    """Fetch the title text and image URL of an xkcd comic.

    Args:
        comicid: Comic number, substituted into the xkcd URL.

    Returns:
        A ``(title, src)`` tuple read from the ``<img>`` inside the
        ``<div id="comic">`` element, or ``None`` when the response status
        is not 200 or the page contains no such image.
    """
    r = requests.get('http://www.xkcd.org/{0}/'.format(comicid),
                     timeout=REQUEST_TIMEOUT)
    if r.status_code != 200:
        return None
    soup = bs4.BeautifulSoup(r.content, HTML_PARSER)
    comic_div = soup.find("div", {'id': "comic"})
    img = comic_div.find("img") if comic_div is not None else None
    if img is None:
        # Page layout did not match; report "not found" rather than crash.
        return None
    return img['title'], img['src']


def get_explanation(comic_id):
    """Download the explanation text for a comic from explainxkcd.

    The transcript is not treated separately for now: the text of the first
    ``<p>`` inside ``<div id="content">`` and of all its following sibling
    ``<p>`` elements is concatenated.

    Args:
        comic_id: Comic number, substituted into the explainxkcd wiki URL.

    Returns:
        The concatenated paragraph text, or an empty string when the page
        has no content div or no paragraph in it.
    """
    r = requests.get(
        'http://www.explainxkcd.com/wiki/index.php/{0}'.format(comic_id),
        timeout=REQUEST_TIMEOUT)
    soup = bs4.BeautifulSoup(r.content, HTML_PARSER)
    content = soup.find('div', {'id': "content"})
    firstp = content.find('p') if content is not None else None
    if firstp is None:
        return ""
    allp = [firstp] + firstp.find_next_siblings('p')
    return "".join(t.text for t in allp)


def main():
    """Save the explanations of comics 1 through 1600 under ``explanations/``.

    Each explanation is written UTF-8 encoded to ``explanations/comic_<id>``.
    """
    out_dir = "explanations"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # last = sorted([int(f.split("_")[1]) for f in os.listdir("explanations")],
    #               reverse=True)[0]
    # print(last)
    for cid in range(1, 1601):
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        text = get_explanation(cid)
        # Binary mode: the payload is already encoded to bytes, which a
        # text-mode file rejects on Python 3.
        with open(os.path.join(out_dir, "comic_{0}".format(cid)), "wb") as fh:
            fh.write(text.encode("utf-8"))


if __name__ == "__main__":
    main()