"""Collect investor details from saved HTML pages into ``investors.csv``.

Every file in ``root`` whose name ends in ``htm`` is parsed with
BeautifulSoup; one CSV row is written per distinct investor name (the text
of the page's second ``<h1>``).  Telephone and fax are optional and default
to an empty string; address, about and category are required, and a page
missing one of them aborts the run with ``AttributeError``.
"""
import os
import bs4
import csv
import re

root = "/home/share/serenitas/Fund Raising/Preqin/Investors/"
filelist = [f for f in os.listdir(root) if f.endswith('htm')]

# Elements are located by a regex *search* on their ``id`` attribute, so any
# id that merely contains the word matches (the first such element wins).
readdress = re.compile("address")
retel = re.compile("tel")
refax = re.compile("fax")
recat = re.compile("fund_category")
reabout = re.compile("about")

headers = ['filename', 'name', 'address', 'tel', 'fax', 'category', 'about']


def _optional_text(soup, id_pattern):
    """Return the text of the first element whose id matches, or '' if none."""
    node = soup.find(id=id_pattern)
    return '' if node is None else node.text


# Investor names already written; later files with the same name are skipped.
sentinel = set()

# ``with`` guarantees the CSV is flushed and closed even if a page fails to
# parse; ``newline=""`` is what the csv module requires for writer files.
with open(os.path.join(root, "investors.csv"), "w", newline="") as fh2:
    csvwriter = csv.writer(fh2)
    csvwriter.writerow(headers)

    for f in filelist:
        with open(os.path.join(root, f)) as fh:
            # NOTE(review): no parser is named, so bs4 picks the best one
            # installed and results may differ between machines; consider
            # pinning one once the expected output has been confirmed.
            soup = bs4.BeautifulSoup(fh)

        # A fresh record per file, so nothing carries over between rows.
        d = {'filename': f, 'name': soup.findAll("h1")[1].text.strip()}
        if d['name'] in sentinel:
            continue
        sentinel.add(d['name'])

        # Required fields: a missing element raises AttributeError.
        d['address'] = soup.find(id=readdress).text
        d['about'] = soup.find(id=reabout).text.replace("\n", "").strip()
        d['category'] = soup.find(id=recat).text

        # Optional fields: absent elements become empty strings.
        d['tel'] = _optional_text(soup, retel)
        d['fax'] = _optional_text(soup, refax)

        csvwriter.writerow([d[k] for k in headers])