"""Export firm contacts from the investor pages under ``root`` to a CSV file.

Every file in ``root`` whose name ends in ``htm`` is parsed with
BeautifulSoup.  The firm name is read from the second ``<h1>`` of the page;
pages whose firm name was already seen are skipped.  For each remaining page,
every row after the first one of the element whose id matches
``FirmContacts`` becomes one line of ``investors3.csv`` with the columns
listed in ``headers``.
"""
import os
import bs4
import csv
import re

root = "/home/share/serenitas/Fund Raising/Preqin/Investors/"
filelist = [f for f in os.listdir(root) if f.endswith('htm')]

# ---------------------------------------------------------------------------
# Disabled extraction passes, kept for reference.  Each writes its own header
# row through `csvwriter` and closes `fh2` itself, so presumably only one pass
# was ever enabled at a time.  To revive one, move it inside the `with` block
# at the bottom of this file in place of the active pass.
# ---------------------------------------------------------------------------
# headers = ['filename','name', 'address', 'tel', 'fax','category','about']
# csvwriter.writerow(headers)
# d = {}
# sentinel={}
# for f in filelist:
#     with open(os.path.join(root, f)) as fh:
#         soup = bs4.BeautifulSoup(fh)
#         d['filename']=f
#         d['name']=soup.findAll("h1")[1].text.strip()
#         if d['name'] in sentinel:
#             continue
#         else:
#             sentinel[d['name']]=1
#         d['address']=soup.find(id=re.compile("address")).text
#         try:
#             d['tel']=soup.find(id=re.compile("tel")).text
#         except AttributeError:
#             d['tel']=''
#         d['about']=soup.find(id=re.compile("about")).text.replace("\n","").strip()
#         try:
#             d['fax']=soup.find(id=re.compile("fax")).text
#         except AttributeError:
#             d['fax']=''
#         d['category']=soup.find(id=re.compile("fund_category")).text
#         csvwriter.writerow([d[k] for k in headers])
# fh2.close()

# headers=['filename', 'name', 'website', 'email', 'min_req_trackrecord', 'min_req_AUM', 'typicalinv',
#          'inv_overview', 'summary', 'pref_method']
# csvwriter.writerow(headers)
# d = {}
# sentinel={}
# for f in filelist:
#     with open(os.path.join(root, f)) as fh:
#         soup = bs4.BeautifulSoup(fh)
#         d['filename']=f
#         d['name']=soup.findAll("h1")[1].text.strip()
#         if d['name'] in sentinel:
#             continue
#         else:
#             sentinel[d['name']]=1
#         try:
#             d['website'] = soup.find(id=re.compile("web_link")).text
#         except AttributeError:
#             d['website'] = ''
#         try:
#             d['email']=soup.find(id=re.compile("email_link")).text
#         except AttributeError:
#             d['email']=''
#         try:
#             temp=soup.find(id=re.compile('lblMinManagerReq_Record')).text
#             d['min_req_trackrecord']=temp.replace("Track Record: ","")
#         except AttributeError:
#             d['min_req_trackrecord']=''
#         try:
#             temp = soup.find(id=re.compile('lblMinManagerReq_AUM')).text
#             d['min_req_AUM']=temp.replace("Funds Under Management: ","")
#         except AttributeError:
#             d['min_req_AUM']=''
#         try:
#             d['typicalinv'] = soup.find(id=re.compile("TypicalInvestment")).text
#         except AttributeError:
#             d['typicalinv']=''
#         try:
#             d['inv_overview'] = soup.find(id=re.compile("investor_notes")).text.replace("\n","").strip()
#         except AttributeError:
#             d['inv_overview']=''
#         try:
#             d['summary']=soup.find(id=re.compile('lblSummary')).text.replace("\n","").strip()
#         except AttributeError:
#             d['summary']=''
#         try:
#             temp = " ".join(soup.find("div", {'class':'divPrefContact'}).stripped_strings)
#             d['pref_method']=temp.replace("Preferred method of initial contact: ","")
#         except AttributeError:
#             d['pref_method']=''
#         csvwriter.writerow([d[k] for k in headers])
# fh2.close()

# ---------------------------------------------------------------------------
# Active pass: one CSV line per contact row of each firm.
# ---------------------------------------------------------------------------
headers = ['filename', 'name', 'contact name', 'job title', 'phone', 'email']

# Compiled once instead of once per file; matches any id containing the text.
contacts_id = re.compile('FirmContacts')

# Firm names already exported; a later file with the same name is skipped.
sentinel = set()

# `with` guarantees the CSV is flushed and closed even if a page fails to
# parse; newline="" is what the csv module requires of the file it writes to.
with open(os.path.join(root, "investors3.csv"), "w", newline="") as fh2:
    csvwriter = csv.writer(fh2)
    csvwriter.writerow(headers)

    for f in filelist:
        with open(os.path.join(root, f)) as fh:
            # NOTE(review): no parser is named, so bs4 picks whichever one is
            # installed; left as-is to avoid changing how pages are parsed.
            soup = bs4.BeautifulSoup(fh)

        # The firm name is the text of the second <h1> on the page.
        name = soup.findAll("h1")[1].text.strip()
        if name in sentinel:
            continue
        sentinel.add(name)

        table = soup.find(id=contacts_id)
        if table is None:
            # Page has no contacts element: nothing to export for this firm.
            continue

        # The first <tr> is skipped (presumably the column-title row).
        for row in table.findAll('tr')[1:]:
            fields = row.findAll('td')
            if len(fields) < 4:
                # Not a contact row (too few cells); indexing it would raise
                # IndexError and abort the whole export.
                continue

            # Cell 3 holds both values: the e-mail is the text of its <a>,
            # the phone is the first text node of the cell.  Either may be
            # absent; default to '' (as the disabled passes above do) so one
            # incomplete contact no longer drops the firm's remaining rows.
            link = fields[3].a
            first_text = fields[3].find(text=True)

            d = {
                'filename': f,
                'name': name,
                'contact name': fields[1].text.strip(),
                'job title': fields[2].text.strip(),
                'phone': first_text.strip() if first_text is not None else '',
                'email': link.text.strip() if link is not None else '',
            }
            csvwriter.writerow([d[k] for k in headers])