Diffstat (limited to 'python/parse_preqin.py')
-rw-r--r--  python/parse_preqin.py  109
1 files changed, 92 insertions, 17 deletions
diff --git a/python/parse_preqin.py b/python/parse_preqin.py
index 6dfcd0a0..5bb288e3 100644
--- a/python/parse_preqin.py
+++ b/python/parse_preqin.py
@@ -5,14 +5,89 @@ import re
 root = "/home/share/serenitas/Fund Raising/Preqin/Investors/"
 filelist = [f for f in os.listdir(root) if f.endswith('htm')]
-readdress=re.compile("address")
-retel=re.compile("tel")
-refax=re.compile("fax")
-recat=re.compile("fund_category")
-reabout=re.compile("about")
-fh2 = open(os.path.join(root, "investors.csv"), "w")
+fh2 = open(os.path.join(root, "investors3.csv"), "w")
 csvwriter = csv.writer(fh2)
-headers = ['filename','name', 'address', 'tel', 'fax','category','about']
+# headers = ['filename','name', 'address', 'tel', 'fax','category','about']
+# csvwriter.writerow(headers)
+# d = {}
+# sentinel={}
+# for f in filelist:
+#     with open(os.path.join(root, f)) as fh:
+#         soup = bs4.BeautifulSoup(fh)
+#         d['filename']=f
+#         d['name']=soup.findAll("h1")[1].text.strip()
+#         if d['name'] in sentinel:
+#             continue
+#         else:
+#             sentinel[d['name']]=1
+#         d['address']=soup.find(id=re.compile("address")).text
+#         try:
+#             d['tel']=soup.find(id=re.compile("tel")).text
+#         except AttributeError:
+#             d['tel']=''
+#         d['about']=soup.find(id=re.compile("about")).text.replace("\n","").strip()
+#         try:
+#             d['fax']=soup.find(id=re.compile("fax")).text
+#         except AttributeError:
+#             d['fax']=''
+#         d['category']=soup.find(id=re.compile("fund_category")).text
+#         csvwriter.writerow([d[k] for k in headers])
+# fh2.close()
+
+# headers=['filename', 'name', 'website', 'email', 'min_req_trackrecord', 'min_req_AUM', 'typicalinv',
+#          'inv_overview', 'summary', 'pref_method']
+# csvwriter.writerow(headers)
+# d = {}
+# sentinel={}
+# for f in filelist:
+#     with open(os.path.join(root, f)) as fh:
+#         soup = bs4.BeautifulSoup(fh)
+#         d['filename']=f
+#         d['name']=soup.findAll("h1")[1].text.strip()
+#         if d['name'] in sentinel:
+#             continue
+#         else:
+#             sentinel[d['name']]=1
+#         try:
+#             d['website'] = soup.find(id=re.compile("web_link")).text
+#         except AttributeError:
+#             d['website'] = ''
+#         try:
+#             d['email']=soup.find(id=re.compile("email_link")).text
+#         except AttributeError:
+#             d['email']=''
+#         try:
+#             temp=soup.find(id=re.compile('lblMinManagerReq_Record')).text
+#             d['min_req_trackrecord']=temp.replace("Track Record: ","")
+#         except AttributeError:
+#             d['min_req_trackrecord']=''
+#         try:
+#             temp = soup.find(id=re.compile('lblMinManagerReq_AUM')).text
+#             d['min_req_AUM']=temp.replace("Funds Under Management: ","")
+#         except AttributeError:
+#             d['min_req_AUM']=''
+#         try:
+#             d['typicalinv'] = soup.find(id=re.compile("TypicalInvestment")).text
+#         except AttributeError:
+#             d['typicalinv']=''
+#         try:
+#             d['inv_overview'] = soup.find(id=re.compile("investor_notes")).text.replace("\n","").strip()
+#         except AttributeError:
+#             d['inv_overview']=''
+#         try:
+#             d['summary']=soup.find(id=re.compile('lblSummary')).text.replace("\n","").strip()
+#         except AttributeError:
+#             d['summary']=''
+#         try:
+#             temp = " ".join(soup.find("div", {'class':'divPrefContact'}).stripped_strings)
+#             d['pref_method']=temp.replace("Preferred method of initial contact: ","")
+
+#         except AttributeError:
+#             d['pref_method']=''
+#         csvwriter.writerow([d[k] for k in headers])
+# fh2.close()
+
+headers=['filename', 'name', 'contact name', 'job title', 'phone', 'email']
 csvwriter.writerow(headers)
 d = {}
 sentinel={}
@@ -25,16 +100,16 @@ for f in filelist:
             continue
         else:
             sentinel[d['name']]=1
-        d['address']=soup.find(id=readdress).text
         try:
-            d['tel']=soup.find(id=retel).text
+            table=soup.find(id=re.compile('FirmContacts'))
+            table_rows = table.findAll('tr')
+            for row in table_rows[1:]:
+                fields = row.findAll('td')
+                d['contact name'] = fields[1].text.strip()
+                d['job title'] = fields[2].text.strip()
+                d['email'] = fields[3].a.text.strip()
+                d['phone'] = fields[3].find(text=True).strip()
+                csvwriter.writerow([d[k] for k in headers])
         except AttributeError:
-            d['tel']=''
-        d['about']=soup.find(id=reabout).text.replace("\n","").strip()
-        try:
-            d['fax']=soup.find(id=refax).text
-        except AttributeError:
-            d['fax']=''
-        d['category']=soup.find(id=recat).text
-        csvwriter.writerow([d[k] for k in headers])
+            continue
 fh2.close()
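The new loop in the second hunk locates the contacts table by id, skips the header row, and writes one CSV row per contact. As a rough illustration of how that row parsing behaves, here is a minimal, self-contained sketch run against a made-up HTML fragment; the table id suffix, column order, and cell contents below are assumptions for illustration only, not taken from the actual Preqin pages.

import re
import bs4

# Hypothetical fragment: an index cell, then name, job title, and a contact
# cell holding a phone number followed by a mailto link.
html = """
<table id="ctl00_FirmContacts">
  <tr><th>#</th><th>Name</th><th>Job Title</th><th>Contact</th></tr>
  <tr>
    <td>1</td>
    <td>Jane Doe</td>
    <td>Portfolio Manager</td>
    <td>+1 212 555 0100 <a href="mailto:jane@example.com">jane@example.com</a></td>
  </tr>
</table>
"""

soup = bs4.BeautifulSoup(html, "html.parser")
table = soup.find(id=re.compile("FirmContacts"))
for row in table.findAll("tr")[1:]:              # skip the header row
    fields = row.findAll("td")
    contact_name = fields[1].text.strip()        # "Jane Doe"
    job_title = fields[2].text.strip()           # "Portfolio Manager"
    email = fields[3].a.text.strip()             # text of the mailto link
    phone = fields[3].find(text=True).strip()    # first bare text node: the phone number
    print(contact_name, job_title, phone, email)

Note that because fields[3].find(text=True) returns the first text node in the cell, this only yields the phone number when it precedes the email link, as in the fragment above.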
