about summary refs log tree commit diff stats
path: root/python/parse_preqin.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/parse_preqin.py')
-rw-r--r--  python/parse_preqin.py  109
1 files changed, 92 insertions, 17 deletions
diff --git a/python/parse_preqin.py b/python/parse_preqin.py
index 6dfcd0a0..5bb288e3 100644
--- a/python/parse_preqin.py
+++ b/python/parse_preqin.py
@@ -5,14 +5,89 @@ import re
root = "/home/share/serenitas/Fund Raising/Preqin/Investors/"
filelist = [f for f in os.listdir(root) if f.endswith('htm')]
-readdress=re.compile("address")
-retel=re.compile("tel")
-refax=re.compile("fax")
-recat=re.compile("fund_category")
-reabout=re.compile("about")
-fh2 = open(os.path.join(root, "investors.csv"), "w")
+fh2 = open(os.path.join(root, "investors3.csv"), "w")
csvwriter = csv.writer(fh2)
-headers = ['filename','name', 'address', 'tel', 'fax','category','about']
+# headers = ['filename','name', 'address', 'tel', 'fax','category','about']
+# csvwriter.writerow(headers)
+# d = {}
+# sentinel={}
+# for f in filelist:
+# with open(os.path.join(root, f)) as fh:
+# soup = bs4.BeautifulSoup(fh)
+# d['filename']=f
+# d['name']=soup.findAll("h1")[1].text.strip()
+# if d['name'] in sentinel:
+# continue
+# else:
+# sentinel[d['name']]=1
+# d['address']=soup.find(id=re.compile("address")).text
+# try:
+# d['tel']=soup.find(id=re.compile("tel")).text
+# except AttributeError:
+# d['tel']=''
+# d['about']=soup.find(id=re.compile("about")).text.replace("\n","").strip()
+# try:
+# d['fax']=soup.find(id=re.compile("fax")).text
+# except AttributeError:
+# d['fax']=''
+# d['category']=soup.find(id=re.compile("fund_category")).text
+# csvwriter.writerow([d[k] for k in headers])
+# fh2.close()
+
+# headers=['filename', 'name', 'website', 'email', 'min_req_trackrecord', 'min_req_AUM', 'typicalinv',
+# 'inv_overview', 'summary', 'pref_method']
+# csvwriter.writerow(headers)
+# d = {}
+# sentinel={}
+# for f in filelist:
+# with open(os.path.join(root, f)) as fh:
+# soup = bs4.BeautifulSoup(fh)
+# d['filename']=f
+# d['name']=soup.findAll("h1")[1].text.strip()
+# if d['name'] in sentinel:
+# continue
+# else:
+# sentinel[d['name']]=1
+# try:
+# d['website'] = soup.find(id=re.compile("web_link")).text
+# except AttributeError:
+# d['website'] = ''
+# try:
+# d['email']=soup.find(id=re.compile("email_link")).text
+# except AttributeError:
+# d['email']=''
+# try:
+# temp=soup.find(id=re.compile('lblMinManagerReq_Record')).text
+# d['min_req_trackrecord']=temp.replace("Track Record: ","")
+# except AttributeError:
+# d['min_req_trackrecord']=''
+# try:
+# temp = soup.find(id=re.compile('lblMinManagerReq_AUM')).text
+# d['min_req_AUM']=temp.replace("Funds Under Management: ","")
+# except AttributeError:
+# d['min_req_AUM']=''
+# try:
+# d['typicalinv'] = soup.find(id=re.compile("TypicalInvestment")).text
+# except AttributeError:
+# d['typicalinv']=''
+# try:
+# d['inv_overview'] = soup.find(id=re.compile("investor_notes")).text.replace("\n","").strip()
+# except AttributeError:
+# d['inv_overview']=''
+# try:
+# d['summary']=soup.find(id=re.compile('lblSummary')).text.replace("\n","").strip()
+# except AttributeError:
+# d['summary']=''
+# try:
+# temp = " ".join(soup.find("div", {'class':'divPrefContact'}).stripped_strings)
+# d['pref_method']=temp.replace("Preferred method of initial contact: ","")
+
+# except AttributeError:
+# d['pref_method']=''
+# csvwriter.writerow([d[k] for k in headers])
+# fh2.close()
+
+headers=['filename', 'name', 'contact name', 'job title', 'phone', 'email']
csvwriter.writerow(headers)
d = {}
sentinel={}
@@ -25,16 +100,16 @@ for f in filelist:
continue
else:
sentinel[d['name']]=1
- d['address']=soup.find(id=readdress).text
try:
- d['tel']=soup.find(id=retel).text
+ table =soup.find(id=re.compile('FirmContacts'))
+ table_rows = table.findAll('tr')
+ for row in table_rows[1:]:
+ fields = row.findAll('td')
+ d['contact name'] = fields[1].text.strip()
+ d['job title'] = fields[2].text.strip()
+ d['email'] = fields[3].a.text.strip()
+ d['phone'] = fields[3].find(text=True).strip()
+ csvwriter.writerow([d[k] for k in headers])
except AttributeError:
- d['tel']=''
- d['about']=soup.find(id=reabout).text.replace("\n","").strip()
- try:
- d['fax']=soup.find(id=refax).text
- except AttributeError:
- d['fax']=''
- d['category']=soup.find(id=recat).text
- csvwriter.writerow([d[k] for k in headers])
+ continue
fh2.close()