about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/parse_preqin.py 40
1 files changed, 40 insertions, 0 deletions
diff --git a/python/parse_preqin.py b/python/parse_preqin.py
new file mode 100644
index 00000000..6dfcd0a0
--- /dev/null
+++ b/python/parse_preqin.py
@@ -0,0 +1,40 @@
+"""Collect contact fields from the investor HTML pages in `root` into a CSV."""
+import csv
+import os
+import re
+
+import bs4
+
+root = "/home/share/serenitas/Fund Raising/Preqin/Investors/"
+headers = ['filename', 'name', 'address', 'tel', 'fax', 'category', 'about']
+# Each of these columns is the text of the first element whose id matches.
+id_patterns = {'address': re.compile("address"), 'tel': re.compile("tel"),
+               'fax': re.compile("fax"), 'about': re.compile("about"),
+               'category': re.compile("fund_category")}
+
+
+def _text(soup, pattern):
+    """Return the text of the first element whose id matches, or ''."""
+    node = soup.find(id=pattern)
+    return '' if node is None else node.text
+
+
+# Sorted so the copy kept for a duplicated name is not listdir-order dependent.
+filelist = sorted(f for f in os.listdir(root) if f.endswith('htm'))
+seen = set()
+# newline='' is what the csv module expects; 'with' also closes on errors.
+with open(os.path.join(root, "investors.csv"), "w", newline='') as fh2:
+    csvwriter = csv.writer(fh2)
+    csvwriter.writerow(headers)
+    for f in filelist:
+        with open(os.path.join(root, f)) as fh:
+            # Name the parser so results do not depend on what is installed.
+            soup = bs4.BeautifulSoup(fh, "html.parser")
+        name = soup.find_all("h1")[1].text.strip()  # second <h1> is the name
+        if name in seen:
+            continue
+        seen.add(name)
+        d = {k: _text(soup, p) for k, p in id_patterns.items()}
+        d['about'] = d['about'].replace("\n", "").strip()
+        d['filename'], d['name'] = f, name
+        csvwriter.writerow([d[k] for k in headers])