diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/parse_preqin.py | 40 |
1 files changed, 40 insertions, 0 deletions
"""Collect investor details from saved Preqin HTML pages into one CSV file.

Every file in ``root`` whose name ends in ``htm`` is parsed with
BeautifulSoup, and one row per distinct investor name is written to
``investors.csv`` in that same directory.  The columns are listed in
``headers``.
"""
import os
import bs4
import csv
import re

root = "/home/share/serenitas/Fund Raising/Preqin/Investors/"
# os.listdir returns names in arbitrary order; sorting makes the row order,
# and the choice of which page wins when two share an investor name,
# the same on every run.
filelist = sorted(f for f in os.listdir(root) if f.endswith('htm'))

# Unanchored patterns: each one matches any element id that *contains* the
# given word, not only ids equal to it.
readdress = re.compile("address")
retel = re.compile("tel")
refax = re.compile("fax")
recat = re.compile("fund_category")
reabout = re.compile("about")

headers = ['filename', 'name', 'address', 'tel', 'fax', 'category', 'about']


def _text_or_empty(node):
    """Return ``node.text``, or ``''`` when the lookup found nothing."""
    return '' if node is None else node.text


# Investor names already written; later pages with the same name are skipped.
seen_names = set()

# The context manager guarantees the CSV is flushed and closed even when a
# page fails to parse part-way through.  newline="" is what the csv module
# requires so that it alone controls line endings.
with open(os.path.join(root, "investors.csv"), "w", newline="") as fh2:
    csvwriter = csv.writer(fh2)
    csvwriter.writerow(headers)
    for f in filelist:
        with open(os.path.join(root, f)) as fh:
            # NOTE(review): no parser is named, so bs4 uses whichever one is
            # installed; pin it explicitly once the intended parser is known.
            soup = bs4.BeautifulSoup(fh)

        # The investor name is taken from the second <h1> on the page.
        name = soup.find_all("h1")[1].text.strip()
        if name in seen_names:
            continue
        seen_names.add(name)

        # address, category and about are treated as required: a page without
        # them raises AttributeError rather than producing a silent blank.
        # tel and fax are optional and default to an empty string.
        d = {
            'filename': f,
            'name': name,
            'address': soup.find(id=readdress).text,
            'tel': _text_or_empty(soup.find(id=retel)),
            'fax': _text_or_empty(soup.find(id=refax)),
            'category': soup.find(id=recat).text,
            'about': soup.find(id=reabout).text.replace("\n", "").strip(),
        }
        csvwriter.writerow([d[k] for k in headers])
