about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/download_emails.py | 6
-rw-r--r-- python/parse_emails.py | 134
2 files changed, 77 insertions, 63 deletions
diff --git a/python/download_emails.py b/python/download_emails.py
index 5e0eb95b..be18b1f7 100644
--- a/python/download_emails.py
+++ b/python/download_emails.py
@@ -79,7 +79,8 @@ def msg_content(msg):
"""Extract subject and body from a gmail message"""
subject = [x['value'] for x in msg['payload']['headers'] if x['name']=='Subject'][0]
content = base64.b64decode(msg['payload']['body']['data']).decode('utf-8')
- return subject, content
+ date = msg['internalDate'] ## date /1000 to get timestamp
+ return subject, content, date
def update_emails():
"""Download new emails that were labeled swaptions."""
@@ -91,13 +92,14 @@ def update_emails():
for msg in ListMessagesWithLabels(service, 'me', labelsdict['swaptions']):
if msg['id'] not in current_msgs:
try:
- subject, content = msg_content(get_msg(service, 'me', msg['id']))
+ subject, content, date = msg_content(get_msg(service, 'me', msg['id']))
except (binascii.Error, KeyError):
print("error decoding {0}".format(msg['id']))
continue
else:
email = p / msg['id']
with email.open("w") as fh:
+ fh.write(date + "\r\n")
fh.write(subject + "\r\n")
fh.write(content)
diff --git a/python/parse_emails.py b/python/parse_emails.py
index 3b25cf20..65cb7810 100644
--- a/python/parse_emails.py
+++ b/python/parse_emails.py
@@ -3,6 +3,7 @@ import re
from pathlib import Path
import pdb
from download_emails import update_emails
+import datetime
def makedf(r, indextype, ref):
if indextype=='IG':
@@ -19,75 +20,86 @@ def makedf(r, indextype, ref):
for k in df:
if df.dtypes[k]=='object':
df[k] = pd.to_numeric(df[k])
- df.set_index('strike', inplace=True)
+ df.set_index('Strike', inplace=True)
return df
-if __name__=="__main__":
- update_emails()
- emails = [f for f in Path("../../data/swaptions").iterdir() if f.is_file()]
- masterdf = {}
- for f in emails:
- with f.open("rb") as fh:
- subject = fh.readline()
- m = re.match("(?:Fwd:)?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8'))
- if m:
- indextype, series = m.groups()
- series = int(series)
- else:
- print("can't parse subject line for {0}".format(f))
- print(subject.decode("utf-8"))
- continue
- flag = False
- allexpiriesdf = {}
- for line in fh:
- line = line.decode('utf-8', 'ignore')
- line = line.rstrip()
- if line.startswith("At"):
- for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']:
- try:
- quotedate = pd.to_datetime(line, format=p, exact=False)
- except ValueError:
- continue
- else:
- if quotedate.year == 1900:
- quotedate = quotedate.replace(year=2015)
- break
- else:
- pdb.set_trace()
- if line.startswith("Ref"):
- m = re.match("Ref:(\S+)\s+(?:Fwd Px:(\S+)\s+)?Fwd(?: Spd)?:(\S+)\s+Fwd Bpv:(\S+)\s+Expiry:(\S+)",
- line)
- if m:
- if len(m.groups())==4:
- ref, fwspread, fwfwbpv, expiry = m.groups()
- elif len(m.groups())==5:
- ref, fwprice, fwspread, fwfwbpv, expiry = m.groups()
+def parse_email(email_path):
+ with email_path.open("rb") as fh:
+ date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000)
+ subject = fh.readline()
+ m = re.match("(?:Fwd:)?(?:BAML )?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8'))
+ if m:
+ indextype, series = m.groups()
+ series = int(series)
+ else:
+ raise RuntimeError("can't parse subject line: {0} for email {1}".format(
+ subject.decode("utf-8"), email_path.name))
+ flag = False
+ allexpiriesdf = {}
+ for line in fh:
+ line = line.decode('utf-8', 'ignore')
+ line = line.rstrip()
+ if line.startswith("At"):
+ for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']:
+ try:
+ quotedate = pd.to_datetime(line, format=p, exact=False)
+ except ValueError:
+ continue
else:
- print("something wrong with {0}".format(f))
- expiry = pd.datetime.strptime(expiry, '%d-%b-%y')
+ if quotedate.year == 1900:
+ quotedate = quotedate.replace(year=date_received.year)
+ break
+ else:
+ raise RuntimeError("can't parse date")
+ if line.startswith("Ref"):
+ m = re.match("Ref:(\S+)\s+(?:Fwd Px:(\S+)\s+)?Fwd(?: Spd)?:(\S+)\s+Fwd Bpv:(\S+)\s+Expiry:(\S+)",
+ line)
+ if m:
+ if len(m.groups())==4:
+ ref, fwspread, fwfwbpv, expiry = m.groups()
+ elif len(m.groups())==5:
+ ref, fwprice, fwspread, fwfwbpv, expiry = m.groups()
+ else:
+ print("something wrong with {0}".format(f))
+ expiry = pd.datetime.strptime(expiry, '%d-%b-%y')
+ continue
+ if line.startswith("Strike"):
+ if "Px Vol" in line:
+ indextype='HY'
+ else:
+ indextype='IG'
+ flag = True
+ r = []
+ continue
+ if flag:
+ if line:
+ line = re.sub("[/|]", " ", line)
+ vals = re.sub(" +", " ", line).rstrip().split(" ")
+ r.append(vals)
continue
- if line.startswith("Strike"):
- if "Px Vol" in line:
- indextype='HY'
- else:
- indextype='IG'
- flag = True
+ else:
+ allexpiriesdf[expiry] = makedf(r, indextype, ref)
+ flag = False
r = []
continue
- if flag:
- if line:
- line = re.sub("[/|]", " ", line)
- vals = re.sub(" +", " ", line).rstrip().split(" ")
- r.append(vals)
- continue
- else:
- allexpiriesdf[expiry] = makedf(r, indextype, ref)
- flag = False
- r = []
- continue
if flag:
allexpiriesdf[expiry] = makedf(r, indextype, ref)
if allexpiriesdf:
- masterdf[(quotedate, indextype, series)] = pd.concat(allexpiriesdf, names=['expiry', 'strike'])
+ return (quotedate, indextype, series), allexpiriesdf
+ else:
+ raise RuntimeError("empty email")
+
+if __name__=="__main__":
+ #update_emails()
+ emails = [f for f in Path("../../data/swaptions").iterdir() if f.is_file()]
+ masterdf = {}
+ for f in emails:
+ try:
+ key, allexpiriesdf = parse_email(f)
+ except RuntimeError as e:
+ print(e)
+ print(f.name)
+ else:
+ masterdf[key] = pd.concat(allexpiriesdf, names=['expiry', 'strike'])
masterdf = pd.concat(masterdf, names=['quotedate', 'indextype', 'series'])
masterdf.to_hdf('swaptions.hdf', key='swaptions')