diff options
| -rw-r--r-- | python/download_emails.py | 6 | ||||
| -rw-r--r-- | python/parse_emails.py | 134 |
2 files changed, 77 insertions, 63 deletions
diff --git a/python/download_emails.py b/python/download_emails.py index 5e0eb95b..be18b1f7 100644 --- a/python/download_emails.py +++ b/python/download_emails.py @@ -79,7 +79,8 @@ def msg_content(msg): """Extract subject and body from a gmail message""" subject = [x['value'] for x in msg['payload']['headers'] if x['name']=='Subject'][0] content = base64.b64decode(msg['payload']['body']['data']).decode('utf-8') - return subject, content + date = msg['internalDate'] ## date /1000 to get timestamp + return subject, content, date def update_emails(): """Download new emails that were labeled swaptions.""" @@ -91,13 +92,14 @@ def update_emails(): for msg in ListMessagesWithLabels(service, 'me', labelsdict['swaptions']): if msg['id'] not in current_msgs: try: - subject, content = msg_content(get_msg(service, 'me', msg['id'])) + subject, content, date = msg_content(get_msg(service, 'me', msg['id'])) except (binascii.Error, KeyError): print("error decoding {0}".format(msg['id'])) continue else: email = p / msg['id'] with email.open("w") as fh: + fh.write(date + "\r\n") fh.write(subject + "\r\n") fh.write(content) diff --git a/python/parse_emails.py b/python/parse_emails.py index 3b25cf20..65cb7810 100644 --- a/python/parse_emails.py +++ b/python/parse_emails.py @@ -3,6 +3,7 @@ import re from pathlib import Path import pdb from download_emails import update_emails +import datetime def makedf(r, indextype, ref): if indextype=='IG': @@ -19,75 +20,86 @@ def makedf(r, indextype, ref): for k in df: if df.dtypes[k]=='object': df[k] = pd.to_numeric(df[k]) - df.set_index('strike', inplace=True) + df.set_index('Strike', inplace=True) return df -if __name__=="__main__": - update_emails() - emails = [f for f in Path("../../data/swaptions").iterdir() if f.is_file()] - masterdf = {} - for f in emails: - with f.open("rb") as fh: - subject = fh.readline() - m = re.match("(?:Fwd:)?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8')) - if m: - indextype, series = m.groups() - series = int(series) - else: - print("can't parse subject line for {0}".format(f)) - print(subject.decode("utf-8")) - continue - flag = False - allexpiriesdf = {} - for line in fh: - line = line.decode('utf-8', 'ignore') - line = line.rstrip() - if line.startswith("At"): - for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']: - try: - quotedate = pd.to_datetime(line, format=p, exact=False) - except ValueError: - continue - else: - if quotedate.year == 1900: - quotedate = quotedate.replace(year=2015) - break - else: - pdb.set_trace() - if line.startswith("Ref"): - m = re.match("Ref:(\S+)\s+(?:Fwd Px:(\S+)\s+)?Fwd(?: Spd)?:(\S+)\s+Fwd Bpv:(\S+)\s+Expiry:(\S+)", - line) - if m: - if len(m.groups())==4: - ref, fwspread, fwfwbpv, expiry = m.groups() - elif len(m.groups())==5: - ref, fwprice, fwspread, fwfwbpv, expiry = m.groups() +def parse_email(email_path): + with email_path.open("rb") as fh: + date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000) + subject = fh.readline() + m = re.match("(?:Fwd:)?(?:BAML )?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8')) + if m: + indextype, series = m.groups() + series = int(series) + else: + raise RuntimeError("can't parse subject line: {0} for email {1}".format( + subject.decode("utf-8"), email_path.name)) + flag = False + allexpiriesdf = {} + for line in fh: + line = line.decode('utf-8', 'ignore') + line = line.rstrip() + if line.startswith("At"): + for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']: + try: + quotedate = pd.to_datetime(line, format=p, exact=False) + except ValueError: + continue else: - print("something wrong with {0}".format(f)) - expiry = pd.datetime.strptime(expiry, '%d-%b-%y') + if quotedate.year == 1900: + quotedate = quotedate.replace(year=date_received.year) + break + else: + raise RuntimeError("can't parse date") + if line.startswith("Ref"): + m = re.match("Ref:(\S+)\s+(?:Fwd Px:(\S+)\s+)?Fwd(?: Spd)?:(\S+)\s+Fwd Bpv:(\S+)\s+Expiry:(\S+)", + line) + if m: + if len(m.groups())==4: + ref, fwspread, fwfwbpv, expiry = m.groups() + elif len(m.groups())==5: + ref, fwprice, fwspread, fwfwbpv, expiry = m.groups() + else: + print("something wrong with {0}".format(f)) + expiry = pd.datetime.strptime(expiry, '%d-%b-%y') + continue + if line.startswith("Strike"): + if "Px Vol" in line: + indextype='HY' + else: + indextype='IG' + flag = True + r = [] + continue + if flag: + if line: + line = re.sub("[/|]", " ", line) + vals = re.sub(" +", " ", line).rstrip().split(" ") + r.append(vals) continue - if line.startswith("Strike"): - if "Px Vol" in line: - indextype='HY' - else: - indextype='IG' - flag = True + else: + allexpiriesdf[expiry] = makedf(r, indextype, ref) + flag = False r = [] continue - if flag: - if line: - line = re.sub("[/|]", " ", line) - vals = re.sub(" +", " ", line).rstrip().split(" ") - r.append(vals) - continue - else: - allexpiriesdf[expiry] = makedf(r, indextype, ref) - flag = False - r = [] - continue if flag: allexpiriesdf[expiry] = makedf(r, indextype, ref) if allexpiriesdf: - masterdf[(quotedate, indextype, series)] = pd.concat(allexpiriesdf, names=['expiry', 'strike']) + return (quotedate, indextype, series), allexpiriesdf + else: + raise RuntimeError("empty email") + +if __name__=="__main__": + #update_emails() + emails = [f for f in Path("../../data/swaptions").iterdir() if f.is_file()] + masterdf = {} + for f in emails: + try: + key, allexpiriesdf = parse_email(f) + except RuntimeError as e: + print(e) + print(f.name) + else: + masterdf[key] = pd.concat(allexpiriesdf, names=['expiry', 'strike']) masterdf = pd.concat(masterdf, names=['quotedate', 'indextype', 'series']) masterdf.to_hdf('swaptions.hdf', key='swaptions') |
