diff options
Diffstat (limited to 'python/parse_emails.py')
| -rw-r--r-- | python/parse_emails.py | 66 |
1 files changed, 38 insertions, 28 deletions
diff --git a/python/parse_emails.py b/python/parse_emails.py index 65cb7810..65a8b091 100644 --- a/python/parse_emails.py +++ b/python/parse_emails.py @@ -27,7 +27,7 @@ def parse_email(email_path): with email_path.open("rb") as fh: date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000) subject = fh.readline() - m = re.match("(?:Fwd:)?(?:BAML )?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8')) + m = re.match("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8')) if m: indextype, series = m.groups() series = int(series) @@ -35,7 +35,8 @@ def parse_email(email_path): raise RuntimeError("can't parse subject line: {0} for email {1}".format( subject.decode("utf-8"), email_path.name)) flag = False - allexpiriesdf = {} + option_stack = {} + fwd_index = [] for line in fh: line = line.decode('utf-8', 'ignore') line = line.rstrip() @@ -52,22 +53,20 @@ def parse_email(email_path): else: raise RuntimeError("can't parse date") if line.startswith("Ref"): - m = re.match("Ref:(\S+)\s+(?:Fwd Px:(\S+)\s+)?Fwd(?: Spd)?:(\S+)\s+Fwd Bpv:(\S+)\s+Expiry:(\S+)", - line) - if m: - if len(m.groups())==4: - ref, fwspread, fwfwbpv, expiry = m.groups() - elif len(m.groups())==5: - ref, fwprice, fwspread, fwfwbpv, expiry = m.groups() - else: - print("something wrong with {0}".format(f)) - expiry = pd.datetime.strptime(expiry, '%d-%b-%y') + regex = "Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \ + "Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \ + "\s+Expiry:(?P<expiry>\S+)" + m = re.match(regex, line) + try: + d = m.groupdict() + d['quotedate'] = quotedate + d['index'] = indextype + d['series'] = series + d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y') + except AttributeError: + print("something wrong with {0}".format(email_path.name)) continue if line.startswith("Strike"): - if "Px Vol" in line: - indextype='HY' - else: - indextype='IG' flag = True r = [] continue @@ -78,28 +77,39 @@ def parse_email(email_path): r.append(vals) continue else: - allexpiriesdf[expiry] = makedf(r, indextype, ref) + option_stack[d['expiry']] = makedf(r, indextype, d['ref']) + fwd_index.append(d) flag = False r = [] continue if flag: - allexpiriesdf[expiry] = makedf(r, indextype, ref) - if allexpiriesdf: - return (quotedate, indextype, series), allexpiriesdf + option_stack[d['expiry']] = makedf(r, indextype, d['ref']) + fwd_index.append(d) + if option_stack: + fwd_index = pd.DataFrame.from_records(fwd_index, + index='quotedate') + return (quotedate, indextype, series), option_stack, fwd_index else: - raise RuntimeError("empty email") + raise RuntimeError("empty email: {0}".format(email_path.name)) if __name__=="__main__": - #update_emails() + update_emails() emails = [f for f in Path("../../data/swaptions").iterdir() if f.is_file()] - masterdf = {} + swaption_stack = {} + index_data = pd.DataFrame() for f in emails: try: - key, allexpiriesdf = parse_email(f) + key, option_stack, fwd_index = parse_email(f) except RuntimeError as e: print(e) - print(f.name) else: - masterdf[key] = pd.concat(allexpiriesdf, names=['expiry', 'strike']) - masterdf = pd.concat(masterdf, names=['quotedate', 'indextype', 'series']) - masterdf.to_hdf('swaptions.hdf', key='swaptions') + swaption_stack[key] = pd.concat(option_stack, names=['expiry', 'strike']) + index_data = index_data.append(fwd_index) + for col in ['fwdbpv', 'fwdprice', 'fwdspread', 'ref']: + index_data[col] = index_data[col].astype('float') + index_data['index'] = index_data['index'].astype('category') + swaption_stack = pd.concat(swaption_stack, names=['quotedate', 'indextype', 'series']) + with pd.HDFStore('swaptions.hdf', mode = 'w', complevel=4, + complib='blosc', fletcher32=True) as swaptions: + swaptions.append('swaptions', swaption_stack) + swaptions.append('index_data', index_data) |
