Diffstat (limited to 'python')
-rw-r--r--  python/parse_emails.py  96
1 file changed, 70 insertions, 26 deletions
diff --git a/python/parse_emails.py b/python/parse_emails.py
index 5c5fb8bd..c31bf1f7 100644
--- a/python/parse_emails.py
+++ b/python/parse_emails.py
@@ -40,13 +40,13 @@ def parse_quotedate(fh, date_received):
     for line in fh:
         line = line.rstrip()
         if line.startswith("At"):
-            for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']:
+            for p in ['%m/%d/%y %H:%M:%S', '%b %d %Y %H:%M:%S', '%m/%d %H:%M:%S']:
                 try:
                     quotedate = pd.to_datetime(line, format=p, exact=False)
                 except ValueError:
                     continue
                 else:
-                    if quotedate.year == 1900:
+                    if quotedate.year == 1900: # p='%m/%d %H:%M:%S'
                         quotedate = quotedate.replace(year=date_received.year)
                     break
             else:
@@ -132,6 +132,27 @@ def parse_ms_block(fh, indextype):
         r.append(vals)
     return makedf(r, indextype, "MS")
 
+def parse_nomura_block(fh, indextype):
+    next(fh) ## skip header
+    r = []
+    for line in fh:
+        line = line.rstrip()
+        if line == "":
+            break
+        strike, receiver, payer, vol, _ = line.split("|", 4)
+        strike = strike.strip()
+        pay, pay_delta = payer.strip().split()
+        rec, rec_delta = receiver.strip().split()
+        pay_bid, pay_offer = pay.split("/")
+        rec_bid, rec_offer = rec.split("/")
+        vol = vol.strip()
+        vals = [strike, rec_bid, rec_offer, rec_delta,
+                pay_bid, pay_offer, pay_delta, vol]
+        if indextype == "HY": # we don't have price vol
+            vals.append(None)
+        r.append(vals)
+    return makedf(r, indextype, "NOM")
+
 def parse_ms(fh, indextype):
     option_stack = {}
     for line in fh:
@@ -142,33 +163,46 @@ def parse_ms(fh, indextype):
             option_stack[expiry] = parse_ms_block(fh, indextype)
     return option_stack
 
-subject_BAML = re.compile("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s")
-subject_MS = re.compile("\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^\d]*([\d.]+)")
+def parse_nomura(fh, indextype):
+    option_stack = {}
+    for line in fh:
+        line = line.rstrip()
+        if "EXPIRY" in line:
+            expiry = line.split(" ")[0]
+            expiry = pd.to_datetime(expiry, format="%d-%b-%y")
+            option_stack[expiry] = parse_nomura_block(fh, indextype)
+    return option_stack
+
+subject_baml = re.compile("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s")
+subject_ms = re.compile("[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)")
+subject_nomura = re.compile("(?:Fwd:)?CDX (IG|HY)(\d{2}).*- REF:[^\d]*([\d.]+)")
 
 def parse_email(email):
     with open(email.path, "rt") as fh:
         date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000)
         subject = next(fh)
-        m = subject_BAML.match(subject)
-        if m:
-            indextype, series = m.groups()
-            series = int(series)
-            quotedate = parse_quotedate(fh, date_received)
-            return (quotedate, indextype, series), parse_baml(fh, indextype, series, quotedate)
-        m = subject_MS.match(subject)
-        if m:
-            indextype, series, ref = m.groups()
-            series = int(series)
-            ref = float(ref)
-            quotedate = parse_quotedate(fh, date_received)
-            option_stack = parse_ms(fh, indextype)
-            fwd_index = pd.DataFrame({'quotedate': quotedate,
-                                      'ref': ref,
-                                      'index': indextype,
-                                      'series': series,
-                                      'expiry': list(option_stack.keys())})
-            fwd_index.set_index('quotedate', inplace = True)
-            return (quotedate, indextype, series), (option_stack, fwd_index)
+        for source in ['BAML', 'MS', 'NOMURA']:
+            m = globals()['subject_'+source.lower()].match(subject)
+            if m:
+                if source == 'BAML':
+                    indextype, series = m.groups()
+                else:
+                    indextype, series, ref = m.groups()
+                    ref = float(ref)
+                series = int(series)
+                quotedate = parse_quotedate(fh, date_received)
+
+                if source == 'BAML':
+                    return (quotedate, indextype, series), parse_baml(fh, indextype, series, quotedate)
+                else:
+                    option_stack = globals()['parse_'+source.lower()](fh, indextype)
+                    fwd_index = pd.DataFrame({'quotedate': quotedate,
+                                              'ref': ref,
+                                              'index': indextype,
+                                              'series': series,
+                                              'expiry': list(option_stack.keys())})
+                    fwd_index.set_index('quotedate', inplace = True)
+                    return (quotedate, indextype, series), (option_stack, fwd_index)
     raise RuntimeError("can't parse subject line: {0} for email {1}".format(
         subject, email.name))
 
@@ -180,7 +214,9 @@ def write_todb(swaption_stack, index_data):
     psycopg2.extensions.register_adapter(float, nan_to_null)
     meta = MetaData(bind=serenitasdb)
     swaption_quotes = Table('swaption_quotes', meta, autoload=True)
-    ins = swaption_quotes.insert().values(swaption_stack.to_dict(orient='records')).execute()
+    for r in swaption_stack.to_dict(orient='records'):
+        serenitasdb.execute(swaption_quotes.insert(), r)
+    #ins = swaption_quotes.insert().values(swaption_stack.to_dict(orient='records')).execute()
     index_data.to_sql('swaption_ref_quotes', serenitasdb, if_exists='append', index=False)
 
 def get_email_list(date):
@@ -197,6 +233,13 @@
     df = df.reset_index().set_index('quotedate')
     return df.loc[date,'index'].tolist()
 
+def pickle_drop_date(date):
+    with open(".pickle", "rb") as fh:
+        already_uploaded = pickle.load(fh)
+    newdict = {k: v for k, v in already_uploaded.items() if v.date() != date}
+    with open(".pickle", "wb") as fh:
+        pickle.dump(newdict, fh)
+
 if __name__=="__main__":
     update_emails()
     data_dir = os.path.join(os.getenv("DATA_DIR"), "swaptions")
@@ -223,7 +266,8 @@ if __name__=="__main__":
     if index_data.empty:
         sys.exit()
     for col in ['fwdbpv', 'fwdprice', 'fwdspread', 'ref']:
-        index_data[col] = index_data[col].astype('float')
+        if col in index_data:
+            index_data[col] = index_data[col].astype('float')
     index_data['index'] = index_data['index'].astype('category')
 
     swaption_stack = pd.concat(swaption_stack, names=['quotedate', 'index', 'series'])
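As a sanity check on the new Nomura block parser, here is a minimal, self-contained sketch of the unpacking steps. The sample quote line is hypothetical (the actual broker layout is not part of this commit); only the splitting steps mirror parse_nomura_block():

    # Hypothetical pipe-delimited Nomura quote line, assumed to be laid out as
    # strike | receiver bid/offer delta | payer bid/offer delta | vol | <rest>
    line = "100 | 30/36 25 | 41/47 30 | 42.5 |"

    strike, receiver, payer, vol, _ = line.split("|", 4)
    rec, rec_delta = receiver.strip().split()
    pay, pay_delta = payer.strip().split()
    rec_bid, rec_offer = rec.split("/")
    pay_bid, pay_offer = pay.split("/")
    print(strike.strip(), rec_bid, rec_offer, rec_delta,
          pay_bid, pay_offer, pay_delta, vol.strip())
    # -> 100 30 36 25 41 47 30 42.5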

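The subject-line dispatch in parse_email() now selects the parser by regex. The patterns below are copied verbatim from the patch; the three sample subjects are invented purely to illustrate which capture groups each source yields:

    import re

    subject_baml = re.compile("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s")
    subject_ms = re.compile("[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)")
    subject_nomura = re.compile("(?:Fwd:)?CDX (IG|HY)(\d{2}).*- REF:[^\d]*([\d.]+)")

    samples = {  # invented subject lines, for illustration only
        'BAML': "IG25 payer/receiver runs",
        'MS': "$$ MS CDX OPTIONS: HY25 - REF 107.25",
        'NOMURA': "Fwd:CDX IG25 OPTIONS - REF: 52.5",
    }
    for source, subject in samples.items():
        m = globals()['subject_' + source.lower()].match(subject)
        print(source, m.groups() if m else None)
    # BAML ('IG', '25')
    # MS ('HY', '25', '107.25')
    # NOMURA ('IG', '25', '52.5')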