diff options
Diffstat (limited to 'python/quote_parsing')
| -rw-r--r-- | python/quote_parsing/parse_emails.py | 263 |
1 files changed, 171 insertions, 92 deletions
diff --git a/python/quote_parsing/parse_emails.py b/python/quote_parsing/parse_emails.py index 47a17e9e..21c1b9f1 100644 --- a/python/quote_parsing/parse_emails.py +++ b/python/quote_parsing/parse_emails.py @@ -19,30 +19,48 @@ def list_imm_dates(date): def makedf(r, indextype, quote_source): - if indextype == 'IG': - cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid', - 'pay_offer', 'delta_pay', 'vol'] + if indextype == "IG": + cols = [ + "strike", + "rec_bid", + "rec_offer", + "delta_rec", + "pay_bid", + "pay_offer", + "delta_pay", + "vol", + ] else: - cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid', - 'pay_offer', 'delta_pay', 'vol', 'price_vol'] + cols = [ + "strike", + "rec_bid", + "rec_offer", + "delta_rec", + "pay_bid", + "pay_offer", + "delta_pay", + "vol", + "price_vol", + ] if quote_source == "BAML": - cols.append('gamma') + cols.append("gamma") if quote_source == "GS": cols.append("tail") df = pd.DataFrame.from_records(r, columns=cols) - for col in ['delta_rec', 'delta_pay', 'vol', 'price_vol', 'gamma', 'tail']: + for col in ["delta_rec", "delta_pay", "vol", "price_vol", "gamma", "tail"]: if col in df: try: - df[col] = df[col].str.rstrip("%").astype('float') / 100 - except ValueError: #typo in one email - df[col] = (df[col].str.rstrip("%"). - str.replace("n", "").astype("float") / 100) + df[col] = df[col].str.rstrip("%").astype("float") / 100 + except ValueError: # typo in one email + df[col] = ( + df[col].str.rstrip("%").str.replace("n", "").astype("float") / 100 + ) if quote_source == "GS": for col in ["pay_bid", "pay_offer", "rec_bid", "rec_offer"]: - df[col] = df[col].str.strip('-') - df['delta_pay'] *= -1 + df[col] = df[col].str.strip("-") + df["delta_pay"] *= -1 for k in df: - if df.dtypes[k] == 'object': + if df.dtypes[k] == "object": df[k] = df[k].str.replace(",", "") try: df[k] = pd.to_numeric(df[k]) @@ -51,7 +69,7 @@ def makedf(r, indextype, quote_source): logger.error("couldn't convert column") df[k] = pd.to_numeric(df[k].str.replace("n", "")) breakpoint() - df.set_index('strike', inplace=True) + df.set_index("strike", inplace=True) return df @@ -59,8 +77,12 @@ def parse_quotedate(fh, date_received): for line in fh: line = line.rstrip() if "At:" in line or "Sent:" in line: - for p in ["%m/%d/%y %H:%M:%S", "%b %d %Y %H:%M:%S", "%m/%d %H:%M:%S", - "%B %d, %Y %I:%M %p"]: + for p in [ + "%m/%d/%y %H:%M:%S", + "%b %d %Y %H:%M:%S", + "%m/%d %H:%M:%S", + "%B %d, %Y %I:%M %p", + ]: try: quotedate = pd.to_datetime(line, format=p, exact=False) except ValueError: @@ -78,13 +100,15 @@ def parse_quotedate(fh, date_received): def parse_refline(line): - regex = r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \ - r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \ + regex = ( + r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" + r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" r"\s+Expiry:(?P<expiry>\S+)" + ) m = re.match(regex, line) try: d = m.groupdict() - d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y') + d["expiry"] = pd.to_datetime(d["expiry"], format="%d-%b-%y") except AttributeError: raise RuntimeError(f"can't parse refline {line}") return d @@ -102,16 +126,15 @@ def parse_baml(fh, indextype, series, quotedate, *args): break if line.startswith("Ref"): d = parse_refline(line) - d.update({'quotedate': quotedate, 'index': indextype, 'series': series}) + d.update({"quotedate": quotedate, "index": indextype, "series": series}) df, line = parse_baml_block(fh, indextype) - option_stack[d['expiry']] = df + option_stack[d["expiry"]] = df fwd_index.append(d) else: line = "" if option_stack: - fwd_index = pd.DataFrame.from_records(fwd_index, - index='quotedate') - fwd_index['quote_source'] = 'BAML' + fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate") + fwd_index["quote_source"] = "BAML" return option_stack, fwd_index else: raise RuntimeError("empty email: " + fh.name) @@ -159,15 +182,15 @@ def parse_ms_block(fh, indextype): except ValueError: raise RuntimeError("Couldn't parse line: {line}") - vals = [strike, rec_bid, rec_offer, rec_delta, - pay_bid, pay_offer, pay_delta] + vals = [strike, rec_bid, rec_offer, rec_delta, pay_bid, pay_offer, pay_delta] vol = vol.strip() if indextype == "HY": try: price_vol, vol = vol.replace("[", "").replace("]", "").split() except ValueError: - price_vol, vol, vol_change, be = (vol.replace("[", ""). - replace("]", "").split()) + price_vol, vol, vol_change, be = ( + vol.replace("[", "").replace("]", "").split() + ) vals += [vol, price_vol] else: if " " in vol: @@ -191,8 +214,16 @@ def parse_nomura_block(fh, indextype): pay_bid, pay_offer = pay.split("/") rec_bid, rec_offer = rec.split("/") vol = vol.strip() - vals = [strike, rec_bid, rec_offer, rec_delta, - pay_bid, pay_offer, pay_delta, vol] + vals = [ + strike, + rec_bid, + rec_offer, + rec_delta, + pay_bid, + pay_offer, + pay_delta, + vol, + ] if indextype == "HY": # we don't have price vol vals.append(None) r.append(vals) @@ -213,7 +244,9 @@ def parse_sg_block(fh, indextype, expiration_dates): option_type, strike, strike_spread, price, delta, vol, expiry = line.split() expiry_month = datetime.datetime.strptime(expiry, "%b-%y").month - expiry = next(pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month) + expiry = next( + pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month + ) if option_type == "Rec": rec_bid, rec_offer = price.split("/") pay_bid, pay_offer = None, None @@ -222,8 +255,16 @@ def parse_sg_block(fh, indextype, expiration_dates): pay_bid, pay_offer = price.split("/") rec_bid, rec_offer = None, None rec_delta, pay_delta = None, delta - vals = [strike, rec_bid, rec_offer, rec_delta, pay_bid, - pay_offer, pay_delta, vol] + vals = [ + strike, + rec_bid, + rec_offer, + rec_delta, + pay_bid, + pay_offer, + pay_delta, + vol, + ] if indextype == "HY": vals.append(None) r.append(vals) @@ -231,7 +272,7 @@ def parse_sg_block(fh, indextype, expiration_dates): def parse_gs_block(fh, indextype): - #skip header + # skip header while True: line = next(fh) if line.strip().startswith("Stk"): @@ -245,7 +286,7 @@ def parse_gs_block(fh, indextype): if line.startswith("Expiry") or line.startswith("Assumes"): break vals = line.split() - if indextype == 'HY': + if indextype == "HY": vals.pop(2) vals.pop(9) else: @@ -253,7 +294,7 @@ def parse_gs_block(fh, indextype): vals.pop(8) strike = vals.pop(0) if indextype == "HY": - vals.pop(0) # pop the spread + vals.pop(0) # pop the spread pay, pay_delta = vals[:2] pay_bid, pay_offer = pay.split("/") rec_bid, rec_offer = vals[2].split("/") @@ -266,8 +307,9 @@ def parse_gs_block(fh, indextype): r.append(vals) return makedf(r, indextype, "GS"), line + def parse_citi_block(fh, indextype): - next(fh) #skip header + next(fh) # skip header r = [] for line in fh: line = line.rstrip() @@ -290,13 +332,35 @@ def parse_citi_block(fh, indextype): vol = vol.split()[0] if indextype == "HY": price_vol = price_vol.strip() - r.append([strike, rec_bid, rec_offer, rec_delta, - pay_bid, pay_offer, pay_delta, vol, price_vol]) + r.append( + [ + strike, + rec_bid, + rec_offer, + rec_delta, + pay_bid, + pay_offer, + pay_delta, + vol, + price_vol, + ] + ) else: - r.append([strike, rec_bid, rec_offer, rec_delta, - pay_bid, pay_offer, pay_delta, vol]) + r.append( + [ + strike, + rec_bid, + rec_offer, + rec_delta, + pay_bid, + pay_offer, + pay_delta, + vol, + ] + ) return makedf(r, indextype, "CITI") + def parse_ms(fh, indextype, *args): option_stack = {} for line in fh: @@ -325,6 +389,7 @@ def parse_nom(fh, indextype, *args): aux(next_line, fh, indextype, option_stack) else: raise RuntimeError(f"Don't know what to do with {line}.") + for line in fh: line = line.rstrip() if "EXPIRY" in line: @@ -345,8 +410,7 @@ def parse_sg(fh, indextype, expiration_dates): def parse_gs(fh, indextype, series, quotedate, ref): option_stack = {} fwd_index = [] - d = {'quotedate': quotedate, 'index': indextype, - 'series': series, 'ref': ref} + d = {"quotedate": quotedate, "index": indextype, "series": series, "ref": ref} pat = re.compile(r"Expiry (\d{2}\w{3}\d{2}) \((?:([\S]+) )?([\S]+)\)") line = next(fh).strip() @@ -355,9 +419,10 @@ def parse_gs(fh, indextype, series, quotedate, ref): m = pat.match(line) if m: expiry, fwdprice, fwdspread = m.groups() - expiry = pd.to_datetime(expiry, format='%d%b%y') - d.update({'fwdspread': fwdspread, 'fwdprice': fwdprice, - 'expiry': expiry}) + expiry = pd.to_datetime(expiry, format="%d%b%y") + d.update( + {"fwdspread": fwdspread, "fwdprice": fwdprice, "expiry": expiry} + ) fwd_index.append(d.copy()) option_stack[expiry], line = parse_gs_block(fh, indextype) else: @@ -370,17 +435,15 @@ def parse_gs(fh, indextype, series, quotedate, ref): except StopIteration: break - fwd_index = pd.DataFrame.from_records(fwd_index, - index='quotedate') - fwd_index['quote_source'] = 'GS' + fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate") + fwd_index["quote_source"] = "GS" return option_stack, fwd_index + def parse_citi(fh, indextype, series, quotedate): option_stack = {} fwd_index = [] - d = {'quotedate': quotedate, - 'index': indextype, - 'series': series} + d = {"quotedate": quotedate, "index": indextype, "series": series} pat = re.compile(r"Exp: (\d{2}-\w{3}-\d{2})[^R]*Ref:[^\d]*([\d.]+)") for line in fh: line = line.strip() @@ -388,33 +451,35 @@ def parse_citi(fh, indextype, series, quotedate): m = pat.match(line) if m: expiry, ref = m.groups() - expiry = pd.to_datetime(expiry, format='%d-%b-%y') - d.update({'ref': ref, - 'expiry': expiry}) + expiry = pd.to_datetime(expiry, format="%d-%b-%y") + d.update({"ref": ref, "expiry": expiry}) fwd_index.append(d.copy()) option_stack[expiry] = parse_citi_block(fh, indextype) else: logger.error("Cant't parse expiry line:", line) - fwd_index = pd.DataFrame.from_records(fwd_index, - index='quotedate') - fwd_index['quote_source'] = 'CITI' + fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate") + fwd_index["quote_source"] = "CITI" return option_stack, fwd_index + subject_baml = re.compile(r"(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s") -subject_ms = re.compile(r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)") +subject_ms = re.compile( + r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)" +) subject_nom = re.compile(r"(?:Fwd:)?CDX (IG|HY)(\d{2}).*- REF:[^\d]*([\d.]+)") subject_gs = re.compile(r"(?:FW: |Fwd: )?GS (IG|HY)(\d{2}) 5y.*- Ref [^\d]*([\d.]+)") subject_sg = re.compile(r"SG OPTIONS - CDX (IG|HY) S(\d{2}).* REF[^\d]*([\d.]+)") subject_citi = re.compile(r"(?:Fwd:)?Citi Options: (IG|HY)(\d{2}) 5Y") + def parse_email(email, date_received): with email.open("rt") as fh: subject = fh.readline().lstrip() - for source in ['BAML', 'GS', 'MS', 'NOM', 'SG', 'CITI']: - m = globals()[f'subject_{source.lower()}'].match(subject) + for source in ["BAML", "GS", "MS", "NOM", "SG", "CITI"]: + m = globals()[f"subject_{source.lower()}"].match(subject) if m: - if source in ['BAML', 'CITI']: + if source in ["BAML", "CITI"]: indextype, series = m.groups() else: indextype, series, ref = m.groups() @@ -424,43 +489,55 @@ def parse_email(email, date_received): try: quotedate = parse_quotedate(fh, date_received) except RuntimeError: - logger.warning("couldn't find received date in message: " - f"{email.name}, using {date_received}") - quotedate = pd.Timestamp(date_received).tz_localize("America/New_York") + logger.warning( + "couldn't find received date in message: " + f"{email.name}, using {date_received}" + ) + quotedate = pd.Timestamp(date_received).tz_localize( + "America/New_York" + ) fh.seek(cur_pos) expiration_dates = list_imm_dates(quotedate) - parse_fun = globals()[f'parse_{source.lower()}'] + parse_fun = globals()[f"parse_{source.lower()}"] key = (quotedate, indextype, series, source) - if source in ['BAML', 'CITI']: + if source in ["BAML", "CITI"]: return (key, parse_fun(fh, indextype, series, quotedate)) - parse_fun(fh, indextype, series, quotedate) elif source == "GS": return (key, parse_fun(fh, indextype, series, quotedate, ref)) - parse_fun(fh, indextype, series, quotedate, ref) else: option_stack = parse_fun(fh, indextype, expiration_dates) - fwd_index = pd.DataFrame({'quotedate': quotedate, - 'ref': ref, - 'index': indextype, - 'series': series, - 'expiry': list(option_stack.keys()), - 'quote_source': source}) - fwd_index.set_index('quotedate', inplace=True) + fwd_index = pd.DataFrame( + { + "quotedate": quotedate, + "ref": ref, + "index": indextype, + "series": series, + "expiry": list(option_stack.keys()), + "quote_source": source, + } + ) + fwd_index.set_index("quotedate", inplace=True) return (key, (option_stack, fwd_index)) else: - raise RuntimeError(f"can't parse subject line: {subject} for email {email.name}") + raise RuntimeError( + f"can't parse subject line: {subject} for email {email.name}" + ) + def write_todb(swaption_stack, index_data, conn): def gen_sql_str(query, table_name, columns): - return query.format(sql.Identifier(table_name), - sql.SQL(", ").join(sql.Identifier(c) for c in columns), - sql.SQL(", ").join(sql.Placeholder() * len(columns))) - query = sql.SQL("INSERT INTO {}({}) VALUES({}) " - "ON CONFLICT DO NOTHING RETURNING ref_id") + return query.format( + sql.Identifier(table_name), + sql.SQL(", ").join(sql.Identifier(c) for c in columns), + sql.SQL(", ").join(sql.Placeholder() * len(columns)), + ) + + query = sql.SQL( + "INSERT INTO {}({}) VALUES({}) " "ON CONFLICT DO NOTHING RETURNING ref_id" + ) sql_str = gen_sql_str(query, "swaption_ref_quotes", index_data.columns) - query = sql.SQL("INSERT INTO {}({}) VALUES({}) " - "ON CONFLICT DO NOTHING") + query = sql.SQL("INSERT INTO {}({}) VALUES({}) " "ON CONFLICT DO NOTHING") with conn.cursor() as c: for t in index_data.itertuples(index=False): c.execute(sql_str, t) @@ -481,9 +558,11 @@ def write_todb(swaption_stack, index_data, conn): continue except IndexError: breakpoint() - df['ref_id'] = ref_id - c.executemany(gen_sql_str(query, "swaption_quotes", df.columns), - df.itertuples(index=False)) + df["ref_id"] = ref_id + c.executemany( + gen_sql_str(query, "swaption_quotes", df.columns), + df.itertuples(index=False), + ) conn.commit() @@ -496,10 +575,10 @@ def get_email_list(date): """ with open(".pickle", "rb") as fh: already_uploaded = pickle.load(fh) - df = pd.DataFrame.from_dict(already_uploaded, orient='index') - df.columns = ['quotedate'] - df = df.reset_index().set_index('quotedate') - return df.loc[date, 'index'].tolist() + df = pd.DataFrame.from_dict(already_uploaded, orient="index") + df.columns = ["quotedate"] + df = df.reset_index().set_index("quotedate") + return df.loc[date, "index"].tolist() def pickle_drop_date(date): |
