-rw-r--r--  python/quote_parsing/parse_emails.py  | 263
1 file changed, 171 insertions(+), 92 deletions(-)
diff --git a/python/quote_parsing/parse_emails.py b/python/quote_parsing/parse_emails.py
index 47a17e9e..21c1b9f1 100644
--- a/python/quote_parsing/parse_emails.py
+++ b/python/quote_parsing/parse_emails.py
@@ -19,30 +19,48 @@ def list_imm_dates(date):
def makedf(r, indextype, quote_source):
- if indextype == 'IG':
- cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid',
- 'pay_offer', 'delta_pay', 'vol']
+ if indextype == "IG":
+ cols = [
+ "strike",
+ "rec_bid",
+ "rec_offer",
+ "delta_rec",
+ "pay_bid",
+ "pay_offer",
+ "delta_pay",
+ "vol",
+ ]
else:
- cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid',
- 'pay_offer', 'delta_pay', 'vol', 'price_vol']
+ cols = [
+ "strike",
+ "rec_bid",
+ "rec_offer",
+ "delta_rec",
+ "pay_bid",
+ "pay_offer",
+ "delta_pay",
+ "vol",
+ "price_vol",
+ ]
if quote_source == "BAML":
- cols.append('gamma')
+ cols.append("gamma")
if quote_source == "GS":
cols.append("tail")
df = pd.DataFrame.from_records(r, columns=cols)
- for col in ['delta_rec', 'delta_pay', 'vol', 'price_vol', 'gamma', 'tail']:
+ for col in ["delta_rec", "delta_pay", "vol", "price_vol", "gamma", "tail"]:
if col in df:
try:
- df[col] = df[col].str.rstrip("%").astype('float') / 100
- except ValueError: #typo in one email
- df[col] = (df[col].str.rstrip("%").
- str.replace("n", "").astype("float") / 100)
+ df[col] = df[col].str.rstrip("%").astype("float") / 100
+ except ValueError: # typo in one email
+ df[col] = (
+ df[col].str.rstrip("%").str.replace("n", "").astype("float") / 100
+ )
if quote_source == "GS":
for col in ["pay_bid", "pay_offer", "rec_bid", "rec_offer"]:
- df[col] = df[col].str.strip('-')
- df['delta_pay'] *= -1
+ df[col] = df[col].str.strip("-")
+ df["delta_pay"] *= -1
for k in df:
- if df.dtypes[k] == 'object':
+ if df.dtypes[k] == "object":
df[k] = df[k].str.replace(",", "")
try:
df[k] = pd.to_numeric(df[k])
@@ -51,7 +69,7 @@ def makedf(r, indextype, quote_source):
logger.error("couldn't convert column")
df[k] = pd.to_numeric(df[k].str.replace("n", ""))
breakpoint()
- df.set_index('strike', inplace=True)
+ df.set_index("strike", inplace=True)
return df
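
As a quick illustration of the percentage-column cleanup above, here is the same pattern run on invented values (the stray "n" mimics the known email typo):

    import pandas as pd

    df = pd.DataFrame({"delta_rec": ["25%", "47%", "n63%"]})
    try:
        df["delta_rec"] = df["delta_rec"].str.rstrip("%").astype("float") / 100
    except ValueError:  # fallback strips the stray character before converting
        df["delta_rec"] = (
            df["delta_rec"].str.rstrip("%").str.replace("n", "").astype("float") / 100
        )
    print(df["delta_rec"].tolist())  # [0.25, 0.47, 0.63]
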
@@ -59,8 +77,12 @@ def parse_quotedate(fh, date_received):
for line in fh:
line = line.rstrip()
if "At:" in line or "Sent:" in line:
- for p in ["%m/%d/%y %H:%M:%S", "%b %d %Y %H:%M:%S", "%m/%d %H:%M:%S",
- "%B %d, %Y %I:%M %p"]:
+ for p in [
+ "%m/%d/%y %H:%M:%S",
+ "%b %d %Y %H:%M:%S",
+ "%m/%d %H:%M:%S",
+ "%B %d, %Y %I:%M %p",
+ ]:
try:
quotedate = pd.to_datetime(line, format=p, exact=False)
except ValueError:
@@ -78,13 +100,15 @@ def parse_quotedate(fh, date_received):
def parse_refline(line):
- regex = r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \
- r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \
+ regex = (
+ r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?"
+ r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)"
r"\s+Expiry:(?P<expiry>\S+)"
+ )
m = re.match(regex, line)
try:
d = m.groupdict()
- d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y')
+ d["expiry"] = pd.to_datetime(d["expiry"], format="%d-%b-%y")
except AttributeError:
raise RuntimeError(f"can't parse refline {line}")
return d
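
For reference, the refline regex applied to a fabricated header line (the quote values are illustrative only):

    import re
    import pandas as pd

    regex = (
        r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?"
        r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)"
        r"\s+Expiry:(?P<expiry>\S+)"
    )
    d = re.match(regex, "Ref:52.5 Fwd Spd:54.25 Fwd Bpv:2.1 Expiry:20-Mar-24").groupdict()
    d["expiry"] = pd.to_datetime(d["expiry"], format="%d-%b-%y")
    # d["ref"] == "52.5", d["fwdprice"] is None (the Fwd Px group is optional),
    # d["expiry"] == Timestamp("2024-03-20")
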
@@ -102,16 +126,15 @@ def parse_baml(fh, indextype, series, quotedate, *args):
break
if line.startswith("Ref"):
d = parse_refline(line)
- d.update({'quotedate': quotedate, 'index': indextype, 'series': series})
+ d.update({"quotedate": quotedate, "index": indextype, "series": series})
df, line = parse_baml_block(fh, indextype)
- option_stack[d['expiry']] = df
+ option_stack[d["expiry"]] = df
fwd_index.append(d)
else:
line = ""
if option_stack:
- fwd_index = pd.DataFrame.from_records(fwd_index,
- index='quotedate')
- fwd_index['quote_source'] = 'BAML'
+ fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
+ fwd_index["quote_source"] = "BAML"
return option_stack, fwd_index
else:
raise RuntimeError("empty email: " + fh.name)
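
The fwd_index assembly above can be sketched in isolation; the single record below is made up, but the from_records/index pattern is the one parse_baml uses:

    import pandas as pd

    fwd_index = [
        {
            "quotedate": pd.Timestamp("2023-03-15 14:02"),
            "index": "IG",
            "series": "40",
            "ref": "52.5",
            "fwdspread": "54.25",
            "fwdbpv": "2.1",
            "expiry": pd.Timestamp("2023-06-21"),
        }
    ]
    fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
    fwd_index["quote_source"] = "BAML"
    # one row per expiry, indexed by quotedate, tagged with its quote source
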
@@ -159,15 +182,15 @@ def parse_ms_block(fh, indextype):
except ValueError:
raise RuntimeError("Couldn't parse line: {line}")
- vals = [strike, rec_bid, rec_offer, rec_delta,
- pay_bid, pay_offer, pay_delta]
+ vals = [strike, rec_bid, rec_offer, rec_delta, pay_bid, pay_offer, pay_delta]
vol = vol.strip()
if indextype == "HY":
try:
price_vol, vol = vol.replace("[", "").replace("]", "").split()
except ValueError:
- price_vol, vol, vol_change, be = (vol.replace("[", "").
- replace("]", "").split())
+ price_vol, vol, vol_change, be = (
+ vol.replace("[", "").replace("]", "").split()
+ )
vals += [vol, price_vol]
else:
if " " in vol:
@@ -191,8 +214,16 @@ def parse_nomura_block(fh, indextype):
pay_bid, pay_offer = pay.split("/")
rec_bid, rec_offer = rec.split("/")
vol = vol.strip()
- vals = [strike, rec_bid, rec_offer, rec_delta,
- pay_bid, pay_offer, pay_delta, vol]
+ vals = [
+ strike,
+ rec_bid,
+ rec_offer,
+ rec_delta,
+ pay_bid,
+ pay_offer,
+ pay_delta,
+ vol,
+ ]
if indextype == "HY": # we don't have price vol
vals.append(None)
r.append(vals)
@@ -213,7 +244,9 @@ def parse_sg_block(fh, indextype, expiration_dates):
option_type, strike, strike_spread, price, delta, vol, expiry = line.split()
expiry_month = datetime.datetime.strptime(expiry, "%b-%y").month
- expiry = next(pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month)
+ expiry = next(
+ pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month
+ )
if option_type == "Rec":
rec_bid, rec_offer = price.split("/")
pay_bid, pay_offer = None, None
@@ -222,8 +255,16 @@ def parse_sg_block(fh, indextype, expiration_dates):
pay_bid, pay_offer = price.split("/")
rec_bid, rec_offer = None, None
rec_delta, pay_delta = None, delta
- vals = [strike, rec_bid, rec_offer, rec_delta, pay_bid,
- pay_offer, pay_delta, vol]
+ vals = [
+ strike,
+ rec_bid,
+ rec_offer,
+ rec_delta,
+ pay_bid,
+ pay_offer,
+ pay_delta,
+ vol,
+ ]
if indextype == "HY":
vals.append(None)
r.append(vals)
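
The month-matching step in parse_sg_block can be tried on its own; the candidate expiration dates below are invented:

    import datetime
    import pandas as pd

    expiration_dates = [datetime.date(2024, 2, 21), datetime.date(2024, 3, 20)]
    expiry_month = datetime.datetime.strptime("Mar-24", "%b-%y").month
    expiry = next(pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month)
    # expiry == Timestamp("2024-03-20"): the label only carries month/year, so the
    # full date comes from the precomputed IMM expiration list
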
@@ -231,7 +272,7 @@ def parse_sg_block(fh, indextype, expiration_dates):
def parse_gs_block(fh, indextype):
- #skip header
+ # skip header
while True:
line = next(fh)
if line.strip().startswith("Stk"):
@@ -245,7 +286,7 @@ def parse_gs_block(fh, indextype):
if line.startswith("Expiry") or line.startswith("Assumes"):
break
vals = line.split()
- if indextype == 'HY':
+ if indextype == "HY":
vals.pop(2)
vals.pop(9)
else:
@@ -253,7 +294,7 @@ def parse_gs_block(fh, indextype):
vals.pop(8)
strike = vals.pop(0)
if indextype == "HY":
- vals.pop(0) # pop the spread
+ vals.pop(0) # pop the spread
pay, pay_delta = vals[:2]
pay_bid, pay_offer = pay.split("/")
rec_bid, rec_offer = vals[2].split("/")
@@ -266,8 +307,9 @@ def parse_gs_block(fh, indextype):
r.append(vals)
return makedf(r, indextype, "GS"), line
+
def parse_citi_block(fh, indextype):
- next(fh) #skip header
+ next(fh) # skip header
r = []
for line in fh:
line = line.rstrip()
@@ -290,13 +332,35 @@ def parse_citi_block(fh, indextype):
vol = vol.split()[0]
if indextype == "HY":
price_vol = price_vol.strip()
- r.append([strike, rec_bid, rec_offer, rec_delta,
- pay_bid, pay_offer, pay_delta, vol, price_vol])
+ r.append(
+ [
+ strike,
+ rec_bid,
+ rec_offer,
+ rec_delta,
+ pay_bid,
+ pay_offer,
+ pay_delta,
+ vol,
+ price_vol,
+ ]
+ )
else:
- r.append([strike, rec_bid, rec_offer, rec_delta,
- pay_bid, pay_offer, pay_delta, vol])
+ r.append(
+ [
+ strike,
+ rec_bid,
+ rec_offer,
+ rec_delta,
+ pay_bid,
+ pay_offer,
+ pay_delta,
+ vol,
+ ]
+ )
return makedf(r, indextype, "CITI")
+
def parse_ms(fh, indextype, *args):
option_stack = {}
for line in fh:
@@ -325,6 +389,7 @@ def parse_nom(fh, indextype, *args):
aux(next_line, fh, indextype, option_stack)
else:
raise RuntimeError(f"Don't know what to do with {line}.")
+
for line in fh:
line = line.rstrip()
if "EXPIRY" in line:
@@ -345,8 +410,7 @@ def parse_sg(fh, indextype, expiration_dates):
def parse_gs(fh, indextype, series, quotedate, ref):
option_stack = {}
fwd_index = []
- d = {'quotedate': quotedate, 'index': indextype,
- 'series': series, 'ref': ref}
+ d = {"quotedate": quotedate, "index": indextype, "series": series, "ref": ref}
pat = re.compile(r"Expiry (\d{2}\w{3}\d{2}) \((?:([\S]+) )?([\S]+)\)")
line = next(fh).strip()
@@ -355,9 +419,10 @@ def parse_gs(fh, indextype, series, quotedate, ref):
m = pat.match(line)
if m:
expiry, fwdprice, fwdspread = m.groups()
- expiry = pd.to_datetime(expiry, format='%d%b%y')
- d.update({'fwdspread': fwdspread, 'fwdprice': fwdprice,
- 'expiry': expiry})
+ expiry = pd.to_datetime(expiry, format="%d%b%y")
+ d.update(
+ {"fwdspread": fwdspread, "fwdprice": fwdprice, "expiry": expiry}
+ )
fwd_index.append(d.copy())
option_stack[expiry], line = parse_gs_block(fh, indextype)
else:
@@ -370,17 +435,15 @@ def parse_gs(fh, indextype, series, quotedate, ref):
except StopIteration:
break
- fwd_index = pd.DataFrame.from_records(fwd_index,
- index='quotedate')
- fwd_index['quote_source'] = 'GS'
+ fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
+ fwd_index["quote_source"] = "GS"
return option_stack, fwd_index
+
def parse_citi(fh, indextype, series, quotedate):
option_stack = {}
fwd_index = []
- d = {'quotedate': quotedate,
- 'index': indextype,
- 'series': series}
+ d = {"quotedate": quotedate, "index": indextype, "series": series}
pat = re.compile(r"Exp: (\d{2}-\w{3}-\d{2})[^R]*Ref:[^\d]*([\d.]+)")
for line in fh:
line = line.strip()
@@ -388,33 +451,35 @@ def parse_citi(fh, indextype, series, quotedate):
m = pat.match(line)
if m:
expiry, ref = m.groups()
- expiry = pd.to_datetime(expiry, format='%d-%b-%y')
- d.update({'ref': ref,
- 'expiry': expiry})
+ expiry = pd.to_datetime(expiry, format="%d-%b-%y")
+ d.update({"ref": ref, "expiry": expiry})
fwd_index.append(d.copy())
option_stack[expiry] = parse_citi_block(fh, indextype)
else:
logger.error("Cant't parse expiry line:", line)
- fwd_index = pd.DataFrame.from_records(fwd_index,
- index='quotedate')
- fwd_index['quote_source'] = 'CITI'
+ fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
+ fwd_index["quote_source"] = "CITI"
return option_stack, fwd_index
+
subject_baml = re.compile(r"(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s")
-subject_ms = re.compile(r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)")
+subject_ms = re.compile(
+ r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)"
+)
subject_nom = re.compile(r"(?:Fwd:)?CDX (IG|HY)(\d{2}).*- REF:[^\d]*([\d.]+)")
subject_gs = re.compile(r"(?:FW: |Fwd: )?GS (IG|HY)(\d{2}) 5y.*- Ref [^\d]*([\d.]+)")
subject_sg = re.compile(r"SG OPTIONS - CDX (IG|HY) S(\d{2}).* REF[^\d]*([\d.]+)")
subject_citi = re.compile(r"(?:Fwd:)?Citi Options: (IG|HY)(\d{2}) 5Y")
+
def parse_email(email, date_received):
with email.open("rt") as fh:
subject = fh.readline().lstrip()
- for source in ['BAML', 'GS', 'MS', 'NOM', 'SG', 'CITI']:
- m = globals()[f'subject_{source.lower()}'].match(subject)
+ for source in ["BAML", "GS", "MS", "NOM", "SG", "CITI"]:
+ m = globals()[f"subject_{source.lower()}"].match(subject)
if m:
- if source in ['BAML', 'CITI']:
+ if source in ["BAML", "CITI"]:
indextype, series = m.groups()
else:
indextype, series, ref = m.groups()
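
To show what the subject-line patterns defined above capture, here is one of them matched against a fabricated subject:

    import re

    subject_ms = re.compile(
        r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)"
    )
    m = subject_ms.match("$$ MS CDX OPTIONS: IG41 5Y - REF 52.5")
    indextype, series, ref = m.groups()  # ("IG", "41", "52.5")
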
@@ -424,43 +489,55 @@ def parse_email(email, date_received):
try:
quotedate = parse_quotedate(fh, date_received)
except RuntimeError:
- logger.warning("couldn't find received date in message: "
- f"{email.name}, using {date_received}")
- quotedate = pd.Timestamp(date_received).tz_localize("America/New_York")
+ logger.warning(
+ "couldn't find received date in message: "
+ f"{email.name}, using {date_received}"
+ )
+ quotedate = pd.Timestamp(date_received).tz_localize(
+ "America/New_York"
+ )
fh.seek(cur_pos)
expiration_dates = list_imm_dates(quotedate)
- parse_fun = globals()[f'parse_{source.lower()}']
+ parse_fun = globals()[f"parse_{source.lower()}"]
key = (quotedate, indextype, series, source)
- if source in ['BAML', 'CITI']:
+ if source in ["BAML", "CITI"]:
return (key, parse_fun(fh, indextype, series, quotedate))
- parse_fun(fh, indextype, series, quotedate)
elif source == "GS":
return (key, parse_fun(fh, indextype, series, quotedate, ref))
- parse_fun(fh, indextype, series, quotedate, ref)
else:
option_stack = parse_fun(fh, indextype, expiration_dates)
- fwd_index = pd.DataFrame({'quotedate': quotedate,
- 'ref': ref,
- 'index': indextype,
- 'series': series,
- 'expiry': list(option_stack.keys()),
- 'quote_source': source})
- fwd_index.set_index('quotedate', inplace=True)
+ fwd_index = pd.DataFrame(
+ {
+ "quotedate": quotedate,
+ "ref": ref,
+ "index": indextype,
+ "series": series,
+ "expiry": list(option_stack.keys()),
+ "quote_source": source,
+ }
+ )
+ fwd_index.set_index("quotedate", inplace=True)
return (key, (option_stack, fwd_index))
else:
- raise RuntimeError(f"can't parse subject line: {subject} for email {email.name}")
+ raise RuntimeError(
+ f"can't parse subject line: {subject} for email {email.name}"
+ )
+
def write_todb(swaption_stack, index_data, conn):
def gen_sql_str(query, table_name, columns):
- return query.format(sql.Identifier(table_name),
- sql.SQL(", ").join(sql.Identifier(c) for c in columns),
- sql.SQL(", ").join(sql.Placeholder() * len(columns)))
- query = sql.SQL("INSERT INTO {}({}) VALUES({}) "
- "ON CONFLICT DO NOTHING RETURNING ref_id")
+ return query.format(
+ sql.Identifier(table_name),
+ sql.SQL(", ").join(sql.Identifier(c) for c in columns),
+ sql.SQL(", ").join(sql.Placeholder() * len(columns)),
+ )
+
+ query = sql.SQL(
+ "INSERT INTO {}({}) VALUES({}) " "ON CONFLICT DO NOTHING RETURNING ref_id"
+ )
sql_str = gen_sql_str(query, "swaption_ref_quotes", index_data.columns)
- query = sql.SQL("INSERT INTO {}({}) VALUES({}) "
- "ON CONFLICT DO NOTHING")
+ query = sql.SQL("INSERT INTO {}({}) VALUES({}) " "ON CONFLICT DO NOTHING")
with conn.cursor() as c:
for t in index_data.itertuples(index=False):
c.execute(sql_str, t)
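
A minimal standalone sketch of the psycopg2 sql composition that gen_sql_str builds above; the column list here is illustrative, and rendering the statement needs a live connection:

    from psycopg2 import sql

    columns = ["quotedate", "index", "series", "ref", "quote_source"]
    query = sql.SQL(
        "INSERT INTO {}({}) VALUES({}) ON CONFLICT DO NOTHING RETURNING ref_id"
    ).format(
        sql.Identifier("swaption_ref_quotes"),
        sql.SQL(", ").join(sql.Identifier(c) for c in columns),
        sql.SQL(", ").join(sql.Placeholder() * len(columns)),
    )
    # query.as_string(conn) would show the fully quoted INSERT; the placeholders
    # are filled by cursor.execute(query, values) exactly as in write_todb
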
@@ -481,9 +558,11 @@ def write_todb(swaption_stack, index_data, conn):
continue
except IndexError:
breakpoint()
- df['ref_id'] = ref_id
- c.executemany(gen_sql_str(query, "swaption_quotes", df.columns),
- df.itertuples(index=False))
+ df["ref_id"] = ref_id
+ c.executemany(
+ gen_sql_str(query, "swaption_quotes", df.columns),
+ df.itertuples(index=False),
+ )
conn.commit()
@@ -496,10 +575,10 @@ def get_email_list(date):
"""
with open(".pickle", "rb") as fh:
already_uploaded = pickle.load(fh)
- df = pd.DataFrame.from_dict(already_uploaded, orient='index')
- df.columns = ['quotedate']
- df = df.reset_index().set_index('quotedate')
- return df.loc[date, 'index'].tolist()
+ df = pd.DataFrame.from_dict(already_uploaded, orient="index")
+ df.columns = ["quotedate"]
+ df = df.reset_index().set_index("quotedate")
+ return df.loc[date, "index"].tolist()
def pickle_drop_date(date):
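
Finally, the bookkeeping that get_email_list reads back from .pickle can be sketched with made-up entries (a dict of email path -> quote date):

    import pandas as pd

    already_uploaded = {
        "emails/msg_001.txt": pd.Timestamp("2023-03-15"),
        "emails/msg_002.txt": pd.Timestamp("2023-03-15"),
    }
    df = pd.DataFrame.from_dict(already_uploaded, orient="index")
    df.columns = ["quotedate"]
    df = df.reset_index().set_index("quotedate")
    print(df.loc["2023-03-15", "index"].tolist())
    # ['emails/msg_001.txt', 'emails/msg_002.txt']
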