1 files changed, 171 insertions, 92 deletions
diff --git a/python/quote_parsing/parse_emails.py b/python/quote_parsing/parse_emails.py
index 47a17e9e..21c1b9f1 100644
--- a/python/quote_parsing/parse_emails.py
+++ b/python/quote_parsing/parse_emails.py
@@ -19,30 +19,48 @@ def list_imm_dates(date):
 
 
 def makedf(r, indextype, quote_source):
-    if indextype == 'IG':
-        cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid',
-                'pay_offer', 'delta_pay', 'vol']
+    if indextype == "IG":
+        cols = [
+            "strike",
+            "rec_bid",
+            "rec_offer",
+            "delta_rec",
+            "pay_bid",
+            "pay_offer",
+            "delta_pay",
+            "vol",
+        ]
     else:
-        cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid',
-                'pay_offer', 'delta_pay', 'vol', 'price_vol']
+        cols = [
+            "strike",
+            "rec_bid",
+            "rec_offer",
+            "delta_rec",
+            "pay_bid",
+            "pay_offer",
+            "delta_pay",
+            "vol",
+            "price_vol",
+        ]
     if quote_source == "BAML":
-        cols.append('gamma')
+        cols.append("gamma")
     if quote_source == "GS":
         cols.append("tail")
     df = pd.DataFrame.from_records(r, columns=cols)
-    for col in ['delta_rec', 'delta_pay', 'vol', 'price_vol', 'gamma', 'tail']:
+    for col in ["delta_rec", "delta_pay", "vol", "price_vol", "gamma", "tail"]:
         if col in df:
             try:
-                df[col] = df[col].str.rstrip("%").astype('float') / 100
-            except ValueError: #typo in one email
-                df[col] = (df[col].str.rstrip("%").
-                           str.replace("n", "").astype("float") / 100)
+                df[col] = df[col].str.rstrip("%").astype("float") / 100
+            except ValueError:  # typo in one email
+                df[col] = (
+                    df[col].str.rstrip("%").str.replace("n", "").astype("float") / 100
+                )
     if quote_source == "GS":
         for col in ["pay_bid", "pay_offer", "rec_bid", "rec_offer"]:
-            df[col] = df[col].str.strip('-')
-        df['delta_pay'] *= -1
+            df[col] = df[col].str.strip("-")
+        df["delta_pay"] *= -1
     for k in df:
-        if df.dtypes[k] == 'object':
+        if df.dtypes[k] == "object":
             df[k] = df[k].str.replace(",", "")
             try:
                 df[k] = pd.to_numeric(df[k])
@@ -51,7 +69,7 @@ def makedf(r, indextype, quote_source):
                 logger.error("couldn't convert column")
                 df[k] = pd.to_numeric(df[k].str.replace("n", ""))
                 breakpoint()
-    df.set_index('strike', inplace=True)
+    df.set_index("strike", inplace=True)
     return df
 
 
@@ -59,8 +77,12 @@ def parse_quotedate(fh, date_received):
     for line in fh:
         line = line.rstrip()
         if "At:" in line or "Sent:" in line:
-            for p in ["%m/%d/%y %H:%M:%S", "%b  %d %Y %H:%M:%S", "%m/%d  %H:%M:%S",
-                      "%B %d, %Y %I:%M %p"]:
+            for p in [
+                "%m/%d/%y %H:%M:%S",
+                "%b  %d %Y %H:%M:%S",
+                "%m/%d  %H:%M:%S",
+                "%B %d, %Y %I:%M %p",
+            ]:
                 try:
                     quotedate = pd.to_datetime(line, format=p, exact=False)
                 except ValueError:
@@ -78,13 +100,15 @@ def parse_quotedate(fh, date_received):
 
 
 def parse_refline(line):
-    regex = r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \
-        r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \
+    regex = (
+        r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?"
+        r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)"
         r"\s+Expiry:(?P<expiry>\S+)"
+    )
     m = re.match(regex, line)
     try:
         d = m.groupdict()
-        d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y')
+        d["expiry"] = pd.to_datetime(d["expiry"], format="%d-%b-%y")
     except AttributeError:
         raise RuntimeError(f"can't parse refline {line}")
     return d
@@ -102,16 +126,15 @@ def parse_baml(fh, indextype, series, quotedate, *args):
                 break
         if line.startswith("Ref"):
             d = parse_refline(line)
-            d.update({'quotedate': quotedate, 'index': indextype, 'series': series})
+            d.update({"quotedate": quotedate, "index": indextype, "series": series})
             df, line = parse_baml_block(fh, indextype)
-            option_stack[d['expiry']] = df
+            option_stack[d["expiry"]] = df
             fwd_index.append(d)
         else:
             line = ""
     if option_stack:
-        fwd_index = pd.DataFrame.from_records(fwd_index,
-                                              index='quotedate')
-        fwd_index['quote_source'] = 'BAML'
+        fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
+        fwd_index["quote_source"] = "BAML"
         return option_stack, fwd_index
     else:
         raise RuntimeError("empty email: " + fh.name)
@@ -159,15 +182,15 @@ def parse_ms_block(fh, indextype):
             except ValueError:
                 raise RuntimeError("Couldn't parse line: {line}")
 
-        vals = [strike, rec_bid, rec_offer, rec_delta,
-                pay_bid, pay_offer, pay_delta]
+        vals = [strike, rec_bid, rec_offer, rec_delta, pay_bid, pay_offer, pay_delta]
         vol = vol.strip()
         if indextype == "HY":
             try:
                 price_vol, vol = vol.replace("[", "").replace("]", "").split()
             except ValueError:
-                price_vol, vol, vol_change, be = (vol.replace("[", "").
-                                                  replace("]", "").split())
+                price_vol, vol, vol_change, be = (
+                    vol.replace("[", "").replace("]", "").split()
+                )
             vals += [vol, price_vol]
         else:
             if " " in vol:
@@ -191,8 +214,16 @@ def parse_nomura_block(fh, indextype):
         pay_bid, pay_offer = pay.split("/")
         rec_bid, rec_offer = rec.split("/")
         vol = vol.strip()
-        vals = [strike, rec_bid, rec_offer, rec_delta,
-                pay_bid, pay_offer, pay_delta, vol]
+        vals = [
+            strike,
+            rec_bid,
+            rec_offer,
+            rec_delta,
+            pay_bid,
+            pay_offer,
+            pay_delta,
+            vol,
+        ]
         if indextype == "HY":  # we don't have price vol
             vals.append(None)
         r.append(vals)
@@ -213,7 +244,9 @@ def parse_sg_block(fh, indextype, expiration_dates):
             option_type, strike, strike_spread, price, delta, vol, expiry = line.split()
 
         expiry_month = datetime.datetime.strptime(expiry, "%b-%y").month
-        expiry = next(pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month)
+        expiry = next(
+            pd.Timestamp(d) for d in expiration_dates if d.month == expiry_month
+        )
         if option_type == "Rec":
             rec_bid, rec_offer = price.split("/")
             pay_bid, pay_offer = None, None
@@ -222,8 +255,16 @@ def parse_sg_block(fh, indextype, expiration_dates):
             pay_bid, pay_offer = price.split("/")
             rec_bid, rec_offer = None, None
             rec_delta, pay_delta = None, delta
-        vals = [strike, rec_bid, rec_offer, rec_delta, pay_bid,
-                pay_offer, pay_delta, vol]
+        vals = [
+            strike,
+            rec_bid,
+            rec_offer,
+            rec_delta,
+            pay_bid,
+            pay_offer,
+            pay_delta,
+            vol,
+        ]
         if indextype == "HY":
             vals.append(None)
         r.append(vals)
@@ -231,7 +272,7 @@ def parse_sg_block(fh, indextype, expiration_dates):
 
 
 def parse_gs_block(fh, indextype):
-    #skip header
+    # skip header
     while True:
         line = next(fh)
         if line.strip().startswith("Stk"):
@@ -245,7 +286,7 @@ def parse_gs_block(fh, indextype):
         if line.startswith("Expiry") or line.startswith("Assumes"):
             break
         vals = line.split()
-        if indextype == 'HY':
+        if indextype == "HY":
             vals.pop(2)
             vals.pop(9)
         else:
@@ -253,7 +294,7 @@ def parse_gs_block(fh, indextype):
             vals.pop(8)
         strike = vals.pop(0)
         if indextype == "HY":
-            vals.pop(0) # pop the spread
+            vals.pop(0)  # pop the spread
         pay, pay_delta = vals[:2]
         pay_bid, pay_offer = pay.split("/")
         rec_bid, rec_offer = vals[2].split("/")
@@ -266,8 +307,9 @@ def parse_gs_block(fh, indextype):
         r.append(vals)
     return makedf(r, indextype, "GS"), line
 
+
 def parse_citi_block(fh, indextype):
-    next(fh) #skip header
+    next(fh)  # skip header
     r = []
     for line in fh:
         line = line.rstrip()
@@ -290,13 +332,35 @@ def parse_citi_block(fh, indextype):
         vol = vol.split()[0]
         if indextype == "HY":
             price_vol = price_vol.strip()
-            r.append([strike, rec_bid, rec_offer, rec_delta,
-                      pay_bid, pay_offer, pay_delta, vol, price_vol])
+            r.append(
+                [
+                    strike,
+                    rec_bid,
+                    rec_offer,
+                    rec_delta,
+                    pay_bid,
+                    pay_offer,
+                    pay_delta,
+                    vol,
+                    price_vol,
+                ]
+            )
         else:
-            r.append([strike, rec_bid, rec_offer, rec_delta,
-                      pay_bid, pay_offer, pay_delta, vol])
+            r.append(
+                [
+                    strike,
+                    rec_bid,
+                    rec_offer,
+                    rec_delta,
+                    pay_bid,
+                    pay_offer,
+                    pay_delta,
+                    vol,
+                ]
+            )
     return makedf(r, indextype, "CITI")
 
+
 def parse_ms(fh, indextype, *args):
     option_stack = {}
     for line in fh:
@@ -325,6 +389,7 @@ def parse_nom(fh, indextype, *args):
                 aux(next_line, fh, indextype, option_stack)
             else:
                 raise RuntimeError(f"Don't know what to do with {line}.")
+
     for line in fh:
         line = line.rstrip()
         if "EXPIRY" in line:
@@ -345,8 +410,7 @@ def parse_sg(fh, indextype, expiration_dates):
 def parse_gs(fh, indextype, series, quotedate, ref):
     option_stack = {}
     fwd_index = []
-    d = {'quotedate': quotedate, 'index': indextype,
-         'series': series, 'ref': ref}
+    d = {"quotedate": quotedate, "index": indextype, "series": series, "ref": ref}
     pat = re.compile(r"Expiry (\d{2}\w{3}\d{2}) \((?:([\S]+) )?([\S]+)\)")
 
     line = next(fh).strip()
@@ -355,9 +419,10 @@ def parse_gs(fh, indextype, series, quotedate, ref):
             m = pat.match(line)
             if m:
                 expiry, fwdprice, fwdspread = m.groups()
-                expiry = pd.to_datetime(expiry, format='%d%b%y')
-                d.update({'fwdspread': fwdspread, 'fwdprice': fwdprice,
-                          'expiry': expiry})
+                expiry = pd.to_datetime(expiry, format="%d%b%y")
+                d.update(
+                    {"fwdspread": fwdspread, "fwdprice": fwdprice, "expiry": expiry}
+                )
                 fwd_index.append(d.copy())
                 option_stack[expiry], line = parse_gs_block(fh, indextype)
             else:
@@ -370,17 +435,15 @@ def parse_gs(fh, indextype, series, quotedate, ref):
             except StopIteration:
                 break
 
-    fwd_index = pd.DataFrame.from_records(fwd_index,
-                                          index='quotedate')
-    fwd_index['quote_source'] = 'GS'
+    fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
+    fwd_index["quote_source"] = "GS"
     return option_stack, fwd_index
 
+
 def parse_citi(fh, indextype, series, quotedate):
     option_stack = {}
     fwd_index = []
-    d = {'quotedate': quotedate,
-         'index': indextype,
-         'series': series}
+    d = {"quotedate": quotedate, "index": indextype, "series": series}
     pat = re.compile(r"Exp: (\d{2}-\w{3}-\d{2})[^R]*Ref:[^\d]*([\d.]+)")
     for line in fh:
         line = line.strip()
@@ -388,33 +451,35 @@ def parse_citi(fh, indextype, series, quotedate):
             m = pat.match(line)
             if m:
                 expiry, ref = m.groups()
-                expiry = pd.to_datetime(expiry, format='%d-%b-%y')
-                d.update({'ref': ref,
-                          'expiry': expiry})
+                expiry = pd.to_datetime(expiry, format="%d-%b-%y")
+                d.update({"ref": ref, "expiry": expiry})
                 fwd_index.append(d.copy())
                 option_stack[expiry] = parse_citi_block(fh, indextype)
             else:
                 logger.error("Cant't parse expiry line:", line)
-    fwd_index = pd.DataFrame.from_records(fwd_index,
-                                          index='quotedate')
-    fwd_index['quote_source'] = 'CITI'
+    fwd_index = pd.DataFrame.from_records(fwd_index, index="quotedate")
+    fwd_index["quote_source"] = "CITI"
     return option_stack, fwd_index
 
+
 subject_baml = re.compile(r"(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s")
-subject_ms = re.compile(r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)")
+subject_ms = re.compile(
+    r"[^$]*\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^-]*- REF[^\d]*([\d.]+)"
+)
 subject_nom = re.compile(r"(?:Fwd:)?CDX (IG|HY)(\d{2}).*- REF:[^\d]*([\d.]+)")
 subject_gs = re.compile(r"(?:FW: |Fwd: )?GS (IG|HY)(\d{2}) 5y.*- Ref [^\d]*([\d.]+)")
 subject_sg = re.compile(r"SG OPTIONS - CDX (IG|HY) S(\d{2}).* REF[^\d]*([\d.]+)")
 subject_citi = re.compile(r"(?:Fwd:)?Citi Options: (IG|HY)(\d{2}) 5Y")
 
+
 def parse_email(email, date_received):
     with email.open("rt") as fh:
         subject = fh.readline().lstrip()
 
-        for source in ['BAML', 'GS', 'MS', 'NOM', 'SG', 'CITI']:
-            m = globals()[f'subject_{source.lower()}'].match(subject)
+        for source in ["BAML", "GS", "MS", "NOM", "SG", "CITI"]:
+            m = globals()[f"subject_{source.lower()}"].match(subject)
             if m:
-                if source in ['BAML', 'CITI']:
+                if source in ["BAML", "CITI"]:
                     indextype, series = m.groups()
                 else:
                     indextype, series, ref = m.groups()
@@ -424,43 +489,55 @@ def parse_email(email, date_received):
                 try:
                     quotedate = parse_quotedate(fh, date_received)
                 except RuntimeError:
-                    logger.warning("couldn't find received date in message: "
-                                    f"{email.name}, using {date_received}")
-                    quotedate = pd.Timestamp(date_received).tz_localize("America/New_York")
+                    logger.warning(
+                        "couldn't find received date in message: "
+                        f"{email.name}, using {date_received}"
+                    )
+                    quotedate = pd.Timestamp(date_received).tz_localize(
+                        "America/New_York"
+                    )
                     fh.seek(cur_pos)
 
                 expiration_dates = list_imm_dates(quotedate)
-                parse_fun = globals()[f'parse_{source.lower()}']
+                parse_fun = globals()[f"parse_{source.lower()}"]
                 key = (quotedate, indextype, series, source)
-                if source in ['BAML', 'CITI']:
+                if source in ["BAML", "CITI"]:
                     return (key, parse_fun(fh, indextype, series, quotedate))
-                        parse_fun(fh, indextype, series, quotedate)
                 elif source == "GS":
                     return (key, parse_fun(fh, indextype, series, quotedate, ref))
-                        parse_fun(fh, indextype, series, quotedate, ref)
                 else:
                     option_stack = parse_fun(fh, indextype, expiration_dates)
-                    fwd_index = pd.DataFrame({'quotedate': quotedate,
-                                              'ref': ref,
-                                              'index': indextype,
-                                              'series': series,
-                                              'expiry': list(option_stack.keys()),
-                                              'quote_source': source})
-                    fwd_index.set_index('quotedate', inplace=True)
+                    fwd_index = pd.DataFrame(
+                        {
+                            "quotedate": quotedate,
+                            "ref": ref,
+                            "index": indextype,
+                            "series": series,
+                            "expiry": list(option_stack.keys()),
+                            "quote_source": source,
+                        }
+                    )
+                    fwd_index.set_index("quotedate", inplace=True)
                     return (key, (option_stack, fwd_index))
         else:
-            raise RuntimeError(f"can't parse subject line: {subject} for email {email.name}")
+            raise RuntimeError(
+                f"can't parse subject line: {subject} for email {email.name}"
+            )
+
 
 def write_todb(swaption_stack, index_data, conn):
     def gen_sql_str(query, table_name, columns):
-        return query.format(sql.Identifier(table_name),
-                            sql.SQL(", ").join(sql.Identifier(c) for c in columns),
-                            sql.SQL(", ").join(sql.Placeholder() * len(columns)))
-    query = sql.SQL("INSERT INTO {}({}) VALUES({}) "
-                    "ON CONFLICT DO NOTHING RETURNING ref_id")
+        return query.format(
+            sql.Identifier(table_name),
+            sql.SQL(", ").join(sql.Identifier(c) for c in columns),
+            sql.SQL(", ").join(sql.Placeholder() * len(columns)),
+        )
+
+    query = sql.SQL(
+        "INSERT INTO {}({}) VALUES({}) " "ON CONFLICT DO NOTHING RETURNING ref_id"
+    )
     sql_str = gen_sql_str(query, "swaption_ref_quotes", index_data.columns)
-    query = sql.SQL("INSERT INTO {}({}) VALUES({}) "
-                    "ON CONFLICT DO NOTHING")
+    query = sql.SQL("INSERT INTO {}({}) VALUES({}) " "ON CONFLICT DO NOTHING")
     with conn.cursor() as c:
         for t in index_data.itertuples(index=False):
             c.execute(sql_str, t)
@@ -481,9 +558,11 @@ def write_todb(swaption_stack, index_data, conn):
                     continue
                 except IndexError:
                     breakpoint()
-                df['ref_id'] = ref_id
-                c.executemany(gen_sql_str(query, "swaption_quotes", df.columns),
-                              df.itertuples(index=False))
+                df["ref_id"] = ref_id
+                c.executemany(
+                    gen_sql_str(query, "swaption_quotes", df.columns),
+                    df.itertuples(index=False),
+                )
             conn.commit()
 
 
@@ -496,10 +575,10 @@ def get_email_list(date):
     """
     with open(".pickle", "rb") as fh:
         already_uploaded = pickle.load(fh)
-    df = pd.DataFrame.from_dict(already_uploaded, orient='index')
-    df.columns = ['quotedate']
-    df = df.reset_index().set_index('quotedate')
-    return df.loc[date, 'index'].tolist()
+    df = pd.DataFrame.from_dict(already_uploaded, orient="index")
+    df.columns = ["quotedate"]
+    df = df.reset_index().set_index("quotedate")
+    return df.loc[date, "index"].tolist()
 
 
 def pickle_drop_date(date):