Diffstat (limited to 'python/exploration/dispersion.py')
| -rw-r--r-- | python/exploration/dispersion.py | 113 |
1 file changed, 41 insertions, 72 deletions
diff --git a/python/exploration/dispersion.py b/python/exploration/dispersion.py
index 5b0c0968..fac383c5 100644
--- a/python/exploration/dispersion.py
+++ b/python/exploration/dispersion.py
@@ -1,4 +1,3 @@
-import bottleneck as bn
 import datetime
 import numpy as np
 import pandas as pd
@@ -7,26 +6,10 @@
 import statsmodels.formula.api as smf
 from analytics.basket_index import MarkitBasketIndex
 from analytics import CreditIndex
-from dateutil.relativedelta import relativedelta
+from scipy.special import logit, expit
 from utils.db import dbengine
 
 
-def get_dispersion(index_type, series, use_gini=False, use_log=True, dr=None):
-    index = MarkitBasketIndex(index_type, series, ["5yr"])
-    if dr is None:
-        dr = pd.bdate_range(
-            index.issue_date, datetime.datetime.today() - pd.offsets.BDay(1)
-        )
-
-    dispersion = []
-    for d in dr:
-        # print(d)
-        index.value_date = d
-        dispersion.append(index.dispersion(use_gini, use_log))
-
-    return pd.DataFrame(dispersion, index=dr, columns=["dispersion"])
-
-
 def get_corr_data(index_type, series, engine):
     sql_str = (
         "SELECT quotedate::date, indexrefspread, indexrefprice, index_duration, "
@@ -56,17 +39,22 @@ def get_corr_data(index_type, series, engine):
     return df
 
 
-def get_tranche_data(index_type, engine):
+def get_tranche_data(conn, index_type, tenor="5yr"):
     sql_string = (
         "SELECT * FROM risk_numbers "
         "LEFT JOIN index_version USING (index, series, version) "
-        "WHERE index = %s"
+        "WHERE index = %s AND tenor=%s"
     )
     df = pd.read_sql_query(
-        sql_string, engine, parse_dates={"date": {"utc": True}}, params=[index_type]
+        sql_string,
+        conn,
+        parse_dates={"date": {"utc": True}},
+        params=(index_type, tenor),
     )
     del df["basketid"]
-    df.date = df.date.dt.normalize().dt.tz_convert(None)
+    df.date = (
+        df.date.dt.tz_convert("America/New_York").dt.tz_localize(None).dt.normalize()
+    )
     df = df.groupby(
         ["date", "index", "series", "version", "tenor", "attach"], as_index=False
     ).mean()
@@ -78,64 +66,45 @@
         detach_adj=lambda x: np.minimum(
             (x.detach - x.cumulativeloss) / df.indexfactor, 1
         ),
+        moneyness=lambda x: (x.detach_adj + x.attach_adj) / 2 / x.index_expected_loss,
     )
-    df = df.assign(
-        moneyness=lambda x: (x.detach_adj + x.attach_adj)
-        / 2
-        / x.indexfactor
-        / x.index_expected_loss,
-    )
-    df.set_index(
-        ["date", "index", "series", "tenor", "attach"], append=True, inplace=True
+    df = df.set_index(["date", "index", "series", "version", "tenor", "attach"])
+    series = tuple(df.index.get_level_values("series").unique())
+    dispersion = pd.read_sql_query(
+        "SELECT date, index, series, version, tenor, dispersion, gini from index_quotes "
+        "WHERE index=%s AND series IN %s AND tenor=%s",
+        conn,
+        params=(index_type, series, tenor),
+        index_col=["date", "index", "series", "version", "tenor"],
     )
-    df.reset_index(level=0, drop=True, inplace=True)
+    df = df.join(dispersion)
     return df
 
 
-def create_models(df, use_gini=False, use_log=True):
+def create_models(conn, df) -> (pd.DataFrame, float):
     # Takes the output of get_tranche_data
-    dispersion = {}
-    for g, _ in df.groupby(["series", "index"]):
-        temp = df.xs(g[0], level="series")
-        date_range = temp.index.get_level_values("date").unique()
-        dispersion[g[0]] = get_dispersion(
-            g[1], g[0], use_gini=use_gini, use_log=use_log, dr=date_range
-        )
-    dispersion = pd.concat(dispersion)
-    dispersion.index.rename("series", level=0, inplace=True)
-    df = df.merge(dispersion, left_index=True, right_index=True)
-    df.dropna(subset=["dispersion"], inplace=True)
-    gini_model, gini_calc = {}, {}
-    for attach in df.index.get_level_values("attach").unique():
-        gini_calc[attach] = df.xs(
-            ["5yr", attach], level=["tenor", "attach"], drop_level=False
-        )
-        gini_model[attach] = smf.ols(
-            "np.log(exp_percentage) ~ "
-            "dispersion + "
-            "np.log(index_duration) + "
-            "np.log(moneyness)",
-            data=df.xs(attach, level="attach"),
-        ).fit()
-        gini_calc[attach]["predict"] = np.exp(
-            gini_model[attach].predict(gini_calc[attach])
-        )
-    gini_calc = pd.concat(gini_calc, sort=False).reset_index(level=0, drop=True)
-    normalization = gini_calc.groupby(["date", "index", "series", "tenor"])[
-        "predict"
-    ].sum()
-    gini_calc = gini_calc.merge(
-        normalization, left_index=True, right_index=True, suffixes=["_preN", "_sum"]
+    attach_max = df.index.get_level_values("attach").max()
+    bottom_stack = df[df.index.get_level_values("attach") != attach_max]
+    model = smf.ols(
+        "logit(exp_percentage) ~ np.log(index_duration) + "
+        "I(np.log(index_expected_loss)**2) + "
+        "np.log(moneyness)*dispersion + "
+        "np.log(index_expected_loss)*dispersion + "
+        "I(np.log(moneyness)**2) + I(np.log(moneyness)**3)",
+        data=bottom_stack,
    )
-    gini_calc["predict_N"] = gini_calc["predict_preN"] / gini_calc["predict_sum"]
-    gini_calc["mispricing"] = (
-        (gini_calc["exp_percentage"] - gini_calc["predict_N"])
-        * gini_calc["index_expected_loss"]
-        / (gini_calc["detach_adj"] - gini_calc["attach_adj"])
-        / gini_calc["indexfactor"]
-        * 10000
+    f = model.fit()
+    df.loc[df.index.get_level_values("attach") != attach_max, "predict"] = expit(
+        f.predict(bottom_stack)
     )
-    return gini_model, gini_calc
+
+    def aux(s):
+        temp = s.values
+        temp[-1] = 1 - temp[:-1].sum()
+        return temp
+
+    df["predict"] = df.groupby(["index", "series", "date"])["predict"].transform(aux)
+    return (df, model)
 
 
 if __name__ == "__main__":
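Note (not part of the commit): the core of the new create_models() is a single logit-scale OLS fit over every tranche except the most senior one, with fitted values mapped back through expit and the most senior tranche absorbing the residual so each day's tranche shares sum to one. The sketch below isolates that renormalization step. It is illustrative only: the toy DataFrame, column names, and numbers are invented, and it assumes rows are ordered by attach within each (index, series, date) group, as the aux() helper in the patch does.

# Illustrative sketch (not part of the commit): toy data mimicking the
# renormalization that the patched create_models() applies via aux().
import numpy as np
import pandas as pd
from scipy.special import expit

toy = pd.DataFrame(
    {
        "index": ["HY"] * 4,
        "series": [33] * 4,
        "date": ["2020-06-01"] * 4,
        "attach": [0.00, 0.15, 0.25, 0.35],
        # hypothetical fitted values on the logit scale; the most senior
        # tranche (largest attach) is not modelled, hence NaN
        "logit_fit": [1.2, -0.8, -2.5, np.nan],
    }
).set_index(["index", "series", "date", "attach"])

# map the fitted values back into [0, 1], as expit(f.predict(...)) does
toy["predict"] = expit(toy["logit_fit"])


def renormalize(s):
    # assign the residual mass to the last (most senior) tranche so the
    # per-group tranche shares sum exactly to one
    vals = s.to_numpy(copy=True)
    vals[-1] = 1 - vals[:-1].sum()
    return vals


toy["predict"] = toy.groupby(["index", "series", "date"])["predict"].transform(renormalize)
print(toy["predict"].groupby(["index", "series", "date"]).sum())  # -> 1.0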
