2 files changed, 54 insertions, 47 deletions
diff --git a/python/exploration/dispersion.py b/python/exploration/dispersion.py
index 575877ba..f0c5e5e8 100644
--- a/python/exploration/dispersion.py
+++ b/python/exploration/dispersion.py
@@ -11,21 +11,20 @@ from dateutil.relativedelta import relativedelta
 from utils.db import dbengine
 
 
-def get_dispersion(index_type, series, end_date=datetime.date.today()):
+def get_dispersion(index_type, series, use_gini=False, use_log=True, dr=None):
     index = MarkitBasketIndex(index_type, series, ["5yr"])
-    dr = pd.bdate_range(index.issue_date, end_date)
+    if dr is None:  # default: issue date through the previous business day
+        dr = pd.bdate_range(
+            index.issue_date, datetime.datetime.today() - pd.offsets.BDay(1)
+        )
 
     dispersion = []
-    cumloss = []
     for d in dr:
         print(d)
         index.value_date = d
-        dispersion.append(index.dispersion())
-        cumloss.append(index.cumloss)
+        dispersion.append(index.dispersion(use_gini, use_log))  # one entry per date
 
-    return pd.DataFrame(
-        {"dispersion": dispersion, "cumloss": cumloss,}, index=dr, name="dispersion",
-    )
+    return pd.DataFrame(dispersion, index=dr, columns=["dispersion"])  # indexed by dr
 
 
 def get_corr_data(index_type, series, engine):
@@ -72,28 +71,51 @@ def get_tranche_data(index_type, engine):
         ["date", "index", "series", "version", "tenor", "attach"], as_index=False
     ).mean()
     df = df.assign(
-        moneyness=lambda x: np.clip(
-            (x.detach - x.cumulativeloss) / x.indexfactor / x.index_expected_loss,
-            0.0,
-            1.0,
-        ),
         exp_percentage=lambda x: x.expected_loss / x.index_expected_loss,
+        attach_adj=lambda x: np.maximum(
+            (x.attach - x.cumulativeloss) / df.indexfactor, 0
+        ),
+        detach_adj=lambda x: np.minimum(
+            (x.detach - x.cumulativeloss) / df.indexfactor, 1
+        ),
+    )
+    df = df.assign(
+        moneyness=lambda x: (x.detach_adj + x.attach_adj)
+        / 2
+        / x.indexfactor
+        / x.index_expected_loss,
     )
-    df.set_index(["index", "series", "tenor", "attach"], append=True, inplace=True)
+    df.set_index(
+        ["date", "index", "series", "tenor", "attach"], append=True, inplace=True
+    )
+    df.reset_index(level=0, drop=True, inplace=True)
     return df
 
 
-def create_gini_models(df):
+def create_models(df, use_gini=False, use_log=True):
     # Takes the output of get_tranche_data
+    dispersion = {}
+    for g, _ in df.groupby(["series", "index"]):
+        temp = df.xs(g[0], level="series")
+        date_range = temp.index.get_level_values("date").unique()
+        dispersion[g[0]] = get_dispersion(
+            g[1], g[0], use_gini=use_gini, use_log=use_log, dr=date_range
+        )
+    dispersion = pd.concat(dispersion)
+    dispersion.index.rename("series", level=0, inplace=True)
+    df = df.merge(dispersion, left_index=True, right_index=True)
+    df.dropna(subset=["dispersion"], inplace=True)
     gini_model, gini_calc = {}, {}
     for attach in df.index.get_level_values("attach").unique():
-        gini_calc[attach] = df.loc(axis=0)[:, :, :, "5yr", attach]
+        gini_calc[attach] = df.xs(
+            ["5yr", attach], level=["tenor", "attach"], drop_level=False
+        )
         gini_model[attach] = smf.ols(
             "np.log(exp_percentage) ~ "
-            "np.log(gini_spread) + "
+            "dispersion + "
             "np.log(index_duration) + "
             "np.log(moneyness)",
-            data=gini_calc[attach],
+            data=df.xs(attach, level="attach"),
         ).fit()
         gini_calc[attach]["predict"] = np.exp(
             gini_model[attach].predict(gini_calc[attach])
@@ -116,24 +138,6 @@ def create_gini_models(df):
     return gini_model, gini_calc
 
 
-def gini(array):
-    """Calculate the Gini coefficient of a numpy array."""
-    if np.amin(array) < 0:
-        array -= np.amin(array)  # values cannot be negative
-    array += 0.0000001  # values cannot be 0
-    array = np.sort(array)  # values must be sorted
-    index = np.arange(1, array.shape[0] + 1)  # index per array element
-    n = array.shape[0]  # number of array elements
-    return (np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))
-
-
-def get_gini_spreadstdev(index_type, series, tenor, date):
-    indices = MarkitBasketIndex(index_type, series, tenor, value_date=date)
-    spreads = indices.spreads()
-    spreads = np.ravel(spreads)
-    return (gini(spreads), np.std(spreads))
-
-
 if __name__ == "__main__":
     index_type = "HY"
     series = 29
diff --git a/python/notebooks/Dispersion.ipynb b/python/notebooks/Dispersion.ipynb
index 59e98647..84701ba8 100644
--- a/python/notebooks/Dispersion.ipynb
+++ b/python/notebooks/Dispersion.ipynb
@@ -37,7 +37,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "value_date = (pd.datetime.today() - pd.offsets.BDay(1)).date()\n",
+    "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
+    "start_date = datetime.date(2019,9,27)\n",
+    "end_date = datetime.date(2020,1,30)\n",
     "index_type = 'HY'"
    ]
   },
@@ -48,9 +50,10 @@
    "outputs": [],
    "source": [
     "#Get Gini factor\n",
-    "date_range = pd.bdate_range(end=value_date, freq='5B',periods=52*.5)\n",
-    "risk = disp.get_tranche_data(index_type, date_range, serenitas_engine)\n",
-    "gini_model, gini_calc = disp.create_gini_models(risk)"
+    "date_range = pd.bdate_range(end=value_date, freq='1B',periods=52*4)\n",
+    "risk = disp.get_tranche_data(index_type, serenitas_engine)\n",
+    "risk = risk[risk.index.get_level_values(0).isin(date_range)]\n",
+    "gini_model, gini_calc = disp.create_models(risk, use_gini=True, use_log=False)"
    ]
   },
   {
@@ -59,8 +62,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "to_plot_gini = gini_calc.loc(axis=0)[:,:,:,'5yr',0].groupby(['date', 'series']).nth(-1)\n",
-    "to_plot_gini['gini_spread'].unstack().plot()"
+    "to_plot_gini = gini_calc.xs(0, level='attach').groupby(['date', 'series']).nth(-1)\n",
+    "to_plot_gini['dispersion'].unstack().plot()"
    ]
   },
   {
@@ -69,7 +72,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "today = gini_calc.loc(axis=0)[value_date,:,33,'5yr',:]\n",
+    "today = gini_calc.xs([value_date,33], level=['date','series'])\n",
     "today[['exp_percentage', 'predict_N', 'predict_preN', 'mispricing']]"
    ]
   },
@@ -79,8 +82,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "to_plot = gini_calc.loc(axis=0)[:,:,:,'5yr',0]['mispricing']\n",
-    "to_plot.reset_index(['index','tenor','attach'], drop=True).unstack().plot()"
+    "to_plot = gini_calc.xs(0, level='attach')['mispricing']\n",
+    "to_plot.reset_index(['index','tenor'], drop=True).unstack().plot()"
    ]
   },
   {
@@ -99,8 +102,8 @@
    "outputs": [],
    "source": [
     "#Run a particular gini scenario\n",
-    "scenario = gini_calc.loc(axis=0)[value_date,'HY',33,'5yr',0]\n",
-    "scenario['gini_spread'] = .6\n",
+    "scenario = gini_calc.loc(axis=0)[value_date,33,'HY','5yr',0]\n",
+    "scenario['dispersion'] = .6\n",
     "scenario_disp = np.exp(gini_model[0].predict(scenario))\n",
     "mispricing = (scenario['exp_percentage'] - scenario_disp) * \\\n",
     "             scenario['index_expected_loss'] / \\\n",