2 files changed, 58 insertions, 284 deletions
diff --git a/python/exploration/dispersion.py b/python/exploration/dispersion.py
index c9f219cf..c7bc33d9 100644
--- a/python/exploration/dispersion.py
+++ b/python/exploration/dispersion.py
@@ -119,18 +119,25 @@ def create_models(conn, df) -> (pd.DataFrame, float):
     return (df, model)
 
 
-def create_models_v2(conn, df) -> (pd.DataFrame, float):
+def create_models_v2(conn, df, weights=None) -> (pd.DataFrame, float):
     # Takes the output of get_tranche_data
     attach_max = df.index.get_level_values("attach").max()
     bottom_stack = df[df.index.get_level_values("attach") != attach_max]
-    model = smf.ols(
+    if weights is None:
+        weights = np.ones(len(bottom_stack))
+    else:
+        weights.name = "resids"
+        bottom_stack = bottom_stack.merge(weights, left_index=True, right_index=True)
+        weights = np.array(bottom_stack.resids)
+    model = smf.wls(
         "logit(tranche_loss_per) ~ "
-        "np.log(index_duration) + "
-        "np.log(moneyness) * gini + "
-        "np.log(index_expected_loss)* gini + "
-        "expit(att_moneyness) +"
-        "expit(det_moneyness)",
+        "np.log(index_duration) * np.log(gini)+ "
+        "np.log(moneyness) * np.log(gini) + "
+        "I(np.log(gini)**2) +"
+        "expit(att_moneyness) + I(expit(att_moneyness)**2) +"
+        "expit(det_moneyness) + I(expit(det_moneyness)**2)",
         data=bottom_stack,
+        weights=weights,
     )
     f = model.fit()
     df.loc[
diff --git a/python/notebooks/Dispersion.ipynb b/python/notebooks/Dispersion.ipynb
index 1b722f50..0d7e4cd3 100644
--- a/python/notebooks/Dispersion.ipynb
+++ b/python/notebooks/Dispersion.ipynb
@@ -18,6 +18,7 @@
     "from analytics.basket_index import MarkitBasketIndex\n",
     "from analytics import on_the_run\n",
     "from statsmodels.graphics.regressionplots import plot_fit\n",
+    "from scipy.special import logit, expit\n",
     "from pygam import LinearGAM, s, f, GAM\n",
     "from utils.db import dbengine, dbconn"
    ]
@@ -41,68 +42,17 @@
     "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
     "#end = (start + pd.offsets.BDay(1) * 365).date()\n",
     "end = datetime.datetime.today()\n",
-    "index_type = 'IG'\n",
-    "serenitasconn = dbconn(\"serenitasdb\")\n",
-    "serenitasconn.autocommit = True\n",
-    "risk = disp.get_tranche_data(serenitasconn, index_type)\n",
-    "train_data = risk[start: end]\n",
-    "gini_calc, gini_model = disp.create_models(serenitasconn, train_data)\n",
-    "gini_model.fit().summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gini_calc.xs(31, level = 'series')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#use trained model to fit rest of the data\n",
-    "f = gini_model.fit()\n",
-    "risk.loc[risk.index.get_level_values(\"attach\") != attach_max, \"predict\"] = expit(f.predict(bottom_stack))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Plot Gini if (use gini=True, use_log=False)\n",
-    "to_plot_gini = gini_calc.xs(0, level='attach').groupby(['date', 'series']).nth(-1)\n",
-    "to_plot_gini['gini'].unstack().plot()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#last day: mispricing\n",
-    "today = gini_calc.xs([value_date,33], level=['date','series'])\n",
-    "today[['exp_percentage', 'predict', 'mispricing']]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#plot mispricing of a tranche through time \n",
-    "attach = 0\n",
-    "series = 33\n",
-    "to_plot = gini_calc.xs([attach, series], level=['attach', 'series'])['mispricing']\n",
-    "to_plot.reset_index(['index','tenor'], drop=True).unstack().plot()"
+    "gini_model, gini_results = {}, {}\n",
+    "conn = dbconn(\"serenitasdb\")\n",
+    "conn.autocommit = True\n",
+    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
+    "    risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
+    "    risk = risk[risk.index_duration > .5] #filter out the short duration ones\n",
+    "    gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)\n",
+    "    fitted = gini_model[index_type].fit()\n",
+    "    w = 1/(expit(fitted.fittedvalues + fitted.resid) -expit(fitted.fittedvalues))**2\n",
+    "    gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk, w)\n",
+    "gini_model['HY'].fit().summary()"
    ]
   },
   {
@@ -111,7 +61,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "gini_calc.xs([attach, series], level=['attach', 'series']).to_clipboard()"
+    "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
+    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
+    "    gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/' + index_type + '_results.csv')"
    ]
   },
   {
@@ -120,10 +72,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#plot mispricing of series through time \n",
-    "series = 33\n",
-    "to_plot = gini_calc.xs(series, level='series')['mispricing']\n",
-    "to_plot.reset_index(['index','tenor'], drop=True).unstack().plot()"
+    "#Run a particular gini scenario\n",
+    "scenario = gini_results['HY'].loc(axis=0)[value_date,'HY',33,:,'5yr',0]\n",
+    "scenario['gini'].iloc[0] = .7\n",
+    "scenario_disp = expit(gini_model['HY'].fit().predict(scenario))\n",
+    "mispricing = scenario['tranche_loss_per'] - scenario_disp\n",
+    "mispricing"
    ]
   },
   {
@@ -132,7 +86,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_fit(gini_model[0], 'np.log(index_duration)')"
+    "#plot the residuals\n",
+    "fitted = gini_model['HY'].fit()\n",
+    "plt.figure(figsize=(8,5))\n",
+    "p=plt.scatter(x=expit(fitted.fittedvalues),y=expit(fitted.fittedvalues + fitted.resid) -expit(fitted.fittedvalues),edgecolor='k')\n",
+    "xmin=min(expit(fitted.fittedvalues))\n",
+    "xmax = max(expit(fitted.fittedvalues))\n",
+    "plt.hlines(y=0,xmin=xmin*0.9,xmax=xmax*1.1,color='red',linestyle='--',lw=3)\n",
+    "plt.xlabel(\"Fitted values\",fontsize=15)\n",
+    "plt.ylabel(\"Residuals\",fontsize=15)\n",
+    "plt.title(\"Fitted vs. residuals plot\",fontsize=18)\n",
+    "plt.grid(True)\n",
+    "plt.show()"
    ]
   },
   {
@@ -141,9 +106,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "to_csv('/home/serenitas/edwin/Python/temp1.csv')\n",
-    "\n",
-    "gini_calc.to_csv('/home/serenitas/edwin/Python/' + index_type+ '_tranche_model.csv')"
+    "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
+    "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
+    "#end = (start + pd.offsets.BDay(1) * 365).date()\n",
+    "end = datetime.datetime.today()\n",
+    "index_type = 'IG'\n",
+    "serenitasconn = dbconn(\"serenitasdb\")\n",
+    "serenitasconn.autocommit = True\n",
+    "risk = disp.get_tranche_data(serenitasconn, index_type)\n",
+    "train_data = risk[start: end]\n",
+    "gini_calc, gini_model = disp.create_models(serenitasconn, train_data)\n",
+    "gini_model.fit().summary()"
    ]
   },
   {
@@ -187,212 +160,6 @@
    "source": [
     "tranche_returns.xs(29, level='series').unstack(level='attach').to_csv('/home/serenitas/edwin/Python/temp1.csv')"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#compare models\n",
-    "a = [True, False]\n",
-    "for years in [1,2,3,4,5,6]:\n",
-    "    date_range = pd.bdate_range(end=value_date, freq='5B',periods=52*years)\n",
-    "    risk = disp.get_tranche_data(index_type, serenitas_engine)\n",
-    "    risk = risk[risk.index.get_level_values(0).isin(date_range)]\n",
-    "    for x, y in list(itertools.product(a,a)):\n",
-    "        gini_model, gini_calc = disp.create_models(risk, use_gini=x, use_log=y)\n",
-    "        for i, m in gini_model.items():\n",
-    "            print (years, x, y, i, m.rsquared)\n",
-    "        today = gini_calc.xs([value_date,33], level=['date','series'])\n",
-    "        print (today[['exp_percentage', 'predict_N', 'predict_preN', 'mispricing']])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Run a particular gini scenario\n",
-    "scenario = gini_calc.loc(axis=0)[value_date,33,'HY','5yr',0]\n",
-    "scenario['dispersion'] = .6\n",
-    "scenario_disp = np.exp(gini_model[0].predict(scenario))\n",
-    "mispricing = (scenario['exp_percentage'] - scenario_disp) * \\\n",
-    "             scenario['index_expected_loss'] / \\\n",
-    "             (scenario['detach_adj'] - scenario['attach_adj']) / \\\n",
-    "             scenario['indexfactor'] * 10000\n",
-    "mispricing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gini_calc.loc(axis=0)[:,33,'HY','5yr',0]['mispricing']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Let's use a GAM model instead?\n",
-    "#only use the 5yr point for modeling\n",
-    "equity = gini_calc.loc(axis=0)[:,:,[25,27,29,31,33],'5yr',0]\n",
-    "X = np.array(equity[['gini_spread', 'duration', 'moneyness']])\n",
-    "y = np.array(equity['exp_percentage'])\n",
-    "\n",
-    "#Fit for Lamda\n",
-    "gam_model = GAM(s(0, n_splines=5) +\n",
-    "                  s(1, n_splines=5) +\n",
-    "                  s(2, n_splines=5))\n",
-    "lam = np.logspace(-3, 5, 5, base=3)\n",
-    "lams = [lam] * 3\n",
-    "gam_model.gridsearch(X, y, lam=lams)\n",
-    "\n",
-    "gam_model.summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## plotting\n",
-    "fig, axs = plt.subplots(1,3);\n",
-    "\n",
-    "titles = ['gini_spread', 'duration', 'moneyness']\n",
-    "for i, ax in enumerate(axs):\n",
-    "    XX = gam_model.generate_X_grid(term=i)\n",
-    "    ax.plot(XX[:, i], gam_model.partial_dependence(term=i, X=XX))\n",
-    "    ax.plot(XX[:, i], gam_model.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')\n",
-    "    if i == 0:\n",
-    "        ax.set_ylim(-30,30)\n",
-    "    ax.set_title(titles[i]);"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.scatter(y, gam_model.predict(X))\n",
-    "plt.xlabel('actual correlation')\n",
-    "plt.ylabel('predicted correlation')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "today = gini_calc.loc(axis=0)[value_date,'HY',33,'5yr',0]\n",
-    "predict_HY33 = gam_model.predict(np.array(today[['gini_spread', 'duration', 'moneyness']]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "today, predict_HY33"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "deltas = []\n",
-    "for s in portf.swaptions:\n",
-    "    deltas.append(s.delta)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
-    "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
-    "#end = (start + pd.offsets.BDay(1) * 365).date()\n",
-    "end = datetime.datetime.today()\n",
-    "gini_model, gini_results = {}, {}\n",
-    "conn = dbconn(\"serenitasdb\")\n",
-    "conn.autocommit = True\n",
-    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
-    "    risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
-    "    #gini_results[index_type], gini_model[index_type] = disp.create_separate_models(risk)\n",
-    "    gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#gini_model['HY'][0].summary()\n",
-    "gini_model['HY'].fit().summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gini_results['HY']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
-    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
-    "    gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/' + index_type + '_results.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {