1 files changed, 27 insertions, 171 deletions
diff --git a/python/notebooks/dispersion_tranche_model.ipynb b/python/notebooks/dispersion_tranche_model.ipynb
index 46eb348c..56255a42 100644
--- a/python/notebooks/dispersion_tranche_model.ipynb
+++ b/python/notebooks/dispersion_tranche_model.ipynb
@@ -18,7 +18,7 @@
     "import serenitas.analytics.tranche_data as tdata\n",
     "\n",
     "from serenitas.analytics.basket_index import MarkitBasketIndex\n",
-    "from serenitas.analytics import on_the_run\n",
+    "from serenitas.analytics.index_data import on_the_run\n",
     "from statsmodels.graphics.regressionplots import plot_fit\n",
     "from scipy.special import logit, expit\n",
     "from serenitas.utils.db import dbengine, dbconn\n",
@@ -52,117 +52,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Section 1----------------------------------------------------\n",
-    "#index basis doesn't work with HY (opposite reaction to what I think)\n",
-    "#RFE\n",
-    "drop_variable_list = ['tranche_loss_per', 'tranche_id', 'index_price', 'detach', 'corr_at_detach', \n",
-    "                      'corr01', 'exp_percentage', 'indexfactor', 'duration', 'index_expected_loss',\n",
-    "                      'index_theta', 'delta', 'expected_loss', 'attach_adj', 'detach_adj',\n",
-    "                      'cumulativeloss',  \n",
-    "                      'forward_delta', \n",
-    "                      #Comment out to include\n",
-    "    #                  'index_duration',\n",
-    "                     'thickness',\n",
-    "                     'moneyness',\n",
-    "    #                 'index_basis',\n",
-    "    #                  'att_moneyness', \n",
-    "    #                 'det_moneyness',\n",
-    "                      'dispersion',\n",
-    "    #                 'gini', \n",
-    "                      'gamma',\n",
-    "                      'theta',\n",
-    "                      'index_theta'\n",
-    "                     ]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "def run_rfe(index_type):\n",
-    "    risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
-    "    attach_max = risk.index.get_level_values(\"attach\").max()\n",
-    "    bottom_stack = risk[risk.index.get_level_values(\"attach\") != attach_max]\n",
-    "    bottom_stack = bottom_stack[bottom_stack.tranche_loss_per > 0].dropna()\n",
-    "\n",
-    "    #prepare the variables\n",
-    "    y = logit(bottom_stack['tranche_loss_per'])\n",
-    "    X = bottom_stack.drop(drop_variable_list, axis=1)\n",
-    "         \n",
-    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
-    "    \n",
-    "    pipe_rfe = make_pipeline (PowerTransformer(),\n",
-    "                              #PolynomialFeatures(degree=2),\n",
-    "                              PolynomialFeatures(interaction_only=True),\n",
-    "                              RFECV(estimator=LinearRegression(), \n",
-    "                                    cv=10,\n",
-    "                                    min_features_to_select=1))\n",
-    "    \n",
-    "    pipe_rfe.fit(X_train, y_train)\n",
-    "    n_features_to_select = pipe_rfe['rfecv'].n_features_\n",
-    "    pipe_rfe.steps[-1]= ('rfe', RFE(estimator=LinearRegression(), n_features_to_select = n_features_to_select))\n",
-    "    model = pipe_rfe.fit(X_train, y_train)\n",
-    "  \n",
-    "    #RandomForest\n",
-    "    #params = {'n_estimators': 100,\n",
-    "    #          'min_samples_split': 3,\n",
-    "    #          'verbose':1,\n",
-    "    #         'n_jobs': -1}\n",
-    "    #randomforest = RandomForestRegressor(**params)\n",
-    "    \n",
-    "    \n",
-    "    #gradientboost\n",
-    "    #params = {'n_estimators': 500,\n",
-    "    #          'max_depth': 10,\n",
-    "    #          'min_samples_split': 3,\n",
-    "    #          'learning_rate': 0.01,\n",
-    "    #          'loss': 'huber',\n",
-    "    #          'verbose':1}\n",
-    "    #gb = GradientBoostingRegressor(**params).fit(X_train, y_train)\n",
-    "    \n",
-    "    #model = VotingRegressor([('rf', model), ('gb', gb)]).fit(X_train, y_train)\n",
-    "    #model = VotingRegressor([('lr', pipe_rfe)]).fit(X, logit(y))\n",
-    "\n",
-    "    df = pd.merge(risk, \n",
-    "                       pd.DataFrame(expit(model.predict(X)), \n",
-    "                                    index=X.index, \n",
-    "                                    columns=['predict_tranche_loss']),\n",
-    "                      how='left', left_index=True, right_index=True)\n",
-    "\n",
-    "    df.loc[df.index.get_level_values(\"attach\") != attach_max, \"predict_tranche_loss_per_index\"] = (\n",
-    "        df.predict_tranche_loss * df.thickness / df.index_expected_loss\n",
-    "    )\n",
-    "\n",
-    "    def aux(s):\n",
-    "        temp = s.values\n",
-    "        temp[-1] = 1 - temp[:-1].sum()\n",
-    "        return temp\n",
-    "\n",
-    "    df[\"predict_tranche_loss_per_index\"] = df.groupby([\"index\", \"series\", \"date\"])[\"predict_tranche_loss_per_index\"].transform(aux)\n",
-    "    df = df.assign(\n",
-    "        mispricing=(df.exp_percentage - df.predict_tranche_loss_per_index)\n",
-    "        * df.index_expected_loss\n",
-    "        / (df.detach_adj - df.attach_adj)\n",
-    "    )\n",
-    "    rfe_result = pipe_rfe\n",
-    "    print(index_type, \" num features: \", n_features_to_select)\n",
-    "    print(index_type, \" Chosen columns: \",  np.array(rfe_result['polynomialfeatures'].get_feature_names_out(X.columns))[rfe_result['rfe'].support_])\n",
-    "    print(index_type, \" Training Score: \",  model.score(X_train, y_train))\n",
-    "    print(index_type, \" Testing Score: \",  model.score(X_test, y_test))\n",
-    "    \n",
-    "    return model, df, X\n",
-    "\n",
-    "gini_model, gini_results, gini_X = {}, {}, {}\n",
+    "# Fit the RFE models for each index type and export the selected result fields to CSV\n",
+    "gini_model, gini_results = {}, {}\n",
     "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
     "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
-    "    gini_model[index_type], gini_results[index_type], gini_X[index_type] = run_rfe(index_type)\n",
+    "    risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
+    "    risk = risk[risk.index_duration > 1]  # keep only rows with index_duration > 1 (drops the short-duration ones)\n",
+    "    gini_results[index_type], gini_model[index_type]  = disp.create_rfe_models(risk)\n",
     "    gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_results_rfecv.csv')"
    ]
   },
@@ -179,20 +80,27 @@
     "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
     "    plots = {}\n",
     "    tranche_attach = []\n",
-    "\n",
-    "    for i, X in gini_X[index_type].groupby('attach'):\n",
+    "    \n",
+    "    res = gini_results[index_type]\n",
+    "    mod = gini_model[index_type]\n",
+    "    \n",
+    "    Xs = res[mod.feature_names_in_]\n",
+    "    \n",
+    "    for i, X in Xs.groupby('attach'):\n",
     "        tranche_attach.append(X.index[0][5])\n",
     "        for var in X.columns:\n",
     "            bins = np.linspace(X[var].min(), X[var].max(),num=steps)\n",
     "            testing_df = pd.DataFrame(bins, columns=[var])\n",
     "            for var_1 in X.drop(var, axis=1).columns:\n",
     "                testing_df = pd.concat([testing_df, pd.Series(np.repeat(X.iloc[-1][var_1], steps),name=var_1)], axis=1)\n",
-    "            plots[i, var] = pd.Series(expit(gini_model[index_type].predict(testing_df[X.columns])), index=testing_df[var])\n",
+    "            plots[i, var] = pd.Series(expit(mod.predict(testing_df[X.columns])), index=testing_df[var])\n",
     "\n",
+    "    #breakpoint()\n",
+    "            \n",
     "    sensitivies = pd.concat(plots, names=['attach', 'shock', 'value'])\n",
     "    sensitivies.to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_sensitivies.csv')\n",
     "\n",
-    "    fig, axes = plt.subplots(nrows=3, ncols=len(X.columns), figsize = (20,10))\n",
+    "    fig, axes = plt.subplots(nrows=4, ncols=len(X.columns), figsize = (20,10))\n",
     "    for i, p in enumerate(plots):\n",
     "        x_loc = int(i/len(X.columns))\n",
     "        y_loc = i % len(X.columns)\n",
@@ -206,7 +114,7 @@
     "                                        rotation=90)\n",
     "    fig.savefig(\"/home/serenitas/edwin/PythonGraphs/dispersion_model.png\", bbox_inches='tight')\n",
     "\n",
-    "    fig_1, axes_1 = plt.subplots(nrows=3, ncols=1, figsize = (15,8))\n",
+    "    fig_1, axes_1 = plt.subplots(nrows=4, ncols=1, figsize = (15,8))\n",
     "    for i, p in enumerate(plots):\n",
     "        x_loc = int(i/len(X.columns))\n",
     "        plots[p].plot(ax=axes_1[x_loc], label=p[1], xlabel=\"\", legend=True)\n",
@@ -234,7 +142,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#Section 3----------------------------------------------------\n",
@@ -259,7 +169,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "#plot the residuals\n",
@@ -289,69 +201,13 @@
     "data = risk[['gini', 'index_duration', 'index_expected_loss']]\n",
     "ols_model = smf.ols(\"gini ~ np.log(index_duration) + np.log(index_expected_loss)\", data=data).fit()\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.9.1 64-bit",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "python39164bit6ddd573894c04d6a858a9a58880cc9d4"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -363,7 +219,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.2"
+   "version": "3.10.8"
   }
  },
  "nbformat": 4,