aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/notebooks/dispersion_tranche_model.ipynb320
1 files changed, 169 insertions, 151 deletions
diff --git a/python/notebooks/dispersion_tranche_model.ipynb b/python/notebooks/dispersion_tranche_model.ipynb
index a56332ac..a2bb5a67 100644
--- a/python/notebooks/dispersion_tranche_model.ipynb
+++ b/python/notebooks/dispersion_tranche_model.ipynb
@@ -13,15 +13,13 @@
"import exploration.dispersion as disp\n",
"import matplotlib.pyplot as plt\n",
"import statsmodels.formula.api as smf\n",
- "import analytics.tranche_data as tdata\n",
- "import ipysheet\n",
+ "import serenitas.analytics.tranche_data as tdata\n",
"\n",
- "from analytics.basket_index import MarkitBasketIndex\n",
- "from analytics import on_the_run\n",
+ "from serenitas.analytics.basket_index import MarkitBasketIndex\n",
+ "from serenitas.analytics import on_the_run\n",
"from statsmodels.graphics.regressionplots import plot_fit\n",
"from scipy.special import logit, expit\n",
- "from pygam import LinearGAM, s, f, GAM\n",
- "from utils.db import dbengine, dbconn"
+ "from serenitas.utils.db import dbengine, dbconn"
]
},
{
@@ -30,89 +28,13 @@
"metadata": {},
"outputs": [],
"source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
- "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
- "#end = (start + pd.offsets.BDay(1) * 365).date()\n",
- "end = datetime.datetime.today()\n",
- "gini_model, gini_results = {}, {}\n",
- "conn = dbconn(\"serenitasdb\")\n",
- "conn.autocommit = True\n",
- "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
- " risk = risk[risk.index_duration > 1] #filter out the short duration ones\n",
- " gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)\n",
- " #fitted = gini_model[index_type].fit()\n",
- " #w = 1/(expit(fitted.fittedvalues + fitted.resid) -expit(fitted.fittedvalues))**2\n",
- " #gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk, w)\n",
- "gini_model['HY'].fit().summary()\n",
- "\n",
- "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
- "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/' + index_type + '_results.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
- "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/' + index_type + '_results.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#plot the residuals\n",
- "fitted = gini_model['HY'].fit()\n",
- "plt.figure(figsize=(8,5))\n",
- "p=plt.scatter(x=expit(fitted.fittedvalues),y=expit(fitted.fittedvalues + fitted.resid) -expit(fitted.fittedvalues),edgecolor='k')\n",
- "xmin=min(expit(fitted.fittedvalues))\n",
- "xmax = max(expit(fitted.fittedvalues))\n",
- "plt.hlines(y=0,xmin=xmin*0.9,xmax=xmax*1.1,color='red',linestyle='--',lw=3)\n",
- "plt.xlabel(\"Fitted values\",fontsize=15)\n",
- "plt.ylabel(\"Residuals\",fontsize=15)\n",
- "plt.title(\"Fitted vs. residuals plot\",fontsize=18)\n",
- "plt.grid(True)\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#plot the gini coefficients\n",
- "ginis = gini_results['HY'].xs([0, '5yr', 'HY'],level=['attach','tenor', 'index']).groupby(['date', 'series']).nth(-1).gini.unstack(level='series')\n",
- "ginis.plot()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#sheet = ipysheet.sheet(rows=2000, columns=7, column_headers=False, row_headers=False)\n",
- "import IPython\n",
- "pd.set_option(\"display.max_rows\", None)\n",
- "#IPython.OutputArea.auto_scroll_threshold = 20\n",
- "ginis.sort_index(ascending=False)"
+ "from sklearn.preprocessing import PolynomialFeatures, PowerTransformer\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from sklearn.feature_selection import RFECV, RFE\n",
+ "from sklearn.compose import TransformedTargetRegressor\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor"
]
},
{
@@ -121,7 +43,7 @@
"metadata": {},
"outputs": [],
"source": [
- "#look at Volatility vs Correlation"
+ "%matplotlib inline"
]
},
{
@@ -130,25 +52,26 @@
"metadata": {},
"outputs": [],
"source": [
- "#use RFE to get the model instead\n",
- "from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, normalize\n",
- "from sklearn.linear_model import LinearRegression\n",
- "from sklearn.pipeline import make_pipeline\n",
- "from sklearn.feature_selection import RFECV, SelectKBest, f_regression\n",
- "from sklearn.compose import TransformedTargetRegressor\n",
- "\n",
- "import numpy as np\n",
- "\n",
- "class MyTransformedTargetRegressor(TransformedTargetRegressor):\n",
- " @property\n",
- " def coef_(self):\n",
- " return self.regressor_.coef_\n",
- " \n",
- " @property\n",
- " def feature_importances_(self):\n",
- " return self.regressor_.feature_importances_\n",
- " \n",
- "index_type = 'HY'"
+ "#Section 1----------------------------------------------------\n",
+ "#RFE: columns removed from the tranche data before it is used as the regressor matrix X\n",
+ "drop_variable_list = ['tranche_loss_per', 'tranche_id', 'index_price', 'detach', 'corr_at_detach', \n",
+ "                      'corr01', 'exp_percentage', 'indexfactor', 'duration', 'index_expected_loss',\n",
+ "                      'index_theta', 'delta', 'expected_loss', 'attach_adj', 'detach_adj',\n",
+ "                      'theta', 'cumulativeloss', \n",
+ "                      'forward_delta', \n",
+ "                      #Comment out to include\n",
+ "                      'index_duration',\n",
+ "                      # 'thickness',\n",
+ "                      # 'moneyness',\n",
+ "                      # 'index_basis',\n",
+ "                      'att_moneyness', \n",
+ "                      'det_moneyness',\n",
+ "                      'dispersion',\n",
+ "                      # 'gini', \n",
+ "                      'gamma',\n",
+ "                      #'theta' and 'index_theta' are not toggles: they are always dropped by the\n",
+ "                      #fixed entries above, so they are not repeated in this section\n",
+ "                     ]"
]
},
{
@@ -157,21 +80,6 @@
"metadata": {},
"outputs": [],
"source": [
- "all_variable_list = ['tranche_loss_per', 'tranche_id', 'index_price', 'index_basis', 'detach', 'corr_at_detach', \n",
- " 'attach_adj', 'detach_adj', 'index_theta', 'delta','gamma', 'corr01', 'expected_loss', \n",
- " 'exp_percentage', 'indexfactor', 'duration', 'index_expected_loss',\n",
- " #Comment out to include\n",
- " # 'index_duration',\n",
- " 'theta',\n",
- " 'cumulativeloss',\n",
- " # 'att_moneyness',\n",
- " 'det_moneyness',\n",
- " 'thickness',\n",
- " # 'moneyness',\n",
- " # 'dispersion',\n",
- " 'gini',\n",
- " 'forward_delta']\n",
- "\n",
"def run_rfe(index_type):\n",
" risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
" attach_max = risk.index.get_level_values(\"attach\").max()\n",
@@ -179,18 +87,44 @@
" bottom_stack = bottom_stack[bottom_stack.tranche_loss_per > 0].dropna()\n",
"\n",
" #prepare the variables\n",
- " y = bottom_stack['tranche_loss_per']\n",
- " X = bottom_stack.drop(all_variable_list, axis=1)\n",
- "\n",
- " poly = PolynomialFeatures(3)\n",
- " X = pd.DataFrame(PowerTransformer().fit_transform(X), index=X.index, columns=X.columns)\n",
- " X_p = pd.DataFrame(poly.fit_transform(X), columns= poly.get_feature_names(X.columns))\n",
- " regr = MyTransformedTargetRegressor(regressor=LinearRegression(), func=logit, inverse_func=expit)\n",
- "\n",
- " rfecv = RFECV(regr).fit(X_p,y)\n",
+ " y = logit(bottom_stack['tranche_loss_per'])\n",
+ " X = bottom_stack.drop(drop_variable_list, axis=1)\n",
+ " \n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
+ " \n",
+ " pipe_rfe = make_pipeline (PowerTransformer(),\n",
+ " PolynomialFeatures(interaction_only=True),\n",
+ " RFECV(estimator=LinearRegression(), \n",
+ " cv=10,\n",
+ " min_features_to_select=1))\n",
+ " \n",
+ " pipe_rfe.fit(X_train, y_train)\n",
+ " n_features_to_select = pipe_rfe['rfecv'].n_features_\n",
+ " pipe_rfe.steps[-1]= ('rfe', RFE(estimator=LinearRegression(), n_features_to_select = n_features_to_select))\n",
+ " model = pipe_rfe.fit(X_train, y_train)\n",
+ " \n",
+ " #RandomForest\n",
+ " #params = {'n_estimators': 100,\n",
+ " # 'min_samples_split': 3,\n",
+ " # 'verbose':1,\n",
+ " # 'n_jobs': -1}\n",
+ " #randomforest = RandomForestRegressor(**params)\n",
+ " \n",
+ " \n",
+ " #gradientboost\n",
+ " #params = {'n_estimators': 500,\n",
+ " # 'max_depth': 10,\n",
+ " # 'min_samples_split': 3,\n",
+ " # 'learning_rate': 0.01,\n",
+ " # 'loss': 'huber',\n",
+ " # 'verbose':1}\n",
+ " #gb = GradientBoostingRegressor(**params).fit(X_p, y)\n",
+ " \n",
+ " #model = VotingRegressor([('rf', model), ('lr', randomforest)]).fit(X_train, y_train)\n",
+ " #model = VotingRegressor([('lr', pipe_rfe)]).fit(X, logit(y))\n",
"\n",
" df = pd.merge(risk, \n",
- " pd.DataFrame(rfecv.predict(X_p), \n",
+ " pd.DataFrame(expit(model.predict(X)), \n",
" index=X.index, \n",
" columns=['predict_tranche_loss']),\n",
" how='left', left_index=True, right_index=True)\n",
@@ -210,18 +144,19 @@
" * df.index_expected_loss\n",
" / (df.detach_adj - df.attach_adj)\n",
" )\n",
- "\n",
- " print(index_type, \" Chosen columns: \", X_p[X_p.columns[rfecv.support_]].columns)\n",
- " print(index_type, \" Score: \", rfecv.score(X_p, y))\n",
+ " rfe_result = pipe_rfe\n",
+ " print(index_type, \" num features: \", n_features_to_select)\n",
+ " print(index_type, \" Chosen columns: \", np.array(rfe_result['polynomialfeatures'].get_feature_names(X.columns))[rfe_result['rfe'].support_])\n",
+ " print(index_type, \" Training Score: \", model.score(X_train, y_train))\n",
+ " print(index_type, \" Testing Score: \", model.score(X_test, y_test))\n",
" \n",
- " return rfecv, df\n",
+ " return model, df, X\n",
"\n",
- "gini_model, gini_results = {}, {}\n",
- "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " gini_model[index_type], gini_results[index_type] = run_rfe(index_type)\n",
+ "gini_model, gini_results, gini_X = {}, {}, {}\n",
"fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
"for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/' + index_type + '_results_rfecv.csv')"
+ " gini_model[index_type], gini_results[index_type], gini_X[index_type] = run_rfe(index_type)\n",
+ " gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_results_rfecv.csv')"
]
},
{
@@ -230,12 +165,70 @@
"metadata": {},
"outputs": [],
"source": [
+ "#examine the effect of any particular variable, holding the others at their latest values\n",
+ "steps = 100\n",
+ "index_type = 'HY'\n",
+ "features = gini_X[index_type]\n",
+ "variables = list(features.columns)\n",
+ "plots = {}\n",
+ "tranche_attach = []\n",
+ "\n",
+ "for attach, X in features.groupby('attach'):\n",
+ "    tranche_attach.append(attach)  #the group key is the attachment point of this tranche\n",
+ "    latest = X.iloc[-1]  #every variable not being swept is pinned to the last row\n",
+ "    for var in variables:\n",
+ "        bins = np.linspace(X[var].min(), X[var].max(), num=steps)\n",
+ "        testing_df = pd.DataFrame({c: bins if c == var else np.repeat(latest[c], steps) for c in variables})\n",
+ "        plots[attach, var] = pd.Series(expit(gini_model[index_type].predict(testing_df)), index=testing_df[var])\n",
+ "\n",
+ "n_rows, n_cols = len(tranche_attach), len(variables)\n",
+ "fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize = (20,10), squeeze=False)  #always a 2-D grid\n",
+ "for i, p in enumerate(plots):\n",
+ "    x_loc, y_loc = divmod(i, n_cols)\n",
+ "    if x_loc == 0:\n",
+ "        axes[x_loc, y_loc].set_title(p[1])\n",
+ "    plots[p].plot(ax=axes[x_loc, y_loc], label=i, xlabel=\"\")\n",
+ "for i, attach in enumerate(tranche_attach):\n",
+ "    axes[i, 0].text(-0.2, 0.5, \"tranche attach: \" + str(attach),\n",
+ "                    transform=axes[i, 0].transAxes,\n",
+ "                    verticalalignment='center',\n",
+ "                    rotation=90)\n",
+ "fig.savefig(\"/home/serenitas/edwin/PythonGraphs/dispersion_model.png\", bbox_inches='tight')\n",
+ "\n",
+ "fig_1, axes_1 = plt.subplots(nrows=n_rows, ncols=1, figsize = (15,8), squeeze=False)\n",
+ "for i, p in enumerate(plots):\n",
+ "    plots[p].plot(ax=axes_1[i // n_cols, 0], label=p[1], xlabel=\"\", legend=True)\n",
+ "for i, attach in enumerate(tranche_attach):\n",
+ "    axes_1[i, 0].text(-0.05, 0.5, \"tranche attach: \" + str(attach),\n",
+ "                      transform=axes_1[i, 0].transAxes,\n",
+ "                      verticalalignment='center',\n",
+ "                      rotation=90)\n",
+ "fig_1.savefig(\"/home/serenitas/edwin/PythonGraphs/dispersion_model_consolidated.png\", bbox_inches='tight')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Section 2----------------------------------------------------\n",
+ "#Fixed Model\n",
+ "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
+ "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
+ "end = datetime.datetime.today()\n",
"gini_model, gini_results = {}, {}\n",
+ "conn = dbconn(\"serenitasdb\")\n",
+ "conn.autocommit = True\n",
"for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " gini_model[index_type], gini_results[index_type] = run_rfe(index_type)\n",
+ " risk = disp.get_tranche_data(conn, index_type) #reuse conn instead of opening a new connection per index\n",
+ " risk = risk[risk.index_duration > 1] #filter out the short duration ones\n",
+ " gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)\n",
+ "gini_model['HY'].fit().summary()\n",
+ "\n",
"fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
"for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
- " gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/' + index_type + '_results_rfecv.csv')"
+ " gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_results.csv')"
]
},
{
@@ -244,14 +237,39 @@
"metadata": {},
"outputs": [],
"source": [
- "# Plot number of features VS. cross-validation scores\n",
- "index_type = 'IG'\n",
- "plt.figure()\n",
- "plt.xlabel(\"Number of features selected\")\n",
- "plt.ylabel(\"Cross validation score (nb of correct classifications)\")\n",
- "plt.plot(range(1, len(gini_model[index_type].grid_scores_) + 1), gini_model[index_type].grid_scores_)\n",
+ "#plot the residuals on the probability scale (fitted values mapped back through expit)\n",
+ "fitted = gini_model['HY'].fit()\n",
+ "fitted_prob = expit(fitted.fittedvalues)\n",
+ "plt.figure(figsize=(8,5))\n",
+ "p=plt.scatter(x=fitted_prob,y=expit(fitted.fittedvalues + fitted.resid) - fitted_prob,edgecolor='k')\n",
+ "xmin, xmax = min(fitted_prob), max(fitted_prob)\n",
+ "plt.hlines(y=0,xmin=xmin*0.9,xmax=xmax*1.1,color='red',linestyle='--',lw=3)\n",
+ "plt.xlabel(\"Fitted values\",fontsize=15)\n",
+ "plt.ylabel(\"Residuals\",fontsize=15)\n",
+ "plt.title(\"Fitted vs. residuals plot\",fontsize=18)\n",
+ "plt.grid(True)\n",
"plt.show()"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Section 3----------------------------------------------------\n",
+ "#pivot the gini coefficients by series (last value per date/series) and copy them to the clipboard\n",
+ "index_type='EU'\n",
+ "ginis = gini_results[index_type].xs([0, '5yr', index_type],level=['attach','tenor', 'index']).groupby(['date', 'series']).nth(-1).gini.unstack(level='series')\n",
+ "ginis.sort_index(ascending=False).to_clipboard()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -270,7 +288,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.9.1-final"
}
},
"nbformat": 4,