aboutsummaryrefslogtreecommitdiffstats
path: root/python/notebooks
diff options
context:
space:
mode:
Diffstat (limited to 'python/notebooks')
-rw-r--r--python/notebooks/dispersion_tranche_model.ipynb278
1 files changed, 278 insertions, 0 deletions
diff --git a/python/notebooks/dispersion_tranche_model.ipynb b/python/notebooks/dispersion_tranche_model.ipynb
new file mode 100644
index 00000000..a56332ac
--- /dev/null
+++ b/python/notebooks/dispersion_tranche_model.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import itertools\n",
+ "import datetime\n",
+ "import exploration.dispersion as disp\n",
+ "import matplotlib.pyplot as plt\n",
+ "import statsmodels.formula.api as smf\n",
+ "import analytics.tranche_data as tdata\n",
+ "import ipysheet\n",
+ "\n",
+ "from analytics.basket_index import MarkitBasketIndex\n",
+ "from analytics import on_the_run\n",
+ "from statsmodels.graphics.regressionplots import plot_fit\n",
+ "from scipy.special import logit, expit\n",
+ "from pygam import LinearGAM, s, f, GAM\n",
+ "from utils.db import dbengine, dbconn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reference dates. NOTE(review): none of these three names is read again in this\n",
+ "# notebook as committed; they are kept in case they are used interactively.\n",
+ "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
+ "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
+ "end = datetime.datetime.today()\n",
+ "\n",
+ "gini_model, gini_results = {}, {}\n",
+ "# A single connection serves both the data load and the model construction for\n",
+ "# every index family, instead of opening (and never closing) a new one per iteration.\n",
+ "# NOTE(review): assumes disp.get_tranche_data leaves the connection open - confirm.\n",
+ "conn = dbconn(\"serenitasdb\")\n",
+ "conn.autocommit = True\n",
+ "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
+ "    risk = disp.get_tranche_data(conn, index_type)\n",
+ "    risk = risk[risk.index_duration > 1]  # filter out the short duration ones\n",
+ "    gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)\n",
+ "    # Disabled experiment: re-fit with weights derived from the first-pass residuals.\n",
+ "    #fitted = gini_model[index_type].fit()\n",
+ "    #w = 1/(expit(fitted.fittedvalues + fitted.resid) -expit(fitted.fittedvalues))**2\n",
+ "    #gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk, w)\n",
+ "\n",
+ "# Kept as the last expression of the cell so the regression summary is rendered.\n",
+ "# The CSV export of gini_results is done by the cell that follows this one.\n",
+ "gini_model['HY'].fit().summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the headline columns of each index family's results to its own CSV file.\n",
+ "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
+ "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
+ "    csv_path = '/home/serenitas/edwin/' + index_type + '_results.csv'\n",
+ "    selected = gini_results[index_type][fieldlist]\n",
+ "    selected.to_csv(csv_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitted-vs-residual scatter for the HY model; both axes are computed after\n",
+ "# passing the fitted values (and fitted values + residuals) through expit.\n",
+ "fitted = gini_model['HY'].fit()\n",
+ "fitted_expit = expit(fitted.fittedvalues)\n",
+ "resid_expit = expit(fitted.fittedvalues + fitted.resid) - fitted_expit\n",
+ "xmin = min(fitted_expit)\n",
+ "xmax = max(fitted_expit)\n",
+ "\n",
+ "plt.figure(figsize=(8,5))\n",
+ "p = plt.scatter(x=fitted_expit, y=resid_expit, edgecolor='k')\n",
+ "# Dashed zero line drawn from 0.9 * min to 1.1 * max of the x values.\n",
+ "plt.hlines(y=0, xmin=xmin*0.9, xmax=xmax*1.1, color='red', linestyle='--', lw=3)\n",
+ "plt.xlabel(\"Fitted values\", fontsize=15)\n",
+ "plt.ylabel(\"Residuals\", fontsize=15)\n",
+ "plt.title(\"Fitted vs. residuals plot\", fontsize=18)\n",
+ "plt.grid(True)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Gini coefficient per series over time, taken from the HY / 5yr / attach 0 rows.\n",
+ "hy_slice = gini_results['HY'].xs([0, '5yr', 'HY'], level=['attach', 'tenor', 'index'])\n",
+ "# Keep the last row of every (date, series) group, then pivot series into columns.\n",
+ "last_per_group = hy_slice.groupby(['date', 'series']).nth(-1)\n",
+ "ginis = last_per_group.gini.unstack(level='series')\n",
+ "ginis.plot()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import IPython.display\n",
+ "\n",
+ "# Show the whole gini table in descending index order. The row limit is lifted\n",
+ "# only inside this context, so the global pandas display options are unchanged\n",
+ "# for every other cell.\n",
+ "with pd.option_context(\"display.max_rows\", None):\n",
+ "    IPython.display.display(ginis.sort_index(ascending=False))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#look at Volatility vs Correlation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#use RFE to get the model instead\n",
+ "from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, normalize\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from sklearn.feature_selection import RFECV, SelectKBest, f_regression\n",
+ "from sklearn.compose import TransformedTargetRegressor\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "class MyTransformedTargetRegressor(TransformedTargetRegressor):\n",
+ "    \"\"\"TransformedTargetRegressor that forwards attributes of its fitted ``regressor_``.\"\"\"\n",
+ "\n",
+ "    @property\n",
+ "    def coef_(self):\n",
+ "        \"\"\"Return ``coef_`` of the fitted inner regressor.\"\"\"\n",
+ "        inner = self.regressor_\n",
+ "        return inner.coef_\n",
+ "\n",
+ "    @property\n",
+ "    def feature_importances_(self):\n",
+ "        \"\"\"Return ``feature_importances_`` of the fitted inner regressor.\"\"\"\n",
+ "        inner = self.regressor_\n",
+ "        return inner.feature_importances_\n",
+ " \n",
+ "index_type = 'HY'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Columns that run_rfe drops from the tranche data before building the feature\n",
+ "# matrix. A name that is commented out below is NOT dropped, i.e. it stays in\n",
+ "# the set of candidate features.\n",
+ "all_variable_list = [\n",
+ "    'tranche_loss_per', 'tranche_id', 'index_price', 'index_basis',\n",
+ "    'detach', 'corr_at_detach', 'attach_adj', 'detach_adj',\n",
+ "    'index_theta', 'delta', 'gamma', 'corr01',\n",
+ "    'expected_loss', 'exp_percentage', 'indexfactor', 'duration',\n",
+ "    'index_expected_loss',\n",
+ "    # --- comment a line out to keep that column as a feature ---\n",
+ "    # 'index_duration',\n",
+ "    'theta',\n",
+ "    'cumulativeloss',\n",
+ "    # 'att_moneyness',\n",
+ "    'det_moneyness',\n",
+ "    'thickness',\n",
+ "    # 'moneyness',\n",
+ "    # 'dispersion',\n",
+ "    'gini',\n",
+ "    'forward_delta',\n",
+ "]\n",
+ "\n",
+ "def run_rfe(index_type, conn=None):\n",
+ "    \"\"\"Select features by cross-validated RFE and predict tranche loss for one index family.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    index_type : str\n",
+ "        Index family forwarded to ``disp.get_tranche_data`` (this notebook uses\n",
+ "        'HY', 'IG', 'EU' and 'XO').\n",
+ "    conn : optional\n",
+ "        Database connection to reuse. When omitted, a new one is opened with\n",
+ "        ``dbconn(\"serenitasdb\")``, which is the previous behaviour.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    rfecv : RFECV\n",
+ "        The fitted selector.\n",
+ "    df : pandas.DataFrame\n",
+ "        The tranche data with the added columns ``predict_tranche_loss``,\n",
+ "        ``predict_tranche_loss_per_index`` and ``mispricing``.\n",
+ "    \"\"\"\n",
+ "    if conn is None:\n",
+ "        conn = dbconn(\"serenitasdb\")\n",
+ "    risk = disp.get_tranche_data(conn, index_type)\n",
+ "\n",
+ "    # Fit only on tranches below the highest attachment point, with a positive\n",
+ "    # tranche loss and no missing values.\n",
+ "    attach_max = risk.index.get_level_values(\"attach\").max()\n",
+ "    bottom_stack = risk[risk.index.get_level_values(\"attach\") != attach_max]\n",
+ "    bottom_stack = bottom_stack[bottom_stack.tranche_loss_per > 0].dropna()\n",
+ "\n",
+ "    # prepare the variables\n",
+ "    y = bottom_stack['tranche_loss_per']\n",
+ "    X = bottom_stack.drop(all_variable_list, axis=1)\n",
+ "\n",
+ "    poly = PolynomialFeatures(3)\n",
+ "    X = pd.DataFrame(PowerTransformer().fit_transform(X), index=X.index, columns=X.columns)\n",
+ "    poly_values = poly.fit_transform(X)\n",
+ "    # get_feature_names was removed in scikit-learn 1.2; prefer its replacement\n",
+ "    # when available and fall back to the old name on older releases.\n",
+ "    if hasattr(poly, \"get_feature_names_out\"):\n",
+ "        poly_names = poly.get_feature_names_out(X.columns)\n",
+ "    else:\n",
+ "        poly_names = poly.get_feature_names(X.columns)\n",
+ "    X_p = pd.DataFrame(poly_values, index=X.index, columns=poly_names)\n",
+ "\n",
+ "    # The target is fitted through logit and predictions are mapped back with expit.\n",
+ "    regr = MyTransformedTargetRegressor(regressor=LinearRegression(), func=logit, inverse_func=expit)\n",
+ "    rfecv = RFECV(regr).fit(X_p, y)\n",
+ "\n",
+ "    predicted = pd.DataFrame(rfecv.predict(X_p),\n",
+ "                             index=X.index,\n",
+ "                             columns=['predict_tranche_loss'])\n",
+ "    df = pd.merge(risk, predicted, how='left', left_index=True, right_index=True)\n",
+ "\n",
+ "    df.loc[df.index.get_level_values(\"attach\") != attach_max, \"predict_tranche_loss_per_index\"] = (\n",
+ "        df.predict_tranche_loss * df.thickness / df.index_expected_loss\n",
+ "    )\n",
+ "\n",
+ "    def aux(s):\n",
+ "        # Set the last entry of the group to 1 minus the sum of the others.\n",
+ "        # NOTE(review): presumably the last row of each group is the top tranche\n",
+ "        # (attach == attach_max); this relies on the row order - confirm.\n",
+ "        # Work on a copy so the assignment never writes into, or fails on a\n",
+ "        # read-only view of, the frame's own data.\n",
+ "        temp = s.to_numpy(copy=True)\n",
+ "        temp[-1] = 1 - temp[:-1].sum()\n",
+ "        return temp\n",
+ "\n",
+ "    df[\"predict_tranche_loss_per_index\"] = df.groupby([\"index\", \"series\", \"date\"])[\"predict_tranche_loss_per_index\"].transform(aux)\n",
+ "    df = df.assign(\n",
+ "        mispricing=(df.exp_percentage - df.predict_tranche_loss_per_index)\n",
+ "        * df.index_expected_loss\n",
+ "        / (df.detach_adj - df.attach_adj)\n",
+ "    )\n",
+ "\n",
+ "    print(index_type, \" Chosen columns: \", X_p.columns[rfecv.support_])\n",
+ "    print(index_type, \" Score: \", rfecv.score(X_p, y))\n",
+ "\n",
+ "    return rfecv, df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fit the RFE model for every index family, then export the headline columns.\n",
+ "gini_model, gini_results = {}, {}\n",
+ "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
+ "    selector, predictions = run_rfe(index_type)\n",
+ "    gini_model[index_type] = selector\n",
+ "    gini_results[index_type] = predictions\n",
+ "\n",
+ "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
+ "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
+ "    csv_path = '/home/serenitas/edwin/' + index_type + '_results_rfecv.csv'\n",
+ "    gini_results[index_type][fieldlist].to_csv(csv_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot number of features VS. cross-validation scores\n",
+ "index_type = 'IG'\n",
+ "rfecv = gini_model[index_type]\n",
+ "# scikit-learn >= 1.0 exposes the mean CV score per feature count in cv_results_;\n",
+ "# grid_scores_ only exists on older releases (it was removed in 1.2).\n",
+ "if hasattr(rfecv, \"cv_results_\"):\n",
+ "    cv_scores = rfecv.cv_results_[\"mean_test_score\"]\n",
+ "else:\n",
+ "    cv_scores = rfecv.grid_scores_\n",
+ "\n",
+ "plt.figure()\n",
+ "plt.xlabel(\"Number of features selected\")\n",
+ "# The wrapped estimator is a regressor and RFECV is built without a scoring\n",
+ "# argument, so the score is the estimator's default (R^2), not a count of\n",
+ "# correct classifications.\n",
+ "plt.ylabel(\"Cross validation score (R^2)\")\n",
+ "plt.plot(range(1, len(cv_scores) + 1), cv_scores)\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}