{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import itertools\n",
    "import datetime\n",
    "import exploration.dispersion as disp\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.formula.api as smf\n",
    "import serenitas.analytics.tranche_data as tdata\n",
    "\n",
    "from serenitas.analytics.basket_index import MarkitBasketIndex\n",
    "from serenitas.analytics.index_data import on_the_run\n",
    "from statsmodels.graphics.regressionplots import plot_fit\n",
    "from scipy.special import logit, expit\n",
    "from serenitas.utils.db import dbengine, dbconn\n",
    "import statsmodels.formula.api as smf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures, PowerTransformer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.feature_selection import RFECV, RFE\n",
    "from sklearn.compose import TransformedTargetRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Fit the RFE-based models, one per index family, and export selected result columns.\n",
    "gini_model = {}\n",
    "gini_results = {}\n",
    "fieldlist = ['exp_percentage', 'dispersion', 'gini', 'tranche_loss_per', 'mispricing']\n",
    "for index_type in ('HY', 'IG', 'EU', 'XO'):\n",
    "    risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
    "    # Keep only the rows with index_duration above 1 (filters out the short duration ones).\n",
    "    risk = risk.loc[risk.index_duration > 1]\n",
    "    results, model = disp.create_rfe_models(risk)\n",
    "    gini_results[index_type] = results\n",
    "    gini_model[index_type] = model\n",
    "    out_path = '/home/serenitas/edwin/DispersionModel/' + index_type + '_results_rfecv.csv'\n",
    "    results[fieldlist].to_csv(out_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Examine the effect of any particular variable.\n",
    "# For each index family and each tranche attachment group, sweep one model input\n",
    "# across its observed [min, max] range while holding every other input at the\n",
    "# values of the group's last row, and record expit(model prediction).\n",
    "steps = 100\n",
    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
    "    plots = {}\n",
    "    tranche_attach = []\n",
    "\n",
    "    res = gini_results[index_type]\n",
    "    mod = gini_model[index_type]\n",
    "\n",
    "    Xs = res[mod.feature_names_in_]\n",
    "    features = list(Xs.columns)\n",
    "\n",
    "    for i, X in Xs.groupby('attach'):\n",
    "        # NOTE(review): position 5 of the index tuple is presumably the attach level -- confirm.\n",
    "        tranche_attach.append(X.index[0][5])\n",
    "        baseline = X.iloc[-1]\n",
    "        for var in features:\n",
    "            bins = np.linspace(X[var].min(), X[var].max(), num=steps)\n",
    "            # Build the scenario frame in one shot (in model column order) rather than\n",
    "            # growing it with one pd.concat per column.\n",
    "            testing_df = pd.DataFrame(\n",
    "                {col: (bins if col == var else np.repeat(baseline[col], steps)) for col in features}\n",
    "            )\n",
    "            plots[i, var] = pd.Series(expit(mod.predict(testing_df)), index=testing_df[var])\n",
    "\n",
    "    sensitivies = pd.concat(plots, names=['attach', 'shock', 'value'])\n",
    "    sensitivies.to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_sensitivies.csv')\n",
    "\n",
    "    # Size the grids from the data instead of assuming four tranches, and do not rely\n",
    "    # on the loop variable X leaking out of the groupby loop.\n",
    "    n_rows = len(tranche_attach)\n",
    "    n_cols = len(features)\n",
    "\n",
    "    # One panel per (tranche, shocked variable); plots is in row-major insertion order.\n",
    "    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 10), squeeze=False)\n",
    "    for i, p in enumerate(plots):\n",
    "        x_loc, y_loc = divmod(i, n_cols)\n",
    "        if x_loc == 0:\n",
    "            axes[x_loc, y_loc].set_title(p[1])\n",
    "        plots[p].plot(ax=axes[x_loc, y_loc], label=i, xlabel=\"\")\n",
    "    # Label every row (previously only the first three rows were labelled).\n",
    "    for i, attach in enumerate(tranche_attach):\n",
    "        row_ax = axes[i, 0]\n",
    "        row_ax.text(-0.2, 0.5, \"tranche attach: \" + str(attach),\n",
    "                    transform=row_ax.transAxes,\n",
    "                    verticalalignment='center',\n",
    "                    rotation=90)\n",
    "    # Include index_type in the file name so each index family keeps its own figure\n",
    "    # instead of overwriting the previous one.\n",
    "    fig.savefig(\"/home/serenitas/edwin/PythonGraphs/\" + index_type + \"_dispersion_model.png\", bbox_inches='tight')\n",
    "\n",
    "    # Consolidated view: one panel per tranche with all shocked variables overlaid.\n",
    "    fig_1, axes_1 = plt.subplots(nrows=n_rows, ncols=1, figsize=(15, 8), squeeze=False)\n",
    "    axes_1 = axes_1[:, 0]\n",
    "    for i, p in enumerate(plots):\n",
    "        x_loc = i // n_cols\n",
    "        plots[p].plot(ax=axes_1[x_loc], label=p[1], xlabel=\"\", legend=True)\n",
    "    for i, attach in enumerate(tranche_attach):\n",
    "        axes_1[i].text(-0.05, 0.5, \"tranche attach: \" + str(attach),\n",
    "                       transform=axes_1[i].transAxes,\n",
    "                       verticalalignment='center',\n",
    "                       rotation=90)\n",
    "    fig_1.savefig(\"/home/serenitas/edwin/PythonGraphs/\" + index_type + \"_dispersion_model_consolidated.png\", bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Section 2----------------------------------------------------\n",
    "# Export a date-by-series table of gini values for each index family.\n",
    "for index_type in ('HY', 'IG', 'EU', 'XO'):\n",
    "    # Cross-section: attach 0, 5yr tenor, this index family.\n",
    "    base_tranche = gini_results[index_type].xs(\n",
    "        (0, '5yr', index_type), level=['attach', 'tenor', 'index']\n",
    "    )\n",
    "    # Take the last row of each (date, series) group, then pivot series into columns.\n",
    "    last_per_series = base_tranche.groupby(['date', 'series']).nth(-1)\n",
    "    ginis = last_per_series['gini'].unstack(level='series')\n",
    "    ginis.to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_gini.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "#Section 3----------------------------------------------------\n",
    "# Fixed model: build one model per index family with disp.create_models_v2,\n",
    "# export selected result columns to CSV, then display the HY regression summary.\n",
    "value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()\n",
    "# NOTE(review): BDay(1) * 365 * 4 is 1460 *business* days, which is more than four\n",
    "# calendar years -- confirm that is the intended window.\n",
    "start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()\n",
    "end = datetime.datetime.today()\n",
    "gini_model, gini_results = {}, {}\n",
    "conn = dbconn(\"serenitasdb\")\n",
    "conn.autocommit = True\n",
    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
    "    risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type)\n",
    "    risk = risk[risk.index_duration > 1] #filter out the short duration ones\n",
    "    gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)\n",
    "\n",
    "fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']\n",
    "for index_type in ['HY', 'IG', 'EU', 'XO']:\n",
    "    gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_results.csv')\n",
    "\n",
    "# Keep this as the cell's last expression: a bare expression in the middle of a cell\n",
    "# is evaluated and thrown away, so the summary was never rendered before.\n",
    "gini_model['HY'].fit().summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Scatter the HY model's residuals against its fitted values, both mapped through expit.\n",
    "fitted = gini_model['HY'].fit()\n",
    "fitted_prob = expit(fitted.fittedvalues)\n",
    "# Residual on the expit scale: expit(fitted + resid) minus expit(fitted).\n",
    "resid_prob = expit(fitted.fittedvalues + fitted.resid) - fitted_prob\n",
    "xmin = min(fitted_prob)\n",
    "xmax = max(fitted_prob)\n",
    "\n",
    "resid_fig, resid_ax = plt.subplots(figsize=(8, 5))\n",
    "p = resid_ax.scatter(x=fitted_prob, y=resid_prob, edgecolor='k')\n",
    "# Zero reference line, extended 10% beyond the fitted range on both sides.\n",
    "resid_ax.hlines(y=0, xmin=xmin * 0.9, xmax=xmax * 1.1, color='red', linestyle='--', lw=3)\n",
    "resid_ax.set_xlabel(\"Fitted values\", fontsize=15)\n",
    "resid_ax.set_ylabel(\"Residuals\", fontsize=15)\n",
    "resid_ax.set_title(\"Fitted vs. residuals plot\", fontsize=18)\n",
    "resid_ax.grid(True)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Section 4----------------------------------------------------\n",
    "# Can gini itself be modelled? Try an OLS on log duration and log expected loss.\n",
    "index_type = 'HY'\n",
    "model_cols = ['gini', 'index_duration', 'index_expected_loss']\n",
    "# Drop rows missing any of the regression inputs before fitting.\n",
    "risk = disp.get_tranche_data(dbconn(\"serenitasdb\"), index_type).dropna(subset=model_cols)\n",
    "data = risk[model_cols]\n",
    "formula = \"gini ~ np.log(index_duration) + np.log(index_expected_loss)\"\n",
    "ols_model = smf.ols(formula, data=data).fit()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}