In [None]:
import pandas as pd
import numpy as np
import itertools
import datetime
import exploration.dispersion as disp
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import serenitas.analytics.tranche_data as tdata

from serenitas.analytics.basket_index import MarkitBasketIndex
from serenitas.analytics import on_the_run
from statsmodels.graphics.regressionplots import plot_fit
from scipy.special import logit, expit
from serenitas.utils.db import dbengine, dbconn

In [None]:
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFECV, RFE
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor

In [None]:
%matplotlib inline

In [None]:
#Section 1----------------------------------------------------
#index basis doesn't work with HY (opposite reaction to what I think)
#RFE
# Columns removed from the tranche risk frame before fitting; every column
# that is NOT listed here is kept as a model feature.
drop_variable_list = [
    'tranche_loss_per', 'tranche_id', 'index_price', 'detach', 'corr_at_detach',
    'corr01', 'exp_percentage', 'indexfactor', 'duration', 'index_expected_loss',
    'index_theta', 'delta', 'expected_loss', 'attach_adj', 'detach_adj',
    'cumulativeloss',
    'forward_delta',
    # Candidate features: comment an entry out (so it is no longer dropped)
    # to include that column as a feature.
    # 'index_duration',
    'thickness',
    'moneyness',
    # 'index_basis',
    # 'att_moneyness',
    # 'det_moneyness',
    'dispersion',
    # 'gini',
    'gamma',
    'theta',
    # 'index_theta' used to be repeated here; it is already listed above.
]

In [None]:
def run_rfe(index_type, random_state=None):
    """Fit a feature-selected linear model of tranche loss for one index type.

    The tranche data returned by ``disp.get_tranche_data`` is restricted to
    the "bottom stack" (every tranche except the one with the highest attach
    point) with a strictly positive ``tranche_loss_per``.  The target is
    ``logit(tranche_loss_per)``; the features are the columns left after
    dropping ``drop_variable_list``.  RFECV first picks the number of features,
    then the pipeline is refitted with a plain RFE using that number.

    Parameters
    ----------
    index_type : str
        Index identifier forwarded to ``disp.get_tranche_data``
        (the notebook calls this with 'HY', 'IG', 'EU' and 'XO').
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``.  The default ``None`` keeps
        the previous behaviour (a different split on every call); pass an
        integer for a reproducible split.

    Returns
    -------
    model : sklearn Pipeline
        The fitted PowerTransformer -> PolynomialFeatures -> RFE pipeline.
    df : pandas.DataFrame
        The full tranche data with ``predict_tranche_loss``,
        ``predict_tranche_loss_per_index`` and ``mispricing`` columns added.
    X : pandas.DataFrame
        The feature matrix (train and test rows together) used for the fit.
    """
    risk = disp.get_tranche_data(dbconn("serenitasdb"), index_type)
    attach_max = risk.index.get_level_values("attach").max()
    bottom_stack = risk[risk.index.get_level_values("attach") != attach_max]
    bottom_stack = bottom_stack[bottom_stack.tranche_loss_per > 0].dropna()

    # prepare the variables
    y = logit(bottom_stack['tranche_loss_per'])
    X = bottom_stack.drop(drop_variable_list, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    pipe_rfe = make_pipeline(
        PowerTransformer(),
        #PolynomialFeatures(degree=2),
        PolynomialFeatures(interaction_only=True),
        RFECV(estimator=LinearRegression(), cv=10, min_features_to_select=1),
    )

    # First pass: let cross-validation choose how many features to keep.
    pipe_rfe.fit(X_train, y_train)
    n_features_to_select = pipe_rfe['rfecv'].n_features_

    # Second pass: swap the last step for a plain RFE with that count and refit.
    pipe_rfe.steps[-1] = (
        'rfe',
        RFE(estimator=LinearRegression(), n_features_to_select=n_features_to_select),
    )
    model = pipe_rfe.fit(X_train, y_train)

    # NOTE: RandomForestRegressor / GradientBoostingRegressor / VotingRegressor
    # alternatives were previously kept here as commented-out experiments.

    predicted = pd.DataFrame(
        expit(model.predict(X)),
        index=X.index,
        columns=['predict_tranche_loss'],
    )
    df = pd.merge(risk, predicted, how='left', left_index=True, right_index=True)

    # Predicted tranche loss as a share of index expected loss (bottom stack only).
    df.loc[df.index.get_level_values("attach") != attach_max, "predict_tranche_loss_per_index"] = (
        df.predict_tranche_loss * df.thickness / df.index_expected_loss
    )

    def aux(s):
        """Set the last element of the group to one minus the sum of the others.

        Assumes the last row of each group is the top tranche -- TODO confirm
        the frame is sorted by attach within (index, series, date).
        """
        # Work on a copy: ``s.values`` can be a read-only view of the frame's
        # data (pandas Copy-on-Write), which makes in-place assignment fail.
        temp = s.to_numpy(copy=True)
        temp[-1] = 1 - temp[:-1].sum()
        return temp

    df["predict_tranche_loss_per_index"] = df.groupby(["index", "series", "date"])["predict_tranche_loss_per_index"].transform(aux)
    df = df.assign(
        mispricing=(df.exp_percentage - df.predict_tranche_loss_per_index)
        * df.index_expected_loss
        / (df.detach_adj - df.attach_adj)
    )

    # ``get_feature_names`` was removed from recent scikit-learn releases in
    # favour of ``get_feature_names_out``; support both.
    poly = pipe_rfe['polynomialfeatures']
    if hasattr(poly, "get_feature_names_out"):
        feature_names = poly.get_feature_names_out(X.columns)
    else:
        feature_names = poly.get_feature_names(X.columns)

    print(index_type, " num features: ", n_features_to_select)
    print(index_type, " Chosen columns: ", np.array(feature_names)[pipe_rfe['rfe'].support_])
    print(index_type, " Training Score: ", model.score(X_train, y_train))
    print(index_type, " Testing Score: ", model.score(X_test, y_test))

    return model, df, X

# Run the RFE model for every index type and export the key columns to CSV.
gini_model, gini_results, gini_X = {}, {}, {}
fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']
for index_type in ['HY', 'IG', 'EU', 'XO']:
    rfe_model, rfe_results, rfe_X = run_rfe(index_type)
    gini_model[index_type] = rfe_model
    gini_results[index_type] = rfe_results
    gini_X[index_type] = rfe_X
    rfe_csv_path = '/home/serenitas/edwin/DispersionModel/' + index_type + '_results_rfecv.csv'
    rfe_results[fieldlist].to_csv(rfe_csv_path)

In [None]:
#examine the effect of any particular variable
# For every index type and every attach group: sweep one feature at a time
# from its min to its max (holding the others at the group's last observed
# row), record the model prediction, then export and plot the curves.
steps = 100
for index_type in ['HY', 'IG', 'EU', 'XO']:
    features_all = gini_X[index_type]
    feature_cols = list(features_all.columns)
    n_cols = len(feature_cols)
    plots = {}
    tranche_attach = []

    for attach, X in features_all.groupby('attach'):
        # The groupby key is the attach value itself; no positional lookup
        # into the index tuple is needed.
        tranche_attach.append(attach)
        base_row = X.iloc[-1]
        for var in feature_cols:
            bins = np.linspace(X[var].min(), X[var].max(), num=steps)
            # Every column pinned at the base row, then the shocked one swept.
            testing_df = pd.DataFrame(
                {col: np.repeat(base_row[col], steps) for col in feature_cols}
            )
            testing_df[var] = bins
            plots[attach, var] = pd.Series(
                expit(gini_model[index_type].predict(testing_df[feature_cols])),
                index=pd.Index(bins, name=var),
            )

    sensitivies = pd.concat(plots, names=['attach', 'shock', 'value'])
    sensitivies.to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_sensitivies.csv')

    # One row per attach group (previously hard-coded to 3); squeeze=False
    # keeps ``axes`` two-dimensional even with a single row or column.
    n_rows = len(tranche_attach)

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 10), squeeze=False)
    for i, p in enumerate(plots):
        # ``plots`` was filled attach-major, feature-minor, so the running
        # counter maps directly onto (row, column).
        x_loc, y_loc = divmod(i, n_cols)
        if x_loc == 0:
            axes[x_loc, y_loc].set_title(p[1])
        plots[p].plot(ax=axes[x_loc, y_loc], label=i, xlabel="")
    for row, attach in enumerate(tranche_attach):
        axes[row, 0].text(-0.2, 0.5, "tranche attach: " + str(attach),
                          transform=axes[row, 0].transAxes,
                          verticalalignment='center',
                          rotation=90)
    # Per-index file so one index no longer overwrites another ...
    fig.savefig("/home/serenitas/edwin/PythonGraphs/" + index_type + "_dispersion_model.png", bbox_inches='tight')
    # ... and the legacy un-suffixed path is still written for existing
    # consumers (it ends up holding the last index, as before).
    fig.savefig("/home/serenitas/edwin/PythonGraphs/dispersion_model.png", bbox_inches='tight')

    fig_1, axes_1 = plt.subplots(nrows=n_rows, ncols=1, figsize=(15, 8), squeeze=False)
    for i, p in enumerate(plots):
        x_loc = i // n_cols
        plots[p].plot(ax=axes_1[x_loc, 0], label=p[1], xlabel="", legend=True)
    for row, attach in enumerate(tranche_attach):
        axes_1[row, 0].text(-0.05, 0.5, "tranche attach: " + str(attach),
                            transform=axes_1[row, 0].transAxes,
                            verticalalignment='center',
                            rotation=90)
    fig_1.savefig("/home/serenitas/edwin/PythonGraphs/" + index_type + "_dispersion_model_consolidated.png", bbox_inches='tight')
    fig_1.savefig("/home/serenitas/edwin/PythonGraphs/dispersion_model_consolidated.png", bbox_inches='tight')

In [None]:
#Section 2----------------------------------------------------
#export the gini coefficients (one CSV per index type, one column per series)
for index_type in ['HY', 'IG', 'EU', 'XO']:
    # Keep only the attach == 0, 5yr rows of this index.
    attach0_5yr = gini_results[index_type].xs(
        [0, '5yr', index_type],
        level=['attach', 'tenor', 'index'],
    )
    # Last row of every (date, series) group.
    last_per_group = attach0_5yr.groupby(['date', 'series']).nth(-1)
    # Pivot the series level into columns.
    ginis = last_per_group.gini.unstack(level='series')
    gini_csv_path = '/home/serenitas/edwin/DispersionModel/' + index_type + '_gini.csv'
    ginis.to_csv(gini_csv_path)

In [None]:
#Section 3----------------------------------------------------
#Fixed Model
# Rebuild gini_results / gini_model with disp.create_models_v2 (this replaces
# the RFE results from Section 1), export the key columns, and show the HY fit.
value_date = (datetime.datetime.today() - pd.offsets.BDay(1)).date()
# NOTE(review): BDay(1) * 365 * 4 is 1460 *business* days, which is more than
# four calendar years -- confirm the intended window. value_date / start / end
# are not referenced in the visible cells.
start = (datetime.datetime.today() - pd.offsets.BDay(1) * 365 *4).date()
end = datetime.datetime.today()
gini_model, gini_results = {}, {}
conn = dbconn("serenitasdb")
conn.autocommit = True
for index_type in ['HY', 'IG', 'EU', 'XO']:
    risk = disp.get_tranche_data(dbconn("serenitasdb"), index_type)
    risk = risk[risk.index_duration > 1] #filter out the short duration ones
    gini_results[index_type], gini_model[index_type] = disp.create_models_v2(conn, risk)

fieldlist = ['exp_percentage','dispersion','gini','tranche_loss_per','mispricing']
for index_type in ['HY', 'IG', 'EU', 'XO']:
    gini_results[index_type][fieldlist].to_csv('/home/serenitas/edwin/DispersionModel/' + index_type + '_results.csv')

# Kept as the LAST expression of the cell: a notebook only displays the final
# expression, so the summary was silently discarded while it sat mid-cell.
gini_model['HY'].fit().summary()

In [None]:
#plot the residuals
# Fitted-vs-residual scatter for the HY model, shown after applying expit to
# the fitted values (the residual is the expit-space difference).
fitted = gini_model['HY'].fit()
fitted_prob = expit(fitted.fittedvalues)
observed_prob = expit(fitted.fittedvalues + fitted.resid)
xmin = min(fitted_prob)
xmax = max(fitted_prob)

fig_resid, ax_resid = plt.subplots(figsize=(8, 5))
p = ax_resid.scatter(x=fitted_prob, y=observed_prob - fitted_prob, edgecolor='k')
# Zero-residual reference line, extended slightly past the data range.
ax_resid.hlines(y=0, xmin=xmin*0.9, xmax=xmax*1.1, color='red', linestyle='--', lw=3)
ax_resid.set_xlabel("Fitted values", fontsize=15)
ax_resid.set_ylabel("Residuals", fontsize=15)
ax_resid.set_title("Fitted vs. residuals plot", fontsize=18)
ax_resid.grid(True)
plt.show()