diff options
Diffstat (limited to 'python/exploration')
| -rw-r--r-- | python/exploration/dispersion.py | 96 |
1 files changed, 85 insertions, 11 deletions
diff --git a/python/exploration/dispersion.py b/python/exploration/dispersion.py index e633264d..a2165754 100644 --- a/python/exploration/dispersion.py +++ b/python/exploration/dispersion.py @@ -5,10 +5,16 @@ import statsmodels.api as sm import statsmodels.formula.api as smf from serenitas.analytics.basket_index import MarkitBasketIndex -from serenitas.analytics import CreditIndex +from serenitas.analytics.api import CreditIndex from scipy.special import logit, expit from serenitas.utils.db import dbengine +from sklearn.feature_selection import RFECV, RFE +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures, PowerTransformer +from sklearn.linear_model import LinearRegression + def get_corr_data(index_type, series, engine): sql_str = ( @@ -106,11 +112,6 @@ def create_models(conn, df) -> (pd.DataFrame, float): f.predict(bottom_stack) ) - def aux(s): - temp = s.values - temp[-1] = 1 - temp[:-1].sum() - return temp - df["predict"] = df.groupby(["index", "series", "date"])["predict"].transform(aux) df = df.assign( mispricing=(df.exp_percentage - df.predict) @@ -148,11 +149,6 @@ def create_models_v2(conn, df, weights=None) -> (pd.DataFrame, float): df.predict_tranche_loss * df.thickness / df.index_expected_loss ) - def aux(s): - temp = s.values - temp[-1] = 1 - temp[:-1].sum() - return temp - df["predict"] = df.groupby(["index", "series", "date"])["predict"].transform(aux) df = df.assign( mispricing=(df.exp_percentage - df.predict) @@ -202,6 +198,84 @@ def create_separate_models(df): return (calc, model) +def aux(s): + temp = s.values + temp[-1] = 1 - temp[:-1].sum() + return temp + + +def create_rfe_models(df, print_score=False): + # Takes the output of get_tranche_data + attach_max = df.index.get_level_values("attach").max() + bottom_stack = df[df.index.get_level_values("attach") != attach_max] + bottom_stack = bottom_stack[bottom_stack.tranche_loss_per > 0].dropna() + + # prepare the variables + y = logit(bottom_stack["tranche_loss_per"]) + X = bottom_stack[ + ["index_duration", "index_basis", "att_moneyness", "det_moneyness", "gini"] + ] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + + pipe_rfe = make_pipeline( + PowerTransformer(), + PolynomialFeatures(interaction_only=True), + RFECV(estimator=LinearRegression(), cv=10, min_features_to_select=1), + ) + + pipe_rfe.fit(X_train, y_train) + pipe_rfe.steps[-1] = ( + "rfe", + RFE( + estimator=LinearRegression(), + n_features_to_select=pipe_rfe["rfecv"].n_features_, + ), + ) + model = pipe_rfe.fit(X_train, y_train) + + df = pd.merge( + df, + pd.DataFrame( + expit(model.predict(X)), index=X.index, columns=["predict_tranche_loss"] + ), + how="left", + left_index=True, + right_index=True, + ) + + df.loc[ + df.index.get_level_values("attach") != attach_max, + "predict_tranche_loss_per_index", + ] = ( + df.predict_tranche_loss * df.thickness / df.index_expected_loss + ) + + df["predict_tranche_loss_per_index"] = df.groupby(["index", "series", "date"])[ + "predict_tranche_loss_per_index" + ].transform(aux) + df = df.assign( + mispricing=(df.exp_percentage - df.predict_tranche_loss_per_index) + * df.index_expected_loss + / (df.detach_adj - df.attach_adj) + ) + + if print_score: + index_type = df.index[0][1] + print(index_type, " num features: ", model.feature_names_in_) + print( + index_type, + " Chosen columns: ", + np.array(model["polynomialfeatures"].get_feature_names_out(X.columns))[ + model["rfe"].support_ + ], + ) + print(index_type, " Training Score: ", model.score(X_train, y_train)) + print(index_type, " Testing Score: ", model.score(X_test, y_test)) + + return (df, model) + + if __name__ == "__main__": index_type = "HY" series = 29 |
