Diffstat (limited to 'python/exploration')
-rw-r--r--  python/exploration/dispersion.py  96
1 file changed, 85 insertions, 11 deletions
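For orientation, a minimal sketch of how the create_rfe_models function added in this commit might be driven. The signature of get_tranche_data is assumed to mirror get_corr_data(index_type, series, engine) from the same module, and the engine setup is only a placeholder; neither is shown in this hunk.

    # Hypothetical driver, not part of this commit.
    engine = dbengine  # placeholder; how the engine is constructed is not visible in this diff
    tranche_df = get_tranche_data("HY", 29, engine)  # signature assumed from get_corr_data
    scored_df, rfe_model = create_rfe_models(tranche_df, print_score=True)
    print(scored_df["mispricing"].describe())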
diff --git a/python/exploration/dispersion.py b/python/exploration/dispersion.py
index e633264d..a2165754 100644
--- a/python/exploration/dispersion.py
+++ b/python/exploration/dispersion.py
@@ -5,10 +5,16 @@ import statsmodels.api as sm
import statsmodels.formula.api as smf
from serenitas.analytics.basket_index import MarkitBasketIndex
-from serenitas.analytics import CreditIndex
+from serenitas.analytics.api import CreditIndex
from scipy.special import logit, expit
from serenitas.utils.db import dbengine
+from sklearn.feature_selection import RFECV, RFE
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
+from sklearn.linear_model import LinearRegression
+
def get_corr_data(index_type, series, engine):
sql_str = (
@@ -106,11 +112,6 @@ def create_models(conn, df) -> (pd.DataFrame, float):
f.predict(bottom_stack)
)
- def aux(s):
- temp = s.values
- temp[-1] = 1 - temp[:-1].sum()
- return temp
-
df["predict"] = df.groupby(["index", "series", "date"])["predict"].transform(aux)
df = df.assign(
mispricing=(df.exp_percentage - df.predict)
@@ -148,11 +149,6 @@ def create_models_v2(conn, df, weights=None) -> (pd.DataFrame, float):
df.predict_tranche_loss * df.thickness / df.index_expected_loss
)
- def aux(s):
- temp = s.values
- temp[-1] = 1 - temp[:-1].sum()
- return temp
-
df["predict"] = df.groupby(["index", "series", "date"])["predict"].transform(aux)
df = df.assign(
mispricing=(df.exp_percentage - df.predict)
@@ -202,6 +198,84 @@ def create_separate_models(df):
return (calc, model)
+def aux(s):
+    # overwrite the last value with the residual so each group sums to 1
+    temp = s.values
+    temp[-1] = 1 - temp[:-1].sum()
+    return temp
+
+
+def create_rfe_models(df, print_score=False):
+    # df is the output of get_tranche_data; its index carries index, series, date and attach levels
+ attach_max = df.index.get_level_values("attach").max()
+ bottom_stack = df[df.index.get_level_values("attach") != attach_max]
+ bottom_stack = bottom_stack[bottom_stack.tranche_loss_per > 0].dropna()
+
+ # prepare the variables
+ y = logit(bottom_stack["tranche_loss_per"])
+ X = bottom_stack[
+ ["index_duration", "index_basis", "att_moneyness", "det_moneyness", "gini"]
+ ]
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+ pipe_rfe = make_pipeline(
+ PowerTransformer(),
+ PolynomialFeatures(interaction_only=True),
+ RFECV(estimator=LinearRegression(), cv=10, min_features_to_select=1),
+ )
+
+ pipe_rfe.fit(X_train, y_train)
+    # swap the fitted RFECV step for a plain RFE pinned to the selected feature count
+    pipe_rfe.steps[-1] = (
+ "rfe",
+ RFE(
+ estimator=LinearRegression(),
+ n_features_to_select=pipe_rfe["rfecv"].n_features_,
+ ),
+ )
+ model = pipe_rfe.fit(X_train, y_train)
+
+ df = pd.merge(
+ df,
+ pd.DataFrame(
+ expit(model.predict(X)), index=X.index, columns=["predict_tranche_loss"]
+ ),
+ how="left",
+ left_index=True,
+ right_index=True,
+ )
+
+ df.loc[
+ df.index.get_level_values("attach") != attach_max,
+ "predict_tranche_loss_per_index",
+ ] = (
+ df.predict_tranche_loss * df.thickness / df.index_expected_loss
+ )
+
+ df["predict_tranche_loss_per_index"] = df.groupby(["index", "series", "date"])[
+ "predict_tranche_loss_per_index"
+ ].transform(aux)
+ df = df.assign(
+ mispricing=(df.exp_percentage - df.predict_tranche_loss_per_index)
+ * df.index_expected_loss
+ / (df.detach_adj - df.attach_adj)
+ )
+
+ if print_score:
+ index_type = df.index[0][1]
+        print(index_type, " input features: ", model.feature_names_in_)
+ print(
+ index_type,
+ " Chosen columns: ",
+ np.array(model["polynomialfeatures"].get_feature_names_out(X.columns))[
+ model["rfe"].support_
+ ],
+ )
+ print(index_type, " Training Score: ", model.score(X_train, y_train))
+ print(index_type, " Testing Score: ", model.score(X_test, y_test))
+
+ return (df, model)
+
+
if __name__ == "__main__":
index_type = "HY"
series = 29