Diffstat (limited to 'python/exploration/dispersion.py')
| -rw-r--r-- | python/exploration/dispersion.py | 113 |
1 file changed, 41 insertions, 72 deletions
diff --git a/python/exploration/dispersion.py b/python/exploration/dispersion.py
index 5b0c0968..fac383c5 100644
--- a/python/exploration/dispersion.py
+++ b/python/exploration/dispersion.py
@@ -1,4 +1,3 @@
-import bottleneck as bn
 import datetime
 import numpy as np
 import pandas as pd
@@ -7,26 +6,10 @@
 import statsmodels.formula.api as smf
 from analytics.basket_index import MarkitBasketIndex
 from analytics import CreditIndex
-from dateutil.relativedelta import relativedelta
+from scipy.special import logit, expit
 from utils.db import dbengine
 
 
-def get_dispersion(index_type, series, use_gini=False, use_log=True, dr=None):
-    index = MarkitBasketIndex(index_type, series, ["5yr"])
-    if dr is None:
-        dr = pd.bdate_range(
-            index.issue_date, datetime.datetime.today() - pd.offsets.BDay(1)
-        )
-
-    dispersion = []
-    for d in dr:
-        # print(d)
-        index.value_date = d
-        dispersion.append(index.dispersion(use_gini, use_log))
-
-    return pd.DataFrame(dispersion, index=dr, columns=["dispersion"])
-
-
 def get_corr_data(index_type, series, engine):
     sql_str = (
         "SELECT quotedate::date, indexrefspread, indexrefprice, index_duration, "
@@ -56,17 +39,22 @@ def get_corr_data(index_type, series, engine):
     return df
 
 
-def get_tranche_data(index_type, engine):
+def get_tranche_data(conn, index_type, tenor="5yr"):
     sql_string = (
         "SELECT * FROM risk_numbers "
         "LEFT JOIN index_version USING (index, series, version) "
-        "WHERE index = %s"
+        "WHERE index = %s AND tenor=%s"
     )
     df = pd.read_sql_query(
-        sql_string, engine, parse_dates={"date": {"utc": True}}, params=[index_type]
+        sql_string,
+        conn,
+        parse_dates={"date": {"utc": True}},
+        params=(index_type, tenor),
     )
     del df["basketid"]
-    df.date = df.date.dt.normalize().dt.tz_convert(None)
+    df.date = (
+        df.date.dt.tz_convert("America/New_York").dt.tz_localize(None).dt.normalize()
+    )
     df = df.groupby(
         ["date", "index", "series", "version", "tenor", "attach"], as_index=False
     ).mean()
@@ -78,64 +66,45 @@
         detach_adj=lambda x: np.minimum(
             (x.detach - x.cumulativeloss) / df.indexfactor, 1
         ),
+        moneyness=lambda x: (x.detach_adj + x.attach_adj) / 2 / x.index_expected_loss,
     )
-    df = df.assign(
-        moneyness=lambda x: (x.detach_adj + x.attach_adj)
-        / 2
-        / x.indexfactor
-        / x.index_expected_loss,
-    )
-    df.set_index(
-        ["date", "index", "series", "tenor", "attach"], append=True, inplace=True
+    df = df.set_index(["date", "index", "series", "version", "tenor", "attach"])
+    series = tuple(df.index.get_level_values("series").unique())
+    dispersion = pd.read_sql_query(
+        "SELECT date, index, series, version, tenor, dispersion, gini from index_quotes "
+        "WHERE index=%s AND series IN %s AND tenor=%s",
+        conn,
+        params=(index_type, series, tenor),
+        index_col=["date", "index", "series", "version", "tenor"],
     )
-    df.reset_index(level=0, drop=True, inplace=True)
+    df = df.join(dispersion)
     return df
 
 
-def create_models(df, use_gini=False, use_log=True):
+def create_models(conn, df) -> (pd.DataFrame, float):
     # Takes the output of get_tranche_data
-    dispersion = {}
-    for g, _ in df.groupby(["series", "index"]):
-        temp = df.xs(g[0], level="series")
-        date_range = temp.index.get_level_values("date").unique()
-        dispersion[g[0]] = get_dispersion(
-            g[1], g[0], use_gini=use_gini, use_log=use_log, dr=date_range
-        )
-    dispersion = pd.concat(dispersion)
-    dispersion.index.rename("series", level=0, inplace=True)
-    df = df.merge(dispersion, left_index=True, right_index=True)
-    df.dropna(subset=["dispersion"], inplace=True)
-    gini_model, gini_calc = {}, {}
-    for attach in df.index.get_level_values("attach").unique():
-        gini_calc[attach] = df.xs(
-            ["5yr", attach], level=["tenor", "attach"], drop_level=False
-        )
-        gini_model[attach] = smf.ols(
-            "np.log(exp_percentage) ~ "
-            "dispersion + "
-            "np.log(index_duration) + "
-            "np.log(moneyness)",
-            data=df.xs(attach, level="attach"),
-        ).fit()
-        gini_calc[attach]["predict"] = np.exp(
-            gini_model[attach].predict(gini_calc[attach])
-        )
-    gini_calc = pd.concat(gini_calc, sort=False).reset_index(level=0, drop=True)
-    normalization = gini_calc.groupby(["date", "index", "series", "tenor"])[
-        "predict"
-    ].sum()
-    gini_calc = gini_calc.merge(
-        normalization, left_index=True, right_index=True, suffixes=["_preN", "_sum"]
+    attach_max = df.index.get_level_values("attach").max()
+    bottom_stack = df[df.index.get_level_values("attach") != attach_max]
+    model = smf.ols(
+        "logit(exp_percentage) ~ np.log(index_duration) + "
+        "I(np.log(index_expected_loss)**2) + "
+        "np.log(moneyness)*dispersion + "
+        "np.log(index_expected_loss)*dispersion + "
+        "I(np.log(moneyness)**2) + I(np.log(moneyness)**3)",
+        data=bottom_stack,
    )
-    gini_calc["predict_N"] = gini_calc["predict_preN"] / gini_calc["predict_sum"]
-    gini_calc["mispricing"] = (
-        (gini_calc["exp_percentage"] - gini_calc["predict_N"])
-        * gini_calc["index_expected_loss"]
-        / (gini_calc["detach_adj"] - gini_calc["attach_adj"])
-        / gini_calc["indexfactor"]
-        * 10000
+    f = model.fit()
+    df.loc[df.index.get_level_values("attach") != attach_max, "predict"] = expit(
+        f.predict(bottom_stack)
     )
-    return gini_model, gini_calc
+
+    def aux(s):
+        temp = s.values
+        temp[-1] = 1 - temp[:-1].sum()
+        return temp
+
+    df["predict"] = df.groupby(["index", "series", "date"])["predict"].transform(aux)
+    return (df, model)
 
 
 if __name__ == "__main__":
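Note (not part of the commit): the core of the new create_models() is a single logit-scale OLS fit over every tranche except the most senior one, with fitted values mapped back through expit and the most senior tranche absorbing the residual so each day's tranche shares sum to one. The sketch below isolates that renormalization step. It is illustrative only: the toy DataFrame, column names, and numbers are invented, and it assumes rows are ordered by attach within each (index, series, date) group, as the aux() helper in the patch does.

# Illustrative sketch (not part of the commit): toy data mimicking the
# renormalization that the patched create_models() applies via aux().
import numpy as np
import pandas as pd
from scipy.special import expit

toy = pd.DataFrame(
    {
        "index": ["HY"] * 4,
        "series": [33] * 4,
        "date": ["2020-06-01"] * 4,
        "attach": [0.00, 0.15, 0.25, 0.35],
        # hypothetical fitted values on the logit scale; the most senior
        # tranche (largest attach) is not modelled, hence NaN
        "logit_fit": [1.2, -0.8, -2.5, np.nan],
    }
).set_index(["index", "series", "date", "attach"])

# map the fitted values back into [0, 1], as expit(f.predict(...)) does
toy["predict"] = expit(toy["logit_fit"])


def renormalize(s):
    # assign the residual mass to the last (most senior) tranche so the
    # per-group tranche shares sum exactly to one
    vals = s.to_numpy(copy=True)
    vals[-1] = 1 - vals[:-1].sum()
    return vals


toy["predict"] = toy.groupby(["index", "series", "date"])["predict"].transform(renormalize)
print(toy["predict"].groupby(["index", "series", "date"]).sum())  # -> 1.0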
