import datetime

import bottleneck as bn
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

from analytics import CreditIndex
from analytics.basket_index import MarkitBasketIndex
from utils.db import dbengine


def get_dispersion(index_type, series, end_date=None):
    """Daily cross-sectional dispersion of log spreads in an index basket."""
    # A default of datetime.date.today() would be evaluated once at import
    # time and go stale; resolve it at call time instead.
    if end_date is None:
        end_date = datetime.date.today()
    index = MarkitBasketIndex(index_type, series, ["5yr"])
    dr = pd.bdate_range(index.issue_date, end_date)
    dispersion = []
    for d in dr:
        index.value_date = d
        dispersion.append(bn.nanstd(np.log(index.spreads())))
    return pd.Series(dispersion, index=dr, name="dispersion")


def add_cumloss(df, index_type, series, engine):
    """Join the cumulative loss of the prevailing index version onto df."""
    cumloss = pd.read_sql_query(
        "SELECT lastdate, cumulativeloss AS cumloss FROM index_version "
        "WHERE index=%s AND series=%s ORDER BY lastdate",
        engine,
        params=(index_type, series),
        parse_dates=["lastdate"],
    )
    # The current version is still open-ended; push its lastdate to the far
    # future so the backfill below covers all observation dates.
    cumloss.iat[-1, 0] = pd.Timestamp.max
    cumloss = (
        cumloss.set_index("lastdate").sort_index().reindex(df.index, method="bfill")
    )
    return df.join(cumloss)


def get_corr_data(index_type, series, engine):
    """Tranche correlation quotes joined with index-level risk measures."""
    sql_str = (
        "SELECT quotedate::date, indexrefspread, indexrefprice, index_duration, "
        "index_expected_loss, corr_at_detach "
        "FROM tranche_risk JOIN tranche_quotes "
        "ON tranche_risk.tranche_id=tranche_quotes.id "
        "WHERE index=%s AND series=%s AND tenor='5yr' AND detach=%s "
        "ORDER BY quotedate DESC"
    )
    df = pd.read_sql_query(
        sql_str,
        engine,
        params=(index_type, series, 3 if index_type == "IG" else 15),
        index_col=["quotedate"],
        parse_dates=["quotedate"],
    )
    if index_type == "HY":
        # HY quotes reference price rather than spread; convert the reference
        # price to its spread equivalent so both index families are comparable.
        spread_equivalent = []
        index = CreditIndex(index_type, series, "5yr")
        for k, v in df.iterrows():
            index.value_date = k
            index.ref = v["indexrefprice"]
            spread_equivalent.append(index.spread)
        df["indexrefspread"] = spread_equivalent
    # Fisher z-transform of the correlation at the detachment point.
    df = df.assign(
        fisher=lambda x: 0.5 * np.log((1 + x.corr_at_detach) / (1 - x.corr_at_detach))
    )
    return df


def get_tranche_data(index_type, engine):
    """Tranche risk numbers merged with version data as of each quote date."""
    idx_ver = pd.read_sql_query(
        "SELECT * FROM index_version WHERE index = %s",
        engine,
        params=[index_type],
        parse_dates=["lastdate"],
    )
    # Open-ended (current) versions get a far-future date so that merge_asof
    # below can still match them.
    idx_ver["date"] = pd.to_datetime(
        [
            d.strftime("%Y-%m-%d") if not pd.isnull(d) else datetime.date(2050, 1, 1)
            for d in idx_ver["lastdate"]
        ]
    )
    df = pd.read_sql_query(
        "SELECT * FROM risk_numbers WHERE index = %s",
        engine,
        parse_dates={"date": {"utc": True}},
        params=[index_type],
    )
    df["exp_percentage"] = df["expected_loss"] / df["index_expected_loss"]
    df.date = df.date.dt.normalize().dt.tz_convert(None)
    # Average duplicate quotes per tranche and day.
    df = df.groupby(["date", "index", "series", "tenor", "attach"]).mean()
    df.reset_index(inplace=True)
    idx_ver.sort_values("date", inplace=True)
    # Attach the version in effect on each quote date (next version boundary
    # at or after the quote date).
    df = pd.merge_asof(
        df,
        idx_ver[["date", "series", "cumulativeloss", "indexfactor"]],
        on="date",
        by="series",
        direction="forward",
    )
    df.set_index("date", inplace=True)
    # Moneyness: remaining tranche width relative to remaining expected loss.
    df["moneyness"] = (
        (df.detach - df.cumulativeloss) / df.indexfactor / df.index_expected_loss
    )
    return df


def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # Work on a float copy so the caller's array is not mutated in place.
    array = np.asarray(array, dtype=float).copy()
    if np.amin(array) < 0:
        array -= np.amin(array)  # values cannot be negative
    array += 0.0000001  # values cannot be 0
    array = np.sort(array)  # values must be sorted
    index = np.arange(1, array.shape[0] + 1)  # index per array element
    n = array.shape[0]  # number of array elements
    return np.sum((2 * index - n - 1) * array) / (n * np.sum(array))


def get_gini_spreadstdev(index_type, series, tenor, date):
    """Gini coefficient and standard deviation of basket spreads on a date."""
    indices = MarkitBasketIndex(index_type, series, tenor, value_date=date)
    spreads = indices.spreads()
    # Drop extreme quotes (spreads of 100% or more, i.e. distressed names).
    spreads = spreads[spreads < 1]
    return gini(spreads), np.std(spreads)


if __name__ == "__main__":
    index_type = "HY"
    series = 29
    serenitas_engine = dbengine("serenitasdb")

    dispersion = get_dispersion(index_type, series)
    df = get_corr_data(index_type, series, serenitas_engine)
    df = df.join(dispersion)
    # HY regresses on cumulative loss; IG uses the index reference spread.
    if index_type == "HY":
        df = add_cumloss(df, index_type, series, serenitas_engine)
        formula = "fisher ~ np.log(dispersion) + cumloss + np.log(index_duration)"
    else:
        formula = (
            "fisher ~ np.log(dispersion) + np.log(indexrefspread) "
            "+ np.log(index_duration)"
        )
    mod = smf.ols(formula=formula, data=df)
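    # Sketch (not in the original, which stops at model construction): fit the
    # regression and print the estimates. Assumes the intent is to inspect the
    # coefficients on dispersion and the other regressors.
    res = mod.fit()
    print(res.summary())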