1 files changed, 57 insertions, 185 deletions
diff --git a/python/notebooks/Single Names Monitoring.ipynb b/python/notebooks/Single Names Monitoring.ipynb
index 46230799..be3f2c47 100644
--- a/python/notebooks/Single Names Monitoring.ipynb
+++ b/python/notebooks/Single Names Monitoring.ipynb
@@ -9,14 +9,16 @@
     "import pandas as pd\n",
     "import numpy as np\n",
     "import datetime\n",
+    "import exploration.dispersion as disp\n",
+    "import matplotlib.pyplot as plt\n",
+    "import statsmodels.formula.api as smf\n",
     "\n",
     "from analytics.basket_index import MarkitBasketIndex\n",
     "from analytics import on_the_run\n",
-    "import matplotlib.pyplot as plt\n",
-    "import statsmodels.formula.api as smf\n",
+    "from statsmodels.graphics.regressionplots import plot_fit\n",
     "from pygam import LinearGAM, s, f, GAM\n",
-    "\n",
     "from utils.db import dbengine\n",
+    "\n",
     "serenitas_engine = dbengine('serenitasdb')"
    ]
   },
@@ -36,8 +38,8 @@
    "outputs": [],
    "source": [
     "value_date = (pd.datetime.today() - pd.offsets.BDay(1)).date()\n",
-    "index_type = 'XO'\n",
-    "series = 28"
+    "index_type = 'HY'\n",
+    "series = 33"
    ]
   },
   {
@@ -46,13 +48,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "series_back = 2  # how many series before `series` the comparison index is (series - series_back)\n",
     "sql_string = \"select * from index_members(%s, %s)\"\n",
     "\n",
     "df = pd.read_sql_query(sql_string, serenitas_engine, params=(index_type + str(series), value_date), index_col=['markit_ticker'])\n",
-    "df1 = pd.read_sql_query(sql_string, serenitas_engine, params=(index_type + str(series-2), value_date), index_col=['markit_ticker'])\n",
+    "df1 = pd.read_sql_query(sql_string, serenitas_engine, params=(index_type + str(series-series_back), value_date), index_col=['markit_ticker'])\n",
     "\n",
     "default_prob = {}\n",
-    "for s in [series, series-2]:\n",
+    "for s in [series, series-series_back]:\n",
     "    index = MarkitBasketIndex(index_type, s, ['5yr'])\n",
     "    surv_prob, tickers = index.survival_matrix()\n",
     "    default_prob[s] = pd.Series(1 - np.ravel(surv_prob), index=tickers)\n",
@@ -60,7 +63,7 @@
     "default_prob.name = 'default_prob'\n",
     "\n",
     "df = df.merge(default_prob.loc[series], left_index=True, right_index = True)\n",
-    "df1 = df1.merge(default_prob.loc[series-2], left_index=True, right_index = True)"
+    "df1 = df1.merge(default_prob.loc[series-series_back], left_index=True, right_index = True)"
    ]
   },
   {
@@ -89,7 +92,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "date_range = pd.bdate_range(value_date - 52 * .5 * pd.offsets.Week(), value_date, freq='5B')\n",
+    "index_type, series, df.nlargest(10, columns='default_prob')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_type, series-series_back, df1.nlargest(10, columns='default_prob')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "date_range = pd.bdate_range(end=value_date, freq='1BM',periods=12)  # 12 business-month-end dates, the last one on or before value_date\n",
     "index = MarkitBasketIndex(index_type, series, ['5yr'])\n",
     "default_prob = {}\n",
     "for d in date_range:\n",
@@ -136,38 +157,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def gini(array):\n",
-    "    \"\"\"Calculate the Gini coefficient of a numpy array.\"\"\"\n",
-    "    # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif\n",
-    "    # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm\n",
-    "    if np.amin(array) < 0:\n",
-    "        array -= np.amin(array) #values cannot be negative\n",
-    "    array += 0.0000001 #values cannot be 0\n",
-    "    array = np.sort(array) #values must be sorted\n",
-    "    index = np.arange(1,array.shape[0]+1) #index per array element\n",
-    "    n = array.shape[0]#number of array elements\n",
-    "    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array))) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_gini_spreadstdev(row):\n",
-    "    indices = MarkitBasketIndex(row['index'], row.series, [row.tenor], value_date = row.name)\n",
-    "    spreads = indices.spreads()\n",
-    "    spreads = spreads[spreads<1]\n",
-    "    return (gini(spreads), np.std(spreads))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
     "####################### NAV Basis\n",
     "\n",
     "#            HY              |           IG\n",
@@ -198,11 +187,25 @@
    "outputs": [],
    "source": [
     "####################### Get Gini on indices: this calc bombs a lot so let's do the ones that we were able to calc before (dropna)\n",
-    "df_gini_calc = df.dropna().loc[datetime.date(2019,1,1):, :].reset_index('dist_on_the_run')[\n",
+    "df_gini_calc_temp = df.dropna().loc[datetime.date(2019,1,1):, :].reset_index('dist_on_the_run')[\n",
+    "    ['index','series', 'tenor', 'duration', 'basis', 'closespread']]\n",
+    "temp = df_gini_calc_temp.apply(disp.get_gini_spreadstdev, axis=1)  # local def was removed from this notebook; presumably now in exploration.dispersion (disp) - confirm\n",
+    "temp = pd.DataFrame(temp.values.tolist(), columns=['gini_spread','std_spread'], index=temp.index)\n",
+    "df_gini_calc = pd.concat([df_gini_calc_temp, temp], axis=1).dropna()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "####################### Get Gini for the on-the-run series only - same calc as the previous cell; this overwrites df_gini_calc\n",
+    "df_gini_calc_temp = df.groupby(['date']).nth(-1).dropna()[\n",
     "    ['index','series', 'tenor', 'duration', 'basis', 'closespread']]\n",
-    "temp = df_gini_calc.apply(get_gini_spreadstdev, axis=1)\n",
+    "temp = df_gini_calc_temp.apply(disp.get_gini_spreadstdev, axis=1)  # local def was removed from this notebook; presumably now in exploration.dispersion (disp) - confirm\n",
     "temp = pd.DataFrame(temp.values.tolist(), columns=['gini_spread','std_spread'], index=temp.index)\n",
-    "df_gini_calc = df_gini_calc.merge(temp, left_index=True, right_index=True).dropna()"
+    "df_gini_calc = pd.concat([df_gini_calc_temp, temp], axis=1).dropna()"
    ]
   },
   {
@@ -211,8 +214,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#######################GLS regression of NAV basis to spread/duration\n",
-    "#basis_gini_model = smf.gls(\"basis ~ np.log(duration) + np.log(closespread) + np.log(gini_spread)\", data=df_gini_calc).fit()\n",
+    "####################### OLS regression of NAV basis on duration, spread and gini_spread (kept commented out for reference)\n",
+    "#basis_gini_model = smf.ols(\"basis ~ np.log(duration) + np.log(closespread) + np.log(gini_spread)\", data=df_gini_calc).fit()\n",
     "#basis_gini_model.summary()\n",
     "\n",
     "#Let's use a GAM model instead?\n",
@@ -239,7 +242,7 @@
     "## plotting\n",
     "fig, axs = plt.subplots(1,3);\n",
     "\n",
-    "titles = ['duration', 'closespread', third_variable]\n",
+    "titles = ['duration', 'closespread', 'gini_spread']\n",
     "for i, ax in enumerate(axs):\n",
     "    XX = basis_model.generate_X_grid(term=i)\n",
     "    ax.plot(XX[:, i], basis_model.partial_dependence(term=i, X=XX))\n",
@@ -278,148 +281,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#######################Dispersion: std_dev/mean of default_prob\n",
-    "date_range = pd.bdate_range(value_date - 52 * 4 * pd.offsets.Week(), value_date, freq='5B')\n",
-    "default_prob, index_spreads = {}, {}\n",
+    "## BBs: history of the on-the-run HYBB 5yr index spread\n",
+    "date_range = pd.bdate_range(end=value_date, freq='5B',periods=52*10)  # 520 dates spaced 5 business days apart, ending at value_date\n",
+    "index_spreads = {}  # keyed by date; dates where the index cannot be built are skipped\n",
+    "index_type = 'HYBB'  # NOTE: rebinds the notebook-wide index_type set near the top of the notebook\n",
     "for d in date_range:\n",
     "    try:\n",
     "        index = MarkitBasketIndex(index_type, on_the_run(index_type, d), ['5yr'], value_date =d)\n",
-    "        surv_prob, tickers = index.survival_matrix()\n",
-    "        spreads = index.spreads()\n",
-    "        spreads = spreads[spreads<1]            #filter out crazy spreads\n",
-    "        default_prob[d] = pd.Series(1 - np.ravel(surv_prob), index=tickers)\n",
-    "        index_spreads[d] = pd.Series(spreads, index=tickers)\n",
+    "        index_spreads[d] = index.spread()\n",
     "    except:\n",
     "        continue\n",
-    "default_prob = pd.concat(default_prob)\n",
-    "index_spreads = pd.concat(index_spreads)\n",
-    "dispersion = default_prob.unstack(level=0)\n",
-    "dispersion = dispersion.std()/dispersion.mean()\n",
-    "dispersion_spread = index_spreads.unstack(level=0)\n",
-    "dispersion_spread = dispersion_spread.std()/dispersion_spread.mean()\n",
-    "dispersion.plot()\n",
-    "dispersion_spread.plot()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Get Gini factor\n",
-    "sql_string = \"select * from index_version where index = %s\"\n",
-    "idx_ver = pd.read_sql_query(sql_string, serenitas_engine, params=[index_type,], parse_dates=['lastdate'])\n",
-    "idx_ver['date'] = pd.to_datetime([d.strftime('%Y-%m-%d') if not pd.isnull(d) else datetime.date(2050,1,1) for d in idx_ver['lastdate']])\n",
-    "sql_string = \"select * from risk_numbers where index = %s\"\n",
-    "risk = pd.read_sql_query(sql_string, serenitas_engine, parse_dates={'date': {'utc':True}}, params=[index_type])\n",
-    "risk.date = risk.date.dt.normalize().dt.tz_convert(None)\n",
-    "risk = risk.groupby(['date','index','series','tenor','attach']).mean()\n",
-    "risk.reset_index(inplace=True)\n",
-    "idx_ver.sort_values(by=['date'], inplace=True, ascending=True)\n",
-    "risk = pd.merge_asof(risk, idx_ver[['date','series','cumulativeloss','indexfactor']], left_on=['date'], right_on=['date'], by='series', direction='forward')\n",
-    "risk.set_index('date', inplace=True) \n",
-    "risk['moneyness'] = risk.apply(lambda df: (df.detach-df.cumulativeloss)/df.indexfactor/df.index_expected_loss, axis=1)\n",
-    "\n",
-    "date_range = pd.bdate_range(value_date - 52 * 3 * pd.offsets.Week(), value_date, freq='5B')\n",
-    "gini_calc = risk[(risk.index.isin(date_range)) & (risk.attach == 0)]\n",
-    "temp = gini_calc.apply(get_gini_spreadstdev, axis=1)\n",
-    "gini_calc[['gini_spread', 'std_spread']] = pd.DataFrame(temp.values.tolist(), columns=['gini_spread','std_spread'], index=temp.index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "to_plot_gini = gini_calc[(gini_calc.tenor == '5yr')].groupby(['date', 'series']).nth(-1)\n",
-    "to_plot_gini['gini_spread'].unstack().plot()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gini_model = smf.gls(\"corr_at_detach ~ gini_spread + duration + moneyness\", data=equity).fit()\n",
-    "gini_model.summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predict_today = equity.reset_index()[['gini_spread', 'duration', 'moneyness']].iloc[-1]\n",
-    "gini_model.predict(predict_today)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Let's use a GAM model instead?\n",
-    "#only use the 5yr point for modeling\n",
-    "equity = gini_calc[(gini_calc.tenor=='5yr') & (gini_calc.series >= 23)]\n",
-    "X = np.array(equity[['gini_spread', 'duration', 'moneyness']])\n",
-    "y = np.array(equity['corr_at_detach'])\n",
-    "\n",
-    "#Fit for Lamda\n",
-    "gam_model = GAM(s(0, n_splines=5) +\n",
-    "                  s(1, n_splines=5) +\n",
-    "                  s(2, n_splines=5))\n",
-    "lam = np.logspace(-3, 5, 5, base=3)\n",
-    "lams = [lam] * 3\n",
-    "gam_model.gridsearch(X, y, lam=lams)\n",
-    "\n",
-    "gam_model.summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## plotting\n",
-    "fig, axs = plt.subplots(1,3);\n",
-    "\n",
-    "titles = ['gini_spread', 'duration', 'moneyness']\n",
-    "for i, ax in enumerate(axs):\n",
-    "    XX = gam_model.generate_X_grid(term=i)\n",
-    "    ax.plot(XX[:, i], gam_model.partial_dependence(term=i, X=XX))\n",
-    "    ax.plot(XX[:, i], gam_model.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')\n",
-    "    if i == 0:\n",
-    "        ax.set_ylim(-30,30)\n",
-    "    ax.set_title(titles[i]);"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predict = gam_model.predict(X)\n",
-    "plt.scatter(y, predict)\n",
-    "plt.xlabel('actual correlation')\n",
-    "plt.ylabel('predicted correlation')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "today = (equity.loc[max(equity.index)])\n",
-    "predict_HY31 = gam_model.predict(np.array(today[today.series==31][['gini_spread', 'duration', 'moneyness']]))\n",
-    "today[today.series==31][['corr_at_detach']], predict_HY31"
+    "index_spreads = pd.concat(index_spreads)"
    ]
   }
  ],
@@ -439,7 +311,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.8.0"
   }
  },
  "nbformat": 4,