import pandas as pd from db import dbengine import matplotlib import numpy as np import matplotlib.pyplot as plt import statsmodels.api as sm from statsmodels.formula.api import gls df = (pd.read_sql_table('external_marks_mapped',dbengine('dawndb'), parse_dates=['date']). sort_values('date')) df1 = pd.read_sql_query("select trade_date, cusip as identifier, principal_payment, price from bonds where buysell = 'f'" ,dbengine('dawndb'), parse_dates=[ 'trade_date' ]) df1 = df1.sort_values('trade_date') d = {} for source, g in df.groupby('source'): d[source] = pd.merge_asof(df1, g, left_on='trade_date', right_on= 'date', by='identifier') df2 = pd.concat(d, join='inner') df3 = df2.dropna(axis=0, subset=['mark']) df3 = df3.reset_index(level=1, drop=True) df4 = df3.set_index(['trade_date','identifier', 'source', 'principal_payment']) df4 = df4.groupby(level=['trade_date', 'identifier','source', 'principal_payment']).mean() df4 = df4.set_index('price', append=True) df5 = df4.unstack(-3) df5.columns = df5.columns.droplevel() #df6 = df5.reset_index(col_level=1, drop=True) df6 = df5.sortlevel(0, ascending = False) df6 = df6.reset_index() r2 = {} stderr = {} parameters = {} #for g in df5['mark']: for col in df6.columns[4:]: regmodel = gls(formula= "df6['price'] ~df6[col]", data = df6).fit() r2[col] = regmodel.rsquared stderr[col] = regmodel.bse parameters[col] = regmodel.params r2 = pd.Series(r2, name= 'R2') r2.index.name = 'Source' r2.reset_index r2 = pd.DataFrame(r2) stderr = pd.DataFrame(stderr) parameters = pd.DataFrame(parameters) df7 = df6 for col in df7.columns[4:]: df7[col] = (df7.price - df7[col])/df7[col] filter = 2 begindate = '2015-01-01' enddate = '2017-01-01' df8 = df7[(df7['trade_date']>begindate) & (df7['trade_date']