import pandas as pd from db import dbengine import matplotlib import numpy as np import matplotlib.pyplot as plt import statsmodels.api as sm from statsmodels.formula.api import gls import seaborn as sb df = pd.read_sql_table('external_marks_mapped', dbengine('dawndb'), parse_dates=['date']) df = df[df.source.notnull()] df_wide = (pd.pivot_table(df, 'mark', ['identifier', 'date'], 'source'). reset_index(). sort_values('date')) df_trades = {} df_trades['sell'] = pd.read_sql_query("select trade_date, identifier, faceamount, principal_payment, price " \ "from bonds where buysell = 'f'" , dbengine('dawndb'), parse_dates=['trade_date']) df_trades['buy'] = pd.read_sql_query("select trade_date, identifier, faceamount, principal_payment, price " \ "from bonds where buysell = 't'" , dbengine('dawndb'), parse_dates=['trade_date']) for direction, trades in df_trades.items(): df_trades[direction] = trades.sort_values('trade_date') df_sell_wide = pd.merge_asof(df_trades['sell'], df_wide, left_on='trade_date', right_on='date', by='identifier') df_wide_temp = df_wide.set_index('date').shift(periods = -1, freq= 'M', axis = 1).reset_index() df_buy_wide = pd.merge_asof(df_trades['buy'], df_wide_temp , left_on='trade_date', right_on='date', by='identifier') d_sell = {} d_buy = {} df_long = {} for source, g in df.groupby('source'): d_sell[source] = pd.merge_asof(df_trades['sell'], g, left_on='trade_date', right_on='date', by='identifier') d_sell[source]['dir'] = 'sell' d_buy[source] = pd.merge_asof(df_trades['buy'], g.set_index('date').shift(periods = -1, freq= 'BM', axis = 1).reset_index(), left_on='trade_date', right_on='date', by='identifier') d_buy[source]['dir'] = 'buy' #add a column saying it is buy or sell df_long = pd.concat(d_sell, join='inner') df_long = df_long.append(pd.concat(d_buy, join='inner')) df_long = df_long.reset_index().dropna(subset=['price','mark']) df_long['difference'] = (df_long['price'] - df_long['mark'])/df_long['mark'] #filtering diff_threshold = 5 begindate = '2013-01-01' enddate = '2018-01-01' df_long = df_long[df_long.identifier.str.len() == 9] df_long = df_long[(df_long['trade_date'] > begindate) & (df_long['trade_date'] < enddate)] #difference['Std Dev'] = df_long[df_long