diff options
Diffstat (limited to 'python/parse_html.py')
| -rw-r--r-- | python/parse_html.py | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/python/parse_html.py b/python/parse_html.py new file mode 100644 index 00000000..ccba0726 --- /dev/null +++ b/python/parse_html.py @@ -0,0 +1,131 @@ +import pandas as pd +import requests +import time +from bs4 import BeautifulSoup +from utils.db import dbconn +from streamz import Stream + + +def combine_df(state, new): + state = state.append(df) + state = state.drop_duplicates(["Dissemination Id"]) + return state + + +def event_loop(conn): + url = "https://kgc0418-tdw-data-0.s3.amazonaws.com/prices/CREDITS_PRICE_DETAIL.HTML" + df_indic = get_index_indicative(conn) + acc = pd.DataFrame() + i = 0 + old_ts = None + while i < 10000: + i += 1 + r = requests.get(url) + if r.status_code == 200: + soup = BeautifulSoup(r.content, features="lxml") + df = parse_soup(soup) + acc = acc.append(df) + acc = acc.drop_duplicates(["Dissemination Id"]) + # df_options = df[df["UPI/Taxonomy"].str.startswith("Credit:Swaptions")] + df_index = acc[acc["UPI/Taxonomy"].str.startswith("Credit:Index")] + df_index = parse_index(df_index, df_indic) + last_trade = df_index.xs(("IG", 32, 1, "5yr")) + last_ts = last_trade.index[-1] + if old_ts is None or last_ts > old_ts: + print(last_ts, last_trade["Price Notation"].iloc[-1]) + old_ts = last_ts + time.sleep(0.5) + return acc + + +def get_index_indicative(conn): + return pd.read_sql_query( + "SELECT redindexcode, index, series, version, tenor, maturity FROM index_desc", + conn, + parse_dates=["maturity"], + ) + + +def parse_soup(soup): + table = soup.find_all("table")[1] + rows = iter(table.find_all("tr")) + header = [th.text for th in next(rows).find_all("th")] + rows = [[td.text.strip() or None for td in r.find_all("td")] for r in rows] + df = pd.DataFrame(rows, columns=header) + df["Dissemination Id"] = df["Dissemination Id"].astype("int") + df["Original Dissemination Id"] = ( + df["Original Dissemination Id"].astype("float").astype("Int64") + ) + for col in ["Execution Timestamp", "End Date"]: + df[col] = pd.to_datetime(df[col]) + df["Execution Timestamp"] = ( + df["Execution Timestamp"] + .dt.tz_localize("utc") + .dt.tz_convert("America/New_York") + ) + df["Underlying Asset 1"] = df["Underlying Asset 1"].str.rsplit( + ":", n=1, expand=True + )[1] + return df + + +def parse_index(df, df_indic): + df = df.rename( + columns={ + "Execution Timestamp": "trade_timestamp", + "Underlying Asset 1": "redindexcode", + "End Date": "maturity", + } + ) + df = df.merge(df_indic, on=["redindexcode", "maturity"]) + df = df.set_index( + ["index", "series", "version", "tenor", "trade_timestamp"] + ).sort_index() + df["Price Notation"] = pd.to_numeric(df["Price Notation"]) + return df + + +def parse_options(df, df_indic): + for col in [ + "Action", + "Cleared or Uncleared", + "Price Notation Type", + "Option Type", + "Option Currency", + "Day Count Convention", + ]: + df[col] = df[col].astype("category") + + for col in ["Option Premium"]: + df[col] = pd.to_numeric(df[col].str.replace(",", "")) + for col in ["Option Strike Price", "Price Notation"]: + df[col] = df[col].astype("float") + df = df.rename( + columns={ + "Option Strike Price": "strike", + "Option Expiration Date": "expiration_date", + "Underlying Asset 1": "redindexcode", + "Rounded Notional Amount1": "notional", + "Option Premium": "premium", + "Option Type": "option_type", + "Price Notation": "price", + "Execution Timestamp": "trade_timestamp", + } + ) + df.strike = df.strike.where(df.strike < 1000, df.strike / 100).where( + df.strike > 10, df.strike * 100 + ) + df.price = (df.price * 1e2).where( + df["Price Notation Type"] == "Percentage", df.price + ) + df = df.merge(df_indic, on="redindexcode") + df = df.set_index(["index", "series", "version", "trade_timestamp"]).sort_index() + return df[ + ["expiration_date", "notional", "strike", "option_type", "premium", "price"] + ] + + +if __name__ == "__main__": + conn = dbconn("serenitasdb") + df = event_loop(conn) + conn.close() |
