aboutsummaryrefslogtreecommitdiffstats
path: root/python/parse_html.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/parse_html.py')
-rw-r--r--python/parse_html.py131
1 files changed, 131 insertions, 0 deletions
diff --git a/python/parse_html.py b/python/parse_html.py
new file mode 100644
index 00000000..ccba0726
--- /dev/null
+++ b/python/parse_html.py
@@ -0,0 +1,131 @@
+import pandas as pd
+import requests
+import time
+from bs4 import BeautifulSoup
+from utils.db import dbconn
+from streamz import Stream
+
+
+def combine_df(state, new):
+ state = state.append(df)
+ state = state.drop_duplicates(["Dissemination Id"])
+ return state
+
+
+def event_loop(conn):
+ url = "https://kgc0418-tdw-data-0.s3.amazonaws.com/prices/CREDITS_PRICE_DETAIL.HTML"
+ df_indic = get_index_indicative(conn)
+ acc = pd.DataFrame()
+ i = 0
+ old_ts = None
+ while i < 10000:
+ i += 1
+ r = requests.get(url)
+ if r.status_code == 200:
+ soup = BeautifulSoup(r.content, features="lxml")
+ df = parse_soup(soup)
+ acc = acc.append(df)
+ acc = acc.drop_duplicates(["Dissemination Id"])
+ # df_options = df[df["UPI/Taxonomy"].str.startswith("Credit:Swaptions")]
+ df_index = acc[acc["UPI/Taxonomy"].str.startswith("Credit:Index")]
+ df_index = parse_index(df_index, df_indic)
+ last_trade = df_index.xs(("IG", 32, 1, "5yr"))
+ last_ts = last_trade.index[-1]
+ if old_ts is None or last_ts > old_ts:
+ print(last_ts, last_trade["Price Notation"].iloc[-1])
+ old_ts = last_ts
+ time.sleep(0.5)
+ return acc
+
+
+def get_index_indicative(conn):
+ return pd.read_sql_query(
+ "SELECT redindexcode, index, series, version, tenor, maturity FROM index_desc",
+ conn,
+ parse_dates=["maturity"],
+ )
+
+
+def parse_soup(soup):
+ table = soup.find_all("table")[1]
+ rows = iter(table.find_all("tr"))
+ header = [th.text for th in next(rows).find_all("th")]
+ rows = [[td.text.strip() or None for td in r.find_all("td")] for r in rows]
+ df = pd.DataFrame(rows, columns=header)
+ df["Dissemination Id"] = df["Dissemination Id"].astype("int")
+ df["Original Dissemination Id"] = (
+ df["Original Dissemination Id"].astype("float").astype("Int64")
+ )
+ for col in ["Execution Timestamp", "End Date"]:
+ df[col] = pd.to_datetime(df[col])
+ df["Execution Timestamp"] = (
+ df["Execution Timestamp"]
+ .dt.tz_localize("utc")
+ .dt.tz_convert("America/New_York")
+ )
+ df["Underlying Asset 1"] = df["Underlying Asset 1"].str.rsplit(
+ ":", n=1, expand=True
+ )[1]
+ return df
+
+
+def parse_index(df, df_indic):
+ df = df.rename(
+ columns={
+ "Execution Timestamp": "trade_timestamp",
+ "Underlying Asset 1": "redindexcode",
+ "End Date": "maturity",
+ }
+ )
+ df = df.merge(df_indic, on=["redindexcode", "maturity"])
+ df = df.set_index(
+ ["index", "series", "version", "tenor", "trade_timestamp"]
+ ).sort_index()
+ df["Price Notation"] = pd.to_numeric(df["Price Notation"])
+ return df
+
+
+def parse_options(df, df_indic):
+ for col in [
+ "Action",
+ "Cleared or Uncleared",
+ "Price Notation Type",
+ "Option Type",
+ "Option Currency",
+ "Day Count Convention",
+ ]:
+ df[col] = df[col].astype("category")
+
+ for col in ["Option Premium"]:
+ df[col] = pd.to_numeric(df[col].str.replace(",", ""))
+ for col in ["Option Strike Price", "Price Notation"]:
+ df[col] = df[col].astype("float")
+ df = df.rename(
+ columns={
+ "Option Strike Price": "strike",
+ "Option Expiration Date": "expiration_date",
+ "Underlying Asset 1": "redindexcode",
+ "Rounded Notional Amount1": "notional",
+ "Option Premium": "premium",
+ "Option Type": "option_type",
+ "Price Notation": "price",
+ "Execution Timestamp": "trade_timestamp",
+ }
+ )
+ df.strike = df.strike.where(df.strike < 1000, df.strike / 100).where(
+ df.strike > 10, df.strike * 100
+ )
+ df.price = (df.price * 1e2).where(
+ df["Price Notation Type"] == "Percentage", df.price
+ )
+ df = df.merge(df_indic, on="redindexcode")
+ df = df.set_index(["index", "series", "version", "trade_timestamp"]).sort_index()
+ return df[
+ ["expiration_date", "notional", "strike", "option_type", "premium", "price"]
+ ]
+
+
+if __name__ == "__main__":
+ conn = dbconn("serenitasdb")
+ df = event_loop(conn)
+ conn.close()