aboutsummaryrefslogtreecommitdiffstats
path: root/python/parse_citi_pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/parse_citi_pdf.py')
-rw-r--r--python/parse_citi_pdf.py72
1 files changed, 0 insertions, 72 deletions
diff --git a/python/parse_citi_pdf.py b/python/parse_citi_pdf.py
deleted file mode 100644
index def5bc67..00000000
--- a/python/parse_citi_pdf.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pandas as pd
-import subprocess
-from bs4 import BeautifulSoup
-from env import DAILY_DIR
-
-
def load_pdf(file_path):
    """Extract the positioned text nodes of a PDF using ``pdftohtml -xml``.

    Args:
        file_path: Path object for the PDF; it must provide ``as_posix()``.

    Returns:
        A list of the ``<text>`` elements found in the XML output, sorted by
        their integer ``top`` attribute and then their integer ``left``
        attribute.

    Raises:
        subprocess.CalledProcessError: If ``pdftohtml`` exits with a non-zero
            status. Without this check a failed conversion would be parsed
            as an empty document and only surface later as a confusing
            lookup error.
    """
    proc = subprocess.run(
        ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
        capture_output=True,
        check=True,
    )
    soup = BeautifulSoup(proc.stdout, features="lxml")
    # Sort top-to-bottom, then left-to-right, so that the columns extracted
    # later line up row by row.
    return sorted(
        soup.find_all("text"),
        key=lambda node: (int(node["top"]), int(node["left"])),
    )
-
-
def get_col(l, top, bottom, left, right):
    """Return the text of every node whose position lies inside a box.

    A node is kept when ``left <= int(node["left"]) < right`` and
    ``top <= int(node["top"]) < bottom``; both upper bounds are exclusive.
    The input order of ``l`` is preserved in the result.
    """
    selected = []
    for node in l:
        # Horizontal test first: nodes outside the column never have their
        # "top" attribute inspected.
        if not left <= int(node["left"]) < right:
            continue
        if top <= int(node["top"]) < bottom:
            selected.append(node.text)
    return selected
-
-
def parse_num(s):
    """Parse a formatted number string into a float.

    Thousands separators (",") are removed and surrounding whitespace is
    ignored. A value fully wrapped in parentheses, e.g. ``"(1,234.50)"``,
    is treated as negative.

    Args:
        s: The text to parse.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: If the text is empty or is not a valid number. This
            includes unbalanced parentheses such as ``"(123"``, which are
            rejected rather than silently losing their last character.
    """
    s = s.replace(",", "").strip()
    # Require both parentheses so a malformed value cannot be truncated
    # into a plausible-looking but wrong number.
    if s.startswith("(") and s.endswith(")"):
        return -float(s[1:-1])
    return float(s)
-
-
def get_df(l, col1, col2, col3):
    """Build a table of amounts and currencies from three column boxes.

    Each of ``col1``, ``col2`` and ``col3`` is unpacked into ``get_col`` as
    its ``(top, bottom, left, right)`` arguments. ``col1`` supplies the row
    labels, ``col2`` the amounts and ``col3`` the currencies.

    Returns:
        A DataFrame with columns ``amount`` (converted with ``parse_num``)
        and ``currency``, indexed by the row labels with leading whitespace
        removed.
    """
    amounts = get_col(l, *col2)
    currencies = get_col(l, *col3)
    labels = get_col(l, *col1)
    frame = pd.DataFrame(
        {"amount": amounts, "currency": currencies},
        index=labels,
    )
    frame.amount = frame.amount.apply(parse_num)
    frame.index = frame.index.str.lstrip()
    return frame
-
-
def get_citi_collateral(d):
    """Return the total collateral from the CITI margin notice for date ``d``.

    Looks in ``DAILY_DIR / "CITI_reports"`` for the first file matching
    ``262966_MarginNotice_<YYYYMMDD>_*.pdf``, parses it, and adds the
    "VM Total Collateral" amount to the "Non Reg IM Total Collateral" amount.

    Args:
        d: The report date; it must provide ``strftime()`` and ``date()``.

    Returns:
        The sum of the two collateral amounts.

    Raises:
        FileNotFoundError: If no margin notice exists for ``d``.
        ValueError: If the "Non Regulatory Initial Margin" heading is not
            present in the parsed PDF.
        KeyError: If either total-collateral row is missing from its table.
    """
    pattern = f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf"
    fname = next((DAILY_DIR / "CITI_reports").glob(pattern), None)
    if fname is None:
        raise FileNotFoundError(f"CITI file not found for date {d.date()}")
    l = load_pdf(fname)

    # Boxes are (top, bottom, left, right), expressed in the same units as
    # the "top"/"left" attributes of the parsed text nodes. The variation
    # margin table is read from a fixed vertical band.
    vm_top, vm_bottom = 370, 500
    variation_margin = get_df(
        l,
        (vm_top, vm_bottom, 70, 100),
        (vm_top, vm_bottom, 100, 500),
        (vm_top, vm_bottom, 500, 600),
    )

    # The initial margin table is located relative to its heading. Use a
    # default with next() so a missing heading produces a clear error
    # instead of leaking a bare StopIteration to the caller.
    anchor = next(
        (c for c in l if c.text == "Non Regulatory Initial Margin"), None
    )
    if anchor is None:
        raise ValueError(
            f"'Non Regulatory Initial Margin' section not found in {fname}"
        )
    im_top = int(anchor["top"]) + 10
    im_bottom = im_top + 150
    initial_margin = get_df(
        l,
        (im_top, im_bottom, 70, 100),
        (im_top, im_bottom, 100, 505),
        (im_top, im_bottom, 505, 600),
    )
    return (
        variation_margin.loc["VM Total Collateral", "amount"]
        + initial_margin.loc["Non Reg IM Total Collateral", "amount"]
    )