import subprocess

import pandas as pd
from bs4 import BeautifulSoup

from env import DAILY_DIR

# Horizontal [left, right) bounds, in pdftohtml coordinate units, of the three
# columns read from each margin table: row label, amount, currency.
_LABEL_X = (70, 100)
_AMOUNT_X = (100, 500)
_CURRENCY_X = (500, 600)


def load_pdf(file_path):
    """Convert a PDF to XML with ``pdftohtml`` and return its text elements.

    Args:
        file_path: path object exposing ``as_posix()`` (e.g. ``pathlib.Path``).

    Returns:
        A list of the ``<text>`` tags found in the converter output, sorted by
        ascending integer ``top`` attribute and then ascending ``left``.
    """
    proc = subprocess.run(
        ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
        capture_output=True,
    )
    soup = BeautifulSoup(proc.stdout, features="lxml")
    texts = soup.find_all("text")
    return sorted(texts, key=lambda x: (int(x["top"]), int(x["left"])))


def get_col(l, top, bottom, left, right):
    """Return the text of every element whose origin lies in a rectangle.

    An element is kept when ``left <= int(c["left"]) < right`` and
    ``top <= int(c["top"]) < bottom``; the order of ``l`` is preserved.

    Args:
        l: iterable of elements supporting ``c["left"]``, ``c["top"]`` and
            ``c.text`` (such as the tags returned by ``load_pdf``).
        top, bottom: vertical bounds (``top`` inclusive, ``bottom`` exclusive).
        left, right: horizontal bounds (``left`` inclusive, ``right`` exclusive).

    Returns:
        A list of the ``text`` values of the matching elements.
    """
    return [
        c.text
        for c in l
        if left <= int(c["left"]) < right and top <= int(c["top"]) < bottom
    ]


def parse_num(s):
    """Parse a formatted number string into a float.

    Commas are removed and surrounding whitespace is ignored.  A value fully
    wrapped in parentheses, e.g. ``"(1,234.50)"``, is returned negated.

    Args:
        s: the string to parse.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: if the cleaned string is not a valid number (including an
            empty string or unbalanced parentheses).
    """
    s = s.replace(",", "").strip()
    # Require both parentheses so a malformed "(123" is rejected by float()
    # instead of silently losing its last digit to the [1:-1] slice.
    if s.startswith("(") and s.endswith(")"):
        return -float(s[1:-1])
    return float(s)


def get_df(l, col1, col2, col3):
    """Build a DataFrame from three rectangular regions of text elements.

    Each ``col*`` argument is a ``(top, bottom, left, right)`` tuple passed to
    ``get_col``.

    Args:
        l: text elements, as accepted by ``get_col``.
        col1: region whose texts become the index (leading whitespace removed).
        col2: region whose texts become the ``amount`` column, converted with
            ``parse_num``.
        col3: region whose texts become the ``currency`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``amount`` and ``currency``.
    """
    df = pd.DataFrame(
        {"amount": get_col(l, *col2), "currency": get_col(l, *col3)},
        index=get_col(l, *col1),
    )
    df["amount"] = df["amount"].apply(parse_num)
    df.index = df.index.str.lstrip()
    return df


def _margin_table(texts, top, bottom):
    """Read the label/amount/currency table lying between ``top`` and ``bottom``."""
    return get_df(
        texts,
        (top, bottom, *_LABEL_X),
        (top, bottom, *_AMOUNT_X),
        (top, bottom, *_CURRENCY_X),
    )


def get_citi_collateral(d):
    """Return the total collateral read from the CITI margin notice for ``d``.

    Looks in ``DAILY_DIR / "CITI_reports"`` for a file matching
    ``262966_MarginNotice_<YYYYMMDD>_*.pdf`` and uses the first match yielded
    by ``glob``.  The result is the ``amount`` of the "VM Total Collateral"
    row plus the ``amount`` of the "Non Reg IM Total Collateral" row.

    Args:
        d: object providing ``strftime`` and ``date()`` (e.g. a
            ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns:
        The sum of the two collateral amounts.

    Raises:
        FileNotFoundError: if no margin notice exists for ``d``.
        ValueError: if the "Non Regulatory Initial Margin" heading is not
            present in the document.
        KeyError: if either total row is missing from its table.
    """
    pattern = f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf"
    fname = next((DAILY_DIR / "CITI_reports").glob(pattern), None)
    if fname is None:
        raise FileNotFoundError(f"CITI file not found for date {d.date()}")

    texts = load_pdf(fname)

    # The variation margin table is read from a fixed vertical band.
    variation_margin = _margin_table(texts, 370, 500)

    # The initial margin table is located relative to its heading element.
    anchor = next(
        (c for c in texts if c.text == "Non Regulatory Initial Margin"), None
    )
    if anchor is None:
        raise ValueError(
            f"'Non Regulatory Initial Margin' section not found in {fname}"
        )
    top = int(anchor["top"]) + 10
    initial_margin = _margin_table(texts, top, top + 150)

    return (
        variation_margin.loc["VM Total Collateral", "amount"]
        + initial_margin.loc["Non Reg IM Total Collateral", "amount"]
    )