diff options
Diffstat (limited to 'python/parse_citi_pdf.py')
| -rw-r--r-- | python/parse_citi_pdf.py | 72 |
1 files changed, 0 insertions, 72 deletions
"""Extract collateral totals from Citi margin-notice PDFs.

The PDF is converted to XML with the ``pdftohtml`` command-line tool; the
resulting ``<text>`` elements are then selected by their ``top``/``left``
attributes to rebuild the label / amount / currency columns of each table.
"""
import subprocess

import pandas as pd
from bs4 import BeautifulSoup

from env import DAILY_DIR


def load_pdf(file_path):
    """Convert a PDF to XML and return its ``<text>`` elements in reading order.

    Args:
        file_path: path object exposing ``as_posix()`` that points at the PDF.

    Returns:
        List of BeautifulSoup ``text`` tags sorted by ``(top, left)``.

    Raises:
        subprocess.CalledProcessError: if ``pdftohtml`` exits with a non-zero
            status (previously the failure was ignored and surfaced later as
            an unrelated lookup error on an empty document).
    """
    proc = subprocess.run(
        ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
        capture_output=True,
        check=True,
    )
    soup = BeautifulSoup(proc.stdout, features="lxml")
    # Sort top-to-bottom, then left-to-right, so that cells of the same table
    # row come out in a stable order regardless of how they were emitted.
    return sorted(
        soup.find_all("text"),
        key=lambda tag: (int(tag["top"]), int(tag["left"])),
    )


def get_col(l, top, bottom, left, right):
    """Return the text of every element whose origin lies inside a box.

    Both ranges are half-open: ``top <= tag.top < bottom`` and
    ``left <= tag.left < right``.

    Args:
        l: iterable of tags exposing ``top``/``left`` attributes and ``.text``
            (parameter name kept for backward compatibility).
        top, bottom: vertical bounds of the box.
        left, right: horizontal bounds of the box.

    Returns:
        List of strings, in the iteration order of ``l``.
    """
    return [
        tag.text
        for tag in l
        if left <= int(tag["left"]) < right and top <= int(tag["top"]) < bottom
    ]


def parse_num(s):
    """Parse a formatted amount such as ``"1,234.50"`` or ``"(1,234.50)"``.

    Thousands separators are removed and a value wrapped in parentheses is
    returned as a negative number. Surrounding whitespace is ignored.

    Args:
        s: the textual amount.

    Returns:
        The amount as a ``float``.

    Raises:
        ValueError: if the text is empty, is not a number, or has an opening
            parenthesis without the matching closing one.
    """
    cleaned = s.replace(",", "").strip()
    if cleaned.startswith("("):
        # Require the closing parenthesis: blindly dropping the last character
        # would silently truncate a digit from input like "(123".
        if not cleaned.endswith(")"):
            raise ValueError(f"unbalanced parentheses in amount: {s!r}")
        return -float(cleaned[1:-1])
    return float(cleaned)


def get_df(l, col1, col2, col3):
    """Build an amount/currency table from three column boxes.

    Args:
        l: tags as returned by ``load_pdf``.
        col1: ``(top, bottom, left, right)`` box holding the row labels.
        col2: box holding the amounts.
        col3: box holding the currencies.

    Returns:
        DataFrame indexed by the left-stripped row labels, with a float
        ``amount`` column and a ``currency`` column.
    """
    df = pd.DataFrame(
        {"amount": get_col(l, *col2), "currency": get_col(l, *col3)},
        index=get_col(l, *col1),
    )
    df.amount = df.amount.apply(parse_num)
    df.index = df.index.str.lstrip()
    return df


def get_citi_collateral(d):
    """Return total collateral (variation + initial margin) for a date.

    Looks for ``262966_MarginNotice_<YYYYMMDD>_*.pdf`` under
    ``DAILY_DIR / "CITI_reports"`` and uses the first match.

    Args:
        d: date-like object supporting ``strftime``.

    Returns:
        Sum of the ``"VM Total Collateral"`` and
        ``"Non Reg IM Total Collateral"`` amounts.

    Raises:
        FileNotFoundError: if no margin notice exists for ``d``.
        ValueError: if the "Non Regulatory Initial Margin" heading is missing
            from the document.
    """
    pattern = f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf"
    fname = next((DAILY_DIR / "CITI_reports").glob(pattern), None)
    if fname is None:
        # strftime (rather than d.date()) so plain ``date`` objects work too.
        raise FileNotFoundError(
            f"CITI file not found for date {d.strftime('%Y-%m-%d')}"
        )
    tags = load_pdf(fname)

    # The first table is read from a fixed vertical band.
    # NOTE(review): 370-500 is presumably specific to the current notice
    # layout; confirm if the template changes.
    variation_margin = get_df(
        tags,
        (370, 500, 70, 100),
        (370, 500, 100, 500),
        (370, 500, 500, 600),
    )

    anchor = next(
        (tag for tag in tags if tag.text == "Non Regulatory Initial Margin"),
        None,
    )
    if anchor is None:
        # Avoid leaking a bare StopIteration to callers.
        raise ValueError(
            f"'Non Regulatory Initial Margin' section not found in {fname}"
        )
    # The second table is located relative to its heading.
    top = int(anchor["top"]) + 10
    bottom = top + 150
    initial_margin = get_df(
        tags,
        (top, bottom, 70, 100),
        (top, bottom, 100, 505),
        (top, bottom, 505, 600),
    )
    return (
        variation_margin.loc["VM Total Collateral", "amount"]
        + initial_margin.loc["Non Reg IM Total Collateral", "amount"]
    )
