diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/parse_citi_pdf.py | 53 |
1 files changed, 53 insertions, 0 deletions
"""Extract collateral totals from a CITI margin-notice PDF.

Source file: python/parse_citi_pdf.py.

The PDF is converted to XML with the external ``pdftohtml`` tool; every
``<text>`` element carries ``top`` / ``left`` attributes, and the tables
are read back by selecting the elements that fall inside fixed boxes.
"""
import subprocess
from pathlib import Path

import pandas as pd

# Horizontal extents (left, right) of the label, amount and currency columns.
_LABEL_X = (70, 100)
_AMOUNT_X = (100, 500)
_CURRENCY_X = (500, 600)

# Vertical extents (top, bottom) of the variation- and initial-margin tables.
_VM_Y = (370, 500)
_IM_Y = (650, 800)


def load_pdf(file_path):
    """Return the ``<text>`` elements of a PDF, ordered by (top, left).

    Args:
        file_path: path of the PDF (``str`` or ``pathlib.Path``).

    Returns:
        A list of BeautifulSoup tags sorted by their integer ``top`` and
        then ``left`` attributes.

    Raises:
        subprocess.CalledProcessError: if ``pdftohtml`` exits with a
            non-zero status (previously this was silently ignored and an
            empty list was returned).
        FileNotFoundError: if the ``pdftohtml`` executable is not found.
    """
    # Imported here because this is the only function that needs bs4; the
    # other helpers stay importable without bs4/lxml installed.
    from bs4 import BeautifulSoup

    proc = subprocess.run(
        ["pdftohtml", "-xml", "-stdout", "-i", Path(file_path).as_posix()],
        capture_output=True,
        check=True,
    )
    soup = BeautifulSoup(proc.stdout, features="lxml")
    return sorted(
        soup.find_all("text"),
        key=lambda tag: (int(tag["top"]), int(tag["left"])),
    )


def get_col(l, top, bottom, left, right):
    """Return the text of every element of ``l`` inside a bounding box.

    An element is kept when ``left <= int(c["left"]) < right`` and
    ``top <= int(c["top"]) < bottom``; the input order is preserved.
    """
    return [
        c.text
        for c in l
        if left <= int(c["left"]) < right and top <= int(c["top"]) < bottom
    ]


def parse_num(s):
    """Convert a formatted amount such as ``"1,234.50"`` to ``float``.

    Thousands separators (``,``) are removed and a value wrapped in
    parentheses, e.g. ``"(1,234.50)"``, is returned as a negative number.
    Surrounding whitespace is ignored.

    Raises:
        ValueError: if the text is empty or is not a number.
    """
    s = s.strip().replace(",", "")
    # Require both parentheses so that malformed text such as "(123" is
    # rejected instead of silently losing its last character.
    if s.startswith("(") and s.endswith(")"):
        return -float(s[1:-1])
    return float(s)


def get_df(l, col1, col2, col3):
    """Build a table from three bounding boxes.

    Args:
        l: elements as returned by :func:`load_pdf`.
        col1: ``(top, bottom, left, right)`` box of the row labels.
        col2: box of the amounts.
        col3: box of the currencies.

    Returns:
        A DataFrame indexed by the left-stripped labels with a float
        ``amount`` column and a ``currency`` column.
    """
    df = pd.DataFrame(
        {"amount": get_col(l, *col2), "currency": get_col(l, *col3)},
        index=get_col(l, *col1),
    )
    df["amount"] = df["amount"].apply(parse_num)
    df.index = df.index.str.lstrip()
    return df


def _margin_table(texts, rows):
    """Read the label/amount/currency table located in the ``rows`` band."""
    return get_df(
        texts,
        (*rows, *_LABEL_X),
        (*rows, *_AMOUNT_X),
        (*rows, *_CURRENCY_X),
    )


def get_citi_collateral(d, daily_dir=None):
    """Return the total collateral reported in the CITI margin notice.

    The result is the sum of the ``"VM Total Collateral"`` amount of the
    variation-margin table and the ``"Non Reg IM Total Collateral"``
    amount of the initial-margin table.

    Args:
        d: date of the report; currently only used in the error message.
        daily_dir: directory containing the ``CITI_reports`` folder.
            Defaults to the module-level ``DAILY_DIR``.

    Raises:
        FileNotFoundError: if no margin-notice PDF is found.
    """
    if daily_dir is None:
        # NOTE(review): DAILY_DIR is neither defined nor imported in this
        # file, so this fallback raises NameError unless it is provided
        # elsewhere -- confirm where it should come from.
        daily_dir = DAILY_DIR
    report_dir = Path(daily_dir) / "CITI_reports"
    # NOTE(review): the pattern does not contain the date, so the first
    # file yielded by glob is used regardless of ``d`` -- confirm whether
    # the date was meant to be embedded before ".pdf".
    try:
        fname = next(report_dir.glob("262966_MarginNotice_*_.pdf"))
    except StopIteration:
        raise FileNotFoundError(
            f"CITI file not found for date {d}"
        ) from None
    texts = load_pdf(fname)
    variation_margin = _margin_table(texts, _VM_Y)
    initial_margin = _margin_table(texts, _IM_Y)
    return (
        variation_margin.loc["VM Total Collateral", "amount"]
        + initial_margin.loc["Non Reg IM Total Collateral", "amount"]
    )
