diff options
Diffstat (limited to 'python/parse_citi_pdf.py')
| -rw-r--r-- | python/parse_citi_pdf.py | 43 |
1 files changed, 30 insertions, 13 deletions
diff --git a/python/parse_citi_pdf.py b/python/parse_citi_pdf.py index 71bdc4c8..39a33f5a 100644 --- a/python/parse_citi_pdf.py +++ b/python/parse_citi_pdf.py @@ -3,19 +3,28 @@ import subprocess from bs4 import BeautifulSoup from env import DAILY_DIR + def load_pdf(file_path): - proc = subprocess.run(["pdftohtml", "-xml", "-stdout", "-i", - file_path.as_posix()], - capture_output=True) + proc = subprocess.run( + ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()], + capture_output=True, + ) soup = BeautifulSoup(proc.stdout, features="lxml") l = soup.findAll("text") l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"]))) return l + def get_col(l, top, bottom, left, right): - return [c.text for c in l if int(c["left"]) >= left and \ - int(c["left"]) < right and \ - int(c["top"]) >= top and int(c["top"]) < bottom ] + return [ + c.text + for c in l + if int(c["left"]) >= left + and int(c["left"]) < right + and int(c["top"]) >= top + and int(c["top"]) < bottom + ] + def parse_num(s): s = s.replace(",", "") @@ -24,18 +33,24 @@ def parse_num(s): else: return float(s) + def get_df(l, col1, col2, col3): - df = pd.DataFrame({"amount": get_col(l, *col2), - "currency": get_col(l, *col3)}, - index=get_col(l, *col1)) + df = pd.DataFrame( + {"amount": get_col(l, *col2), "currency": get_col(l, *col3)}, + index=get_col(l, *col1), + ) df.amount = df.amount.apply(parse_num) df.index = df.index.str.lstrip() return df + def get_citi_collateral(d): try: - fname = next((DAILY_DIR / "CITI_reports"). - glob(f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf")) + fname = next( + (DAILY_DIR / "CITI_reports").glob( + f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf" + ) + ) except StopIteration: raise FileNotFoundError(f"CITI file not found for date {d.date()}") l = load_pdf(fname) @@ -51,5 +66,7 @@ def get_citi_collateral(d): col2 = (top, bottom, 100, 500) col3 = (top, bottom, 500, 600) initial_margin = get_df(l, col1, col2, col3) - return variation_margin.loc["VM Total Collateral", "amount"] + \ - initial_margin.loc["Non Reg IM Total Collateral", "amount"] + return ( + variation_margin.loc["VM Total Collateral", "amount"] + + initial_margin.loc["Non Reg IM Total Collateral", "amount"] + ) |
