about summary refs log tree commit diff stats
path: root/python/parse_citi_pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/parse_citi_pdf.py')
-rw-r--r-- python/parse_citi_pdf.py 43
1 files changed, 30 insertions, 13 deletions
diff --git a/python/parse_citi_pdf.py b/python/parse_citi_pdf.py
index 71bdc4c8..39a33f5a 100644
--- a/python/parse_citi_pdf.py
+++ b/python/parse_citi_pdf.py
@@ -3,19 +3,28 @@ import subprocess
from bs4 import BeautifulSoup
from env import DAILY_DIR
+
def load_pdf(file_path):
- proc = subprocess.run(["pdftohtml", "-xml", "-stdout", "-i",
- file_path.as_posix()],
- capture_output=True)
+ proc = subprocess.run(
+ ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
+ capture_output=True,
+ )
soup = BeautifulSoup(proc.stdout, features="lxml")
l = soup.findAll("text")
l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"])))
return l
+
def get_col(l, top, bottom, left, right):
- return [c.text for c in l if int(c["left"]) >= left and \
- int(c["left"]) < right and \
- int(c["top"]) >= top and int(c["top"]) < bottom ]
+ return [
+ c.text
+ for c in l
+ if int(c["left"]) >= left
+ and int(c["left"]) < right
+ and int(c["top"]) >= top
+ and int(c["top"]) < bottom
+ ]
+
def parse_num(s):
s = s.replace(",", "")
@@ -24,18 +33,24 @@ def parse_num(s):
else:
return float(s)
+
def get_df(l, col1, col2, col3):
- df = pd.DataFrame({"amount": get_col(l, *col2),
- "currency": get_col(l, *col3)},
- index=get_col(l, *col1))
+ df = pd.DataFrame(
+ {"amount": get_col(l, *col2), "currency": get_col(l, *col3)},
+ index=get_col(l, *col1),
+ )
df.amount = df.amount.apply(parse_num)
df.index = df.index.str.lstrip()
return df
+
def get_citi_collateral(d):
try:
- fname = next((DAILY_DIR / "CITI_reports").
- glob(f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf"))
+ fname = next(
+ (DAILY_DIR / "CITI_reports").glob(
+ f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf"
+ )
+ )
except StopIteration:
raise FileNotFoundError(f"CITI file not found for date {d.date()}")
l = load_pdf(fname)
@@ -51,5 +66,7 @@ def get_citi_collateral(d):
col2 = (top, bottom, 100, 500)
col3 = (top, bottom, 500, 600)
initial_margin = get_df(l, col1, col2, col3)
- return variation_margin.loc["VM Total Collateral", "amount"] + \
- initial_margin.loc["Non Reg IM Total Collateral", "amount"]
+ return (
+ variation_margin.loc["VM Total Collateral", "amount"]
+ + initial_margin.loc["Non Reg IM Total Collateral", "amount"]
+ )