# python/parse_citi_pdf.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

import pandas as pd
import subprocess
from bs4 import BeautifulSoup
from env import DAILY_DIR


def load_pdf(file_path):
    """Convert a PDF into a position-sorted list of its text elements.

    Runs ``pdftohtml -xml -stdout -i`` on *file_path* and parses the XML
    written to stdout.

    Args:
        file_path: Path object exposing ``as_posix()`` (e.g. a
            ``pathlib.Path``) that points at the PDF.

    Returns:
        The ``<text>`` elements of the converted document, sorted by their
        integer ``top`` attribute and then by their ``left`` attribute.

    Raises:
        subprocess.CalledProcessError: If ``pdftohtml`` exits with a
            non-zero status.
    """
    proc = subprocess.run(
        ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
        capture_output=True,
        # Fail loudly on a conversion error instead of parsing empty output
        # and returning an empty list.
        check=True,
    )
    soup = BeautifulSoup(proc.stdout, features="lxml")
    elements = soup.find_all("text")
    return sorted(elements, key=lambda el: (int(el["top"]), int(el["left"])))


def get_col(l, top, bottom, left, right):
    """Return the text of every element positioned inside a bounding box.

    Args:
        l: Iterable of elements that expose ``["left"]`` and ``["top"]``
            (values convertible with ``int``) and a ``text`` attribute.
        top: Inclusive lower bound on the element's ``top``.
        bottom: Exclusive upper bound on the element's ``top``.
        left: Inclusive lower bound on the element's ``left``.
        right: Exclusive upper bound on the element's ``left``.

    Returns:
        List of ``text`` values, in the order the elements appear in *l*.
    """
    selected = []
    for cell in l:
        # Horizontal test first, then vertical, both half-open intervals.
        if not (left <= int(cell["left"]) < right):
            continue
        if not (top <= int(cell["top"]) < bottom):
            continue
        selected.append(cell.text)
    return selected


def parse_num(s):
    """Parse a formatted number string into a float.

    Thousands separators (``,``) and surrounding whitespace are ignored.
    A value wrapped in parentheses, e.g. ``"(1,234.50)"``, is returned as
    the negative of the enclosed number.

    Args:
        s: String holding the formatted number.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: If the cleaned string is not a valid number; this
            includes an empty string and unbalanced parentheses.
    """
    cleaned = s.replace(",", "").strip()
    # Require both parentheses: slicing on the opening one alone would
    # silently drop the last digit of a malformed value such as "(123".
    if cleaned.startswith("(") and cleaned.endswith(")"):
        return -float(cleaned[1:-1])
    return float(cleaned)


def get_df(l, col1, col2, col3):
    """Assemble a three-column table from positioned text elements.

    Args:
        l: Elements to search, passed through to ``get_col``.
        col1: ``(top, bottom, left, right)`` box whose texts become the
            index labels.
        col2: ``(top, bottom, left, right)`` box whose texts become the
            ``amount`` column.
        col3: ``(top, bottom, left, right)`` box whose texts become the
            ``currency`` column.

    Returns:
        A ``DataFrame`` with columns ``amount`` (converted with
        ``parse_num``) and ``currency``, indexed by the label texts with
        leading whitespace removed.
    """
    amounts = get_col(l, *col2)
    currencies = get_col(l, *col3)
    labels = get_col(l, *col1)
    table = pd.DataFrame(
        {"amount": amounts, "currency": currencies},
        index=labels,
    )
    # Amounts arrive as text; convert them to floats.
    table["amount"] = table["amount"].apply(parse_num)
    table.index = table.index.str.lstrip()
    return table


def get_citi_collateral(d):
    """Return the total collateral reported in the CITI margin notice for *d*.

    Looks for ``262966_MarginNotice_<YYYYMMDD>_*.pdf`` under
    ``DAILY_DIR / "CITI_reports"``, extracts the variation-margin and the
    non-regulatory initial-margin tables from it, and adds their
    "VM Total Collateral" and "Non Reg IM Total Collateral" amounts.

    Args:
        d: Datetime-like object; must provide ``strftime()`` and ``date()``.

    Returns:
        The sum of the two total-collateral amounts.

    Raises:
        FileNotFoundError: If no margin notice matches the date.
        ValueError: If the "Non Regulatory Initial Margin" heading is not
            present in the PDF.
        KeyError: If one of the total-collateral rows is missing from the
            extracted tables.
    """
    pattern = f"262966_MarginNotice_{d.strftime('%Y%m%d')}_*.pdf"
    # If several files match, the first one yielded by glob() is used.
    fname = next((DAILY_DIR / "CITI_reports").glob(pattern), None)
    if fname is None:
        raise FileNotFoundError(f"CITI file not found for date {d.date()}")
    cells = load_pdf(fname)

    def margin_table(top, bottom):
        # Both tables share the same horizontal layout: labels in
        # [70, 100), amounts in [100, 500), currencies in [500, 600).
        return get_df(
            cells,
            (top, bottom, 70, 100),
            (top, bottom, 100, 500),
            (top, bottom, 500, 600),
        )

    # The variation-margin table is read from a fixed vertical band.
    variation_margin = margin_table(370, 500)

    # The initial-margin table is located relative to its heading.
    anchor = next(
        (c for c in cells if c.text == "Non Regulatory Initial Margin"),
        None,
    )
    if anchor is None:
        # Avoid leaking a bare StopIteration out of next().
        raise ValueError(
            f"'Non Regulatory Initial Margin' section not found in {fname}"
        )
    top = int(anchor["top"]) + 10
    initial_margin = margin_table(top, top + 150)

    return (
        variation_margin.loc["VM Total Collateral", "amount"]
        + initial_margin.loc["Non Reg IM Total Collateral", "amount"]
    )