diff options
Diffstat (limited to 'python/collateral/citi.py')
| -rw-r--r-- | python/collateral/citi.py | 25 |
1 files changed, 1 insertions, 24 deletions
diff --git a/python/collateral/citi.py b/python/collateral/citi.py index 5ea8ce60..acda4b1f 100644 --- a/python/collateral/citi.py +++ b/python/collateral/citi.py @@ -1,8 +1,7 @@ import pandas as pd -import subprocess -from bs4 import BeautifulSoup from pandas.tseries.offsets import BDay from . import DAILY_DIR, bus_day +from .common import load_pdf, get_col def load_file(d): @@ -33,28 +32,6 @@ def download_files(count=20): p.write_bytes(attach.content) -def load_pdf(file_path): - proc = subprocess.run( - ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()], - capture_output=True, - ) - soup = BeautifulSoup(proc.stdout, features="lxml") - l = soup.findAll("text") - l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"]))) - return l - - -def get_col(l, top, bottom, left, right): - return [ - c.text - for c in l - if int(c["left"]) >= left - and int(c["left"]) < right - and int(c["top"]) >= top - and int(c["top"]) < bottom - ] - - def parse_num(s): s = s.replace(",", "") if s[0] == "(": |
