diff options
Diffstat (limited to 'python/collateral/common.py')
| -rw-r--r-- | python/collateral/common.py | 24 |
1 files changed, 24 insertions, 0 deletions
def load_pdf(file_path):
    """Extract the positioned text elements of a PDF via ``pdftohtml``.

    Runs ``pdftohtml -xml -stdout -i`` on *file_path*, parses the XML that
    the tool writes to stdout with BeautifulSoup (``lxml`` parser) and
    collects every ``<text>`` element.

    Args:
        file_path: Location of the PDF. A ``pathlib`` path or a plain
            ``str`` is accepted.

    Returns:
        A list of the ``<text>`` tags sorted by their integer ``top``
        attribute and then by their integer ``left`` attribute.

    Raises:
        subprocess.CalledProcessError: If ``pdftohtml`` exits with a
            non-zero status.
    """
    # Imported locally so that helpers which never touch a PDF (such as
    # ``get_col``) stay usable without these dependencies being loaded.
    import subprocess
    from pathlib import Path

    from bs4 import BeautifulSoup

    proc = subprocess.run(
        # Path(...) lets callers pass either a str or a path object.
        ["pdftohtml", "-xml", "-stdout", "-i", Path(file_path).as_posix()],
        capture_output=True,
        # Fail loudly: without this a broken conversion yields an empty
        # stdout and therefore a silently empty result.
        check=True,
    )
    soup = BeautifulSoup(proc.stdout, features="lxml")
    return sorted(
        soup.find_all("text"),
        key=lambda tag: (int(tag["top"]), int(tag["left"])),
    )


def get_col(l, top, bottom, left, right):  # noqa: E741 - name kept for callers
    """Return the text of the elements whose position lies in a rectangle.

    An element ``c`` is selected when ``left <= int(c["left"]) < right`` and
    ``top <= int(c["top"]) < bottom``; both ranges are half-open.

    Args:
        l: Iterable of elements exposing ``["left"]``, ``["top"]`` and a
            ``.text`` attribute (for example the list returned by
            ``load_pdf``).
        top: Inclusive lower bound for the ``top`` attribute.
        bottom: Exclusive upper bound for the ``top`` attribute.
        left: Inclusive lower bound for the ``left`` attribute.
        right: Exclusive upper bound for the ``left`` attribute.

    Returns:
        A list with the ``.text`` of each selected element, in the order
        the elements appear in *l*.
    """
    return [
        c.text
        for c in l
        if left <= int(c["left"]) < right and top <= int(c["top"]) < bottom
    ]
