diff options
Diffstat (limited to 'python/collateral')
| -rw-r--r-- | python/collateral/common.py | 15 |
1 files changed, 11 insertions, 4 deletions
diff --git a/python/collateral/common.py b/python/collateral/common.py
index be08a25a..95315d5a 100644
--- a/python/collateral/common.py
+++ b/python/collateral/common.py
@@ -111,15 +111,22 @@ def send_email(d: datetime.date, df: pd.DataFrame) -> None:
     )
 
 
-def load_pdf(file_path):
+def load_pdf(file_path, pages=False):
     proc = subprocess.run(
         ["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
         capture_output=True,
     )
     soup = BeautifulSoup(proc.stdout, features="lxml")
-    l = soup.findAll("text")
-    l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"])))
-    return l
+    if pages:
+        r = []
+        for page in soup.findAll("page"):
+            l = page.findAll("text")
+            r.append(sorted(l, key=lambda x: (int(x["top"]), int(x["left"]))))
+        return r
+    else:
+        l = soup.findAll("text")
+        l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"])))
+        return l
 
 
 def get_col(l, top, bottom, left, right):
