about summary refs log tree commit diff stats
path: root/python/collateral/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/collateral/common.py')
-rw-r--r-- python/collateral/common.py 15
1 files changed, 11 insertions, 4 deletions
diff --git a/python/collateral/common.py b/python/collateral/common.py
index be08a25a..95315d5a 100644
--- a/python/collateral/common.py
+++ b/python/collateral/common.py
@@ -111,15 +111,22 @@ def send_email(d: datetime.date, df: pd.DataFrame) -> None:
)
-def load_pdf(file_path):
+def load_pdf(file_path, pages=False):
proc = subprocess.run(
["pdftohtml", "-xml", "-stdout", "-i", file_path.as_posix()],
capture_output=True,
)
soup = BeautifulSoup(proc.stdout, features="lxml")
- l = soup.findAll("text")
- l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"])))
- return l
+ if pages:
+ r = []
+ for page in soup.findAll("page"):
+ l = page.findAll("text")
+ r.append(sorted(l, key=lambda x: (int(x["top"]), int(x["left"]))))
+ return r
+ else:
+ l = soup.findAll("text")
+ l = sorted(l, key=lambda x: (int(x["top"]), int(x["left"])))
+ return l
def get_col(l, top, bottom, left, right):