diff options
Diffstat (limited to 'python/download_emails.py')
| -rw-r--r-- | python/download_emails.py | 83 |
1 files changed, 0 insertions, 83 deletions
# Reconstructed from the deletion diff of python/download_emails.py
# (index cf738e9e..00000000); diff markers removed, line structure restored.
"""Download Gmail messages labeled ``swaptions`` into ``$DATA_DIR/swaptions``."""
import base64
import json
import logging
import os
import sys
import unicodedata
from email.utils import parsedate_to_datetime
from pathlib import Path

from apiclient import errors
from bs4 import BeautifulSoup, NavigableString, Tag
from pytz import timezone

from gmail_helpers import GmailMessage


def print_citi_html(email):
    """Extract the text of the first ``<p>`` element of an HTML email part.

    The part's content is parsed with the ``lxml`` parser.  The element that
    immediately follows the opening ``<p>`` and the element that immediately
    follows every ``<br>`` inside it are collected; each collected string is
    NFKD-normalized.  A ``<br>`` directly followed by another ``<br>``
    contributes a ``"\\n"`` entry.  All entries are joined with ``"\\n"``.

    NOTE(review): the name suggests this targets one specific sender's HTML
    layout -- confirm before reusing it for other emails.

    Args:
        email: object exposing ``get_content()`` that returns the HTML markup.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: if the element after ``<p>`` is not a string, or the
            element after a ``<br>`` is neither a string nor another ``<br>``.
        AttributeError: if the document has no ``<p>`` element
            (``soup.find`` returns ``None``).
    """
    soup = BeautifulSoup(email.get_content(), features="lxml")
    paragraph = soup.find('p')

    first = paragraph.next_element
    if not isinstance(first, NavigableString):
        raise ValueError("weird email")
    lines = [unicodedata.normalize("NFKD", first)]

    for br in paragraph.find_all('br'):
        following = br.next_element
        if isinstance(following, NavigableString):
            lines.append(unicodedata.normalize("NFKD", following))
        elif isinstance(following, Tag) and following.name == 'br':
            # Two consecutive <br> tags: keep the visual gap.
            lines.append('\n')
        else:
            raise ValueError("weird email")
    return "\n".join(lines)


def save_emails(update=True):
    """Download new emails that were labeled swaptions.

    Each message is written to
    ``$DATA_DIR/swaptions/<YYYY-MM-DD HH-MM-SS>_<message id>`` (timestamp in
    America/New_York) as the subject, ``"\\r\\n"``, then the body.  The plain
    text body is preferred; when absent, the HTML body is converted with
    ``print_citi_html``.  After the loop the history id of the last fetched
    message is stored in ``$DATA_DIR/.lastHistoryId``.

    Args:
        update: when true, list messages starting from the history id stored
            in ``.lastHistoryId``.  When false, list with no history id and
            skip messages whose id already appears in a saved file name.

    Raises:
        ValueError: propagated from ``print_citi_html`` (not caught per
            message).  NOTE(review): this aborts the whole run on one
            unexpected HTML layout -- confirm that is intended.
    """
    data_dir = Path(os.getenv("DATA_DIR"))
    swaptions_dir = data_dir / "swaptions"
    history_file = data_dir / ".lastHistoryId"

    if update:
        last_history_id = int(history_file.read_text())
        existing_msgs = set()
    else:
        # File names look like "<timestamp>_<id>".  Parse only the file name:
        # the full path may contain underscores in its directory components.
        existing_msgs = {
            entry.name.partition("_")[2]
            for entry in swaptions_dir.iterdir()
            if entry.is_file() and "_" in entry.name
        }
        last_history_id = None

    gm = GmailMessage()
    message = None  # last message successfully fetched, if any
    for msg in gm.list_msg_ids('swaptions', last_history_id):
        if msg['id'] in existing_msgs:
            continue
        try:
            message = gm.from_id(msg['id'])
            logging.info(message.history_id)
            subject = message['subject']
            date = parsedate_to_datetime(message['date'])
            if date.tzinfo is None:
                # Naive timestamps are treated as UTC before conversion.
                date = date.replace(tzinfo=timezone('utc'))
            date = date.astimezone(timezone('America/New_York'))
            body = message.get_body('plain')
            if body is None:
                content = print_citi_html(message.get_body('html'))
            else:
                content = body.get_content()
        except (KeyError, UnicodeDecodeError, AttributeError):
            # Best effort: skip this message but keep the cause in the log.
            logging.error("error decoding %s", msg['id'], exc_info=True)
            continue

        target = swaptions_dir / f"{date:%Y-%m-%d %H-%M-%S}_{msg['id']}"
        with target.open("w") as fh:
            fh.write(subject + "\r\n")
            fh.write(content)

    if message is not None:
        # write_text() only accepts str; the id is read back with int().
        history_file.write_text(str(message.history_id))


if __name__ == '__main__':
    try:
        save_emails()
    except errors.HttpError as e:
        # Incremental listing failed: fall back to a full listing that skips
        # messages already saved on disk.
        logging.error(e)
        save_emails(update=False)
