diff options
Diffstat (limited to 'python/quote_parsing/download_emails.py')
| -rw-r--r-- | python/quote_parsing/download_emails.py | 75 |
1 files changed, 75 insertions, 0 deletions
"""Download Gmail messages labeled ``swaptions`` and save them as text files."""
import base64
import json
import os
import sys
import unicodedata
from email.utils import parsedate_to_datetime
from pathlib import Path

from apiclient import errors
from bs4 import BeautifulSoup, NavigableString, Tag
from gmail_helpers import GmailMessage
from pytz import timezone

from . import logger


def print_citi_html(email):
    """Return the text of the first ``<p>`` element of an HTML email part.

    The HTML returned by ``email.get_content()`` is parsed with lxml.  The
    node right after the opening ``<p>`` and the node right after each
    ``<br>`` inside that paragraph are collected: text nodes are
    NFKD-normalized, and a ``<br>`` immediately followed by another ``<br>``
    contributes a ``'\\n'`` entry.  The collected pieces are joined with
    newlines.

    Args:
        email: object exposing ``get_content()`` that returns the HTML markup.

    Returns:
        The collected pieces joined with ``"\\n"``.

    Raises:
        ValueError: if the paragraph does not start with a text node, or a
            ``<br>`` is followed by anything other than text or another
            ``<br>``.
        AttributeError: if the document has no ``<p>`` element (``find``
            returns ``None``).
    """
    soup = BeautifulSoup(email.get_content(), features="lxml")
    paragraph = soup.find('p')
    node = paragraph.next_element
    if not isinstance(node, NavigableString):
        raise ValueError("weird email")
    lines = [unicodedata.normalize("NFKD", node)]
    for br in paragraph.find_all('br'):
        node = br.next_element
        if isinstance(node, NavigableString):
            lines.append(unicodedata.normalize("NFKD", node))
        elif isinstance(node, Tag) and node.name == 'br':
            # Two consecutive <br> tags: keep the blank line.
            lines.append('\n')
        else:
            raise ValueError("weird email")
    return "\n".join(lines)


def _saved_message_ids(directory):
    """Return the message ids encoded in the file names found in *directory*.

    Files are written as ``<timestamp>_<message id>``; the timestamp part
    never contains an underscore, so everything after the first underscore of
    the file *name* is the id.  Only the name is inspected, so underscores in
    parent directory names cannot corrupt the result.
    """
    return {
        path.name.partition("_")[2]
        for path in directory.iterdir()
        if path.is_file()
    }


def save_emails(update=True):
    """Download new emails that were labeled swaptions.

    Each message is written to
    ``$DATA_DIR/swaptions/<YYYY-MM-DD HH-MM-SS>_<message id>``, where the
    timestamp is the message date converted to America/New_York (a date
    without timezone information is taken as UTC).  The file holds the
    subject, ``"\\r\\n"``, then the plain-text body, or the text extracted by
    ``print_citi_html`` when the message has no plain-text part.

    After the loop, the ``history_id`` of the last message that was fetched
    is stored in ``$DATA_DIR/.lastHistoryId``.

    Args:
        update: if true, the history id stored in ``.lastHistoryId`` is
            passed to ``GmailMessage.list_msg_ids`` (presumably to restrict
            the listing to newer messages -- confirm against that helper).
            If false, ``None`` is passed instead and messages whose id
            already appears in a saved file name are skipped.

    Raises:
        TypeError: if the ``DATA_DIR`` environment variable is not set.
        FileNotFoundError: if ``update`` is true and ``.lastHistoryId`` does
            not exist.
        ValueError: propagated from ``print_citi_html`` for HTML bodies it
            cannot handle.
    """
    DATA_DIR = Path(os.getenv("DATA_DIR"))
    swaptions_dir = DATA_DIR / "swaptions"
    history_file = DATA_DIR / ".lastHistoryId"

    if update:
        last_history_id = int(history_file.read_text())
        existing_msgs = set()
    else:
        existing_msgs = _saved_message_ids(swaptions_dir)
        last_history_id = None

    new_york = timezone('America/New_York')
    gm = GmailMessage()
    # Last message successfully fetched; stays None when nothing was fetched.
    message = None
    for msg in gm.list_msg_ids('swaptions', last_history_id):
        if msg['id'] in existing_msgs:
            continue
        try:
            message = gm.from_id(msg['id'])
            logger.info(message.history_id)
            subject = message['subject']
            date = parsedate_to_datetime(message['date'])
            if date.tzinfo is None:
                date = date.replace(tzinfo=timezone('utc'))
            date = date.astimezone(new_york)
            body = message.get_body('plain')
            if body is None:
                content = print_citi_html(message.get_body('html'))
            else:
                content = body.get_content()
        except (KeyError, UnicodeDecodeError, AttributeError):
            logger.error("error decoding " + msg['id'])
            continue

        email = swaptions_dir / f"{date:%Y-%m-%d %H-%M-%S}_{msg['id']}"
        with email.open("w") as fh:
            fh.write(subject + "\r\n")
            fh.write(content)

    if message is not None:
        # write_text() only accepts str; the id is read back with int().
        history_file.write_text(str(message.history_id))
