import base64
import json
import logging
import os
import sys
import unicodedata
from email.utils import parsedate_to_datetime
from pathlib import Path

from apiclient import errors
from bs4 import BeautifulSoup, NavigableString, Tag
from pytz import timezone

from gmail_helpers import GmailMessage


def print_citi_html(email):
    """Return the text of the first ``<p>`` element of an HTML email part.

    The content of ``email`` is parsed with BeautifulSoup (lxml parser).
    The string that directly follows the opening ``<p>`` and the string
    that directly follows each ``<br>`` inside that paragraph are
    NFKD-normalized and joined with newlines.  A ``<br>`` immediately
    followed by another ``<br>`` contributes a ``'\\n'`` entry, which
    shows up as extra blank lines in the joined result.

    Args:
        email: object exposing ``get_content()`` that returns the HTML
            markup (presumably an ``email.message.EmailMessage`` part --
            confirm against ``GmailMessage.get_body``).

    Returns:
        The extracted text as a single newline-joined string.

    Raises:
        ValueError: if the element following ``<p>`` or a ``<br>`` is
            neither a string nor (for ``<br>``) another ``<br>`` tag.
        AttributeError: if the markup contains no ``<p>`` element
            (``soup.find`` returns ``None``); ``save_emails`` catches it.
    """
    soup = BeautifulSoup(email.get_content(), features="lxml")
    p = soup.find('p')

    first = p.next_element
    if not isinstance(first, NavigableString):
        raise ValueError("weird email")
    lines = [unicodedata.normalize("NFKD", first)]

    for br in p.find_all('br'):
        following = br.next_element
        if isinstance(following, NavigableString):
            lines.append(unicodedata.normalize("NFKD", following))
        elif isinstance(following, Tag) and following.name == 'br':
            # Two consecutive <br>: keep the empty line.
            lines.append('\n')
        else:
            raise ValueError("weird email")
    return "\n".join(lines)


def _existing_message_ids(directory):
    """Return the message ids of the files already saved in *directory*.

    Saved files are named ``<timestamp>_<message id>``.  Only the file
    name is parsed, so underscores elsewhere in the path cannot corrupt
    the extracted id; a file without an underscore yields ``""``.
    """
    return {
        x.name.partition("_")[2] for x in directory.iterdir() if x.is_file()
    }


def save_emails(update=True):
    """Download new emails that were labeled swaptions.

    Each message is written to
    ``$DATA_DIR/swaptions/<YYYY-mm-dd HH-MM-SS>_<message id>`` (timestamp
    in America/New_York), with the subject on the first line followed by
    the body.  The plain-text body is used when present, otherwise the
    HTML body is converted with ``print_citi_html``.  Afterwards the
    history id of the last fetched message is stored in
    ``$DATA_DIR/.lastHistoryId``.

    Args:
        update: when true, the history id stored in ``.lastHistoryId`` is
            passed to ``GmailMessage.list_msg_ids``.  When false, ``None``
            is passed instead and messages that already have a file in
            the ``swaptions`` directory are skipped.
    """
    DATA_DIR = Path(os.getenv("DATA_DIR"))

    if update:
        last_history_id = int((DATA_DIR / ".lastHistoryId").read_text())
        existing_msgs = set()
    else:
        existing_msgs = _existing_message_ids(DATA_DIR / "swaptions")
        last_history_id = None

    # Last message successfully fetched; stays None if there was none.
    message = None
    for msg in GmailMessage.list_msg_ids('swaptions', last_history_id):
        if msg['id'] in existing_msgs:
            continue
        try:
            message = GmailMessage.from_id(msg['id'])
            logging.info(message.history_id)
            subject = message['subject']
            date = parsedate_to_datetime(message['date'])
            if date.tzinfo is None:
                # Naive datetime: treat it as UTC before converting.
                date = date.replace(tzinfo=timezone('utc'))
            date = date.astimezone(timezone('America/New_York'))
            body = message.get_body('plain')
            if body is None:
                content = print_citi_html(message.get_body('html'))
            else:
                content = body.get_content()
        except (KeyError, UnicodeDecodeError, AttributeError):
            # Best effort: skip messages that cannot be decoded.
            logging.error("error decoding %s", msg['id'])
            continue
        else:
            email = (DATA_DIR / "swaptions" /
                     f"{date:%Y-%m-%d %H-%M-%S}_{msg['id']}")
            with email.open("w") as fh:
                fh.write(subject + "\r\n")
                fh.write(content)

    if message is not None:
        # str() because write_text only accepts text and the value is
        # parsed back with int() on the next run.
        (DATA_DIR / ".lastHistoryId").write_text(str(message.history_id))


if __name__ == '__main__':
    try:
        save_emails()
    except errors.HttpError as e:
        # NOTE(review): presumably the stored history id was rejected by
        # the API -- confirm.  Retry without it, skipping saved messages.
        logging.error(e)
        save_emails(update=False)