"""Fetch Gmail messages labelled ``swaptions`` and store them as text files."""
import base64
import json
import os
import sys
import unicodedata
from email.utils import parsedate_to_datetime
from pathlib import Path

from apiclient import errors
from bs4 import BeautifulSoup, NavigableString, Tag
from gmail_helpers import GmailMessage
from pytz import timezone

from . import logger


def print_citi_html2(soup):
    """Return the text found in every ``<pre>`` element of *soup*.

    Each string inside each ``<pre>`` tag is NFKD-normalized and the pieces
    are joined with newlines.  Used by :func:`print_citi_html` as a fallback
    when the document has no ``<p>`` element.

    Args:
        soup: a parsed BeautifulSoup document.

    Returns:
        The newline-joined text; an empty string when there is no ``<pre>``.
    """
    lines = []
    for pre in soup.find_all("pre"):
        lines.extend(unicodedata.normalize("NFKD", s) for s in pre.strings)
    return "\n".join(lines)


def print_citi_html(email):
    """Extract the text of an HTML message part as newline-separated lines.

    The HTML returned by ``email.get_content()`` is parsed with the ``lxml``
    parser.  The text is read from the first ``<p>`` element: the string
    right after the opening ``<p>`` plus the string following each ``<br>``
    inside it.  When the document has no ``<p>`` the ``<pre>`` based
    fallback :func:`print_citi_html2` is used instead.

    Args:
        email: an object exposing ``get_content()`` that returns HTML.

    Returns:
        The extracted text, NFKD-normalized, joined with ``"\\n"``.

    Raises:
        ValueError: if the element after ``<p>`` is not a string, or if a
            ``<br>`` is followed by something other than a string or
            another ``<br>``.
    """
    soup = BeautifulSoup(email.get_content(), features="lxml")
    paragraph = soup.find("p")
    if paragraph is None:
        return print_citi_html2(soup)

    first = paragraph.next_element
    if not isinstance(first, NavigableString):
        raise ValueError("weird email")
    lines = [unicodedata.normalize("NFKD", first)]

    for br in paragraph.find_all("br"):
        following = br.next_element
        if isinstance(following, NavigableString):
            lines.append(unicodedata.normalize("NFKD", following))
        elif isinstance(following, Tag) and following.name == "br":
            # Two consecutive <br> tags: keep the visual gap.
            lines.append("\n")
        else:
            raise ValueError
    return "\n".join(lines)


def _saved_message_ids(swaptions_dir):
    """Return the message ids of the files already saved under *swaptions_dir*.

    Saved files live in ``YYYY-MM`` sub-directories and are named
    ``<timestamp>_<message id>``.  Only the file *name* is split, so an
    underscore elsewhere in the path cannot corrupt the extracted id.
    Files whose name contains no underscore are ignored.
    """
    ids = set()
    for path in swaptions_dir.glob("????-??/*"):
        if not path.is_file():
            continue
        _, sep, msg_id = path.name.partition("_")
        if sep:
            ids.add(msg_id)
    return ids


def save_emails(update=True):
    """Download new emails that were labeled swaptions.

    Messages are written to ``$DATA_DIR/swaptions/<YYYY-MM>/`` in files named
    ``<YYYY-MM-DD HH-MM-SS>_<message id>`` (timestamp in America/New_York).
    Each file holds the subject, ``"\\r\\n"``, then the body: the plain-text
    part when there is one, otherwise text extracted from the HTML part.

    Args:
        update: when true, the history id stored in
            ``$DATA_DIR/.lastHistoryId`` is passed to
            ``GmailMessage.list_msg_ids``.  When false, ``None`` is passed
            instead and messages whose id matches an already-saved file are
            skipped.

    After the loop, the history id of the last message that was fetched is
    written back to ``$DATA_DIR/.lastHistoryId``; nothing is written when no
    message was fetched.

    Messages that cannot be decoded or parsed are logged and skipped.
    """
    # NOTE: Path(None) raises TypeError when DATA_DIR is not set.
    DATA_DIR = Path(os.getenv("DATA_DIR"))
    history_file = DATA_DIR / ".lastHistoryId"
    swaptions_dir = DATA_DIR / "swaptions"

    if update:
        last_history_id = int(history_file.read_text())
        existing_msgs = set()
    else:
        existing_msgs = _saved_message_ids(swaptions_dir)
        last_history_id = None

    utc = timezone("utc")
    new_york = timezone("America/New_York")
    latest_history_id = None

    gm = GmailMessage()
    for msg in gm.list_msg_ids("swaptions", last_history_id):
        if msg["id"] in existing_msgs:
            continue
        try:
            message = gm.from_id(msg["id"])
            # Remember the id as soon as the message is fetched, even if
            # decoding it fails below.
            latest_history_id = message.history_id
            logger.info(latest_history_id)
            subject = message["subject"]
            date = parsedate_to_datetime(message["date"])
            if date.tzinfo is None:
                # A naive date is taken to be UTC.
                date = date.replace(tzinfo=utc)
            date = date.astimezone(new_york)
            body = message.get_body("plain")
            if body is None:
                try:
                    content = print_citi_html(message.get_body("html"))
                except ValueError:
                    logger.error(
                        "Can't parse HTML email with subject: "
                        f"{subject} and id: {msg['id']}"
                    )
                    continue
            else:
                content = body.get_content()
        except (KeyError, UnicodeDecodeError, AttributeError) as e:
            logger.error("error decoding " + msg["id"] + f": {e!r}")
            continue
        else:
            save_dir = swaptions_dir / f"{date:%Y-%m}"
            # Race-free, and also creates "swaptions" itself on a first run.
            save_dir.mkdir(parents=True, exist_ok=True)
            out_path = save_dir / f"{date:%Y-%m-%d %H-%M-%S}_{msg['id']}"
            with out_path.open("w") as fh:
                fh.write(subject + "\r\n")
                fh.write(content)

    if latest_history_id is not None:
        # write_text() only accepts str; the value is read back with int().
        history_file.write_text(str(latest_history_id))