diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/download_emails.py | 26 |
1 files changed, 25 insertions, 1 deletions
diff --git a/python/download_emails.py b/python/download_emails.py
index ac4c7b13..9072f4ec 100644
--- a/python/download_emails.py
+++ b/python/download_emails.py
@@ -3,13 +3,34 @@ import json
 import logging
 import os
 import sys
+import unicodedata
 from apiclient import errors
+from bs4 import BeautifulSoup, NavigableString, Tag
 from pathlib import Path
 from pytz import timezone
 from gmail_helpers import GmailMessage
 from email.utils import parsedate_to_datetime
 
+
+def print_citi_html(email):
+    soup = BeautifulSoup(email.get_content(), features="lxml")
+    p = soup.find('p')
+    s = p.next
+    if isinstance(s, NavigableString):
+        l = [unicodedata.normalize("NFKD", s)]
+    else:
+        raise ValueError("weird email")
+    for br in p.findAll('br'):
+        s = br.next
+        if isinstance(s, NavigableString):
+            l.append(unicodedata.normalize("NFKD", s))
+        elif isinstance(s, Tag) and s.name == 'br':
+            l.append('\n')
+        else:
+            raise ValueError("weird email")
+    return "\n".join(l)
+
 
 def save_emails(update=True):
     """Download new emails that were labeled swaptions."""
     DATA_DIR = Path(os.getenv("DATA_DIR"))
@@ -34,7 +55,10 @@ def save_emails(update=True):
             date = date.replace(tzinfo=timezone('utc'))
             date = date.astimezone(timezone('America/New_York'))
             body = message.get_body('plain')
-            content = body.get_content()
+            if body is None:
+                content = print_citi_html(message.get_body('html'))
+            else:
+                content = body.get_content()
         except (KeyError, UnicodeDecodeError, AttributeError) as e:
             logging.error("error decoding " + msg['id'])
             continue
