about summary refs log tree commit diff stats
path: root/python/download_emails.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/download_emails.py')
-rw-r--r-- python/download_emails.py 26
1 files changed, 25 insertions, 1 deletions
diff --git a/python/download_emails.py b/python/download_emails.py
index ac4c7b13..9072f4ec 100644
--- a/python/download_emails.py
+++ b/python/download_emails.py
@@ -3,13 +3,34 @@ import json
import logging
import os
import sys
+import unicodedata
from apiclient import errors
+from bs4 import BeautifulSoup, NavigableString, Tag
from pathlib import Path
from pytz import timezone
from gmail_helpers import GmailMessage
from email.utils import parsedate_to_datetime
+
+def print_citi_html(email):
+    """Extract the text of the first ``<p>`` element of an HTML email part.
+
+    The paragraph is expected to consist of runs of text separated by
+    ``<br>`` tags. Each text run is NFKD-normalized and collected; a
+    ``<br>`` that is immediately followed by another ``<br>`` contributes
+    an extra newline entry. The collected entries are joined with newlines.
+
+    Args:
+        email: object whose ``get_content()`` returns the HTML markup
+            (the caller passes ``message.get_body('html')``).
+
+    Returns:
+        str: the collected entries joined with a newline character.
+
+    Raises:
+        ValueError: if the paragraph does not start with text, or if a
+            ``<br>`` is followed by anything other than text or another
+            ``<br>``.
+        AttributeError: if ``email`` is ``None`` or the markup contains no
+            ``<p>`` element.
+    """
+    soup = BeautifulSoup(email.get_content(), features="lxml")
+    paragraph = soup.find('p')
+
+    # The node parsed right after the opening <p> must be its first text run.
+    first = paragraph.next_element
+    if not isinstance(first, NavigableString):
+        raise ValueError("weird email")
+    # NFKD folds compatibility characters (e.g. non-breaking spaces) into
+    # their plain equivalents.
+    lines = [unicodedata.normalize("NFKD", first)]
+
+    for br in paragraph.find_all('br'):
+        node = br.next_element
+        if isinstance(node, NavigableString):
+            lines.append(unicodedata.normalize("NFKD", node))
+        elif isinstance(node, Tag) and node.name == 'br':
+            # Two consecutive <br> tags: keep the empty line between them.
+            lines.append('\n')
+        else:
+            raise ValueError("weird email")
+    return "\n".join(lines)
+
def save_emails(update=True):
"""Download new emails that were labeled swaptions."""
DATA_DIR = Path(os.getenv("DATA_DIR"))
@@ -34,7 +55,10 @@ def save_emails(update=True):
date = date.replace(tzinfo=timezone('utc'))
date = date.astimezone(timezone('America/New_York'))
body = message.get_body('plain')
- content = body.get_content()
+ if body is None:
+ content = print_citi_html(message.get_body('html'))
+ else:
+ content = body.get_content()
except (KeyError, UnicodeDecodeError, AttributeError) as e:
logging.error("error decoding " + msg['id'])
continue