aboutsummaryrefslogtreecommitdiffstats
path: root/python/download_emails.py
blob: 9072f4ec73f915c8065e913b6fe34376509d6f4f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import base64
import json
import logging
import os
import sys
import unicodedata

from apiclient import errors
from bs4 import BeautifulSoup, NavigableString, Tag
from pathlib import Path
from pytz import timezone
from gmail_helpers import GmailMessage
from email.utils import parsedate_to_datetime


def print_citi_html(email):
    """Return the text of the first ``<p>`` element of an HTML email part.

    The HTML obtained from ``email.get_content()`` is parsed with the lxml
    backend. The node that immediately follows the opening ``<p>`` must be
    a text node; after that, the node following each ``<br>`` inside the
    paragraph contributes one entry: its NFKD-normalized text when it is a
    text node, or a bare newline when it is another ``<br>``. The entries
    are joined with newlines.

    Raises:
        ValueError: if the first node, or the node after any ``<br>``, is
            neither text nor (for the latter) a ``<br>`` tag.
    """

    def _entry_for(node):
        # Map the node that follows a <br> to the string it contributes.
        if isinstance(node, NavigableString):
            return unicodedata.normalize("NFKD", node)
        if isinstance(node, Tag) and node.name == 'br':
            return '\n'
        raise ValueError("weird email")

    paragraph = BeautifulSoup(email.get_content(), features="lxml").find('p')

    # The paragraph has to open with plain text; anything else is rejected.
    leading = paragraph.next
    if not isinstance(leading, NavigableString):
        raise ValueError("weird email")

    pieces = [unicodedata.normalize("NFKD", leading)]
    pieces.extend(_entry_for(br.next) for br in paragraph.findAll('br'))
    return "\n".join(pieces)

def save_emails(update=True):
    """Download emails labeled ``swaptions`` and save each one as a file.

    Each message is written to ``$DATA_DIR/swaptions`` under the name
    ``<date>_<message id>``, where the date is the message's ``Date``
    header converted to America/New_York (a header without an offset is
    taken to be UTC). The file contains the subject, a CRLF, then the
    plain-text body; when the message has no plain-text part the text is
    recovered from the HTML part with ``print_citi_html``.

    Messages that raise ``KeyError``, ``UnicodeDecodeError`` or
    ``AttributeError`` while being fetched or decoded are logged and
    skipped. After the loop, the history id of the last message that was
    fetched successfully is stored in ``$DATA_DIR/.lastHistoryId``; if no
    message was fetched the file is left alone.

    Args:
        update: When true, the integer stored in ``.lastHistoryId`` is
            passed to ``GmailMessage.list_msg_ids`` (presumably so that
            only newer messages are listed -- confirm against
            ``gmail_helpers``). When false, ``None`` is passed instead and
            messages whose id already appears in a saved file name are
            skipped.
    """
    data_dir = Path(os.getenv("DATA_DIR"))
    swaptions_dir = data_dir / "swaptions"
    history_file = data_dir / ".lastHistoryId"

    if update:
        last_history_id = int(history_file.read_text())
        existing_msgs = set()
    else:
        # Split only the file name: splitting the whole path picks the
        # wrong token as soon as any parent directory contains "_".
        existing_msgs = {
            x.name.partition("_")[2]
            for x in swaptions_dir.iterdir()
            if x.is_file()
        }
        last_history_id = None

    # History id of the most recent message that was fetched successfully.
    newest_history_id = None

    for msg in GmailMessage.list_msg_ids('swaptions', last_history_id):
        if msg['id'] in existing_msgs:
            continue
        try:
            message = GmailMessage.from_id(msg['id'])
            newest_history_id = message.history_id
            logging.info(newest_history_id)
            subject = message['subject']
            date = parsedate_to_datetime(message['date'])
            if date.tzinfo is None:
                date = date.replace(tzinfo=timezone('utc'))
            date = date.astimezone(timezone('America/New_York'))
            body = message.get_body('plain')
            if body is None:
                content = print_citi_html(message.get_body('html'))
            else:
                content = body.get_content()
        except (KeyError, UnicodeDecodeError, AttributeError):
            logging.error("error decoding " + msg['id'])
            continue
        else:
            email = swaptions_dir / f"{date:%Y-%m-%d %H-%M-%S}_{msg['id']}"
            with email.open("w") as fh:
                fh.write(subject + "\r\n")
                fh.write(content)

    if newest_history_id is not None:
        # The file is read back with int(), so store the decimal text
        # whether the helper exposes the id as a str or an int.
        history_file.write_text(str(newest_history_id))

if __name__ == '__main__':
    # Script entry point: attempt an incremental download first.
    try:
        save_emails()
    except errors.HttpError as http_error:
        # The incremental run failed with an API HTTP error; log it and
        # retry without the stored history id, skipping files already saved.
        logging.error(http_error)
        save_emails(update=False)