blob: 9072f4ec73f915c8065e913b6fe34376509d6f4f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
import base64
import json
import logging
import os
import sys
import unicodedata
from apiclient import errors
from bs4 import BeautifulSoup, NavigableString, Tag
from pathlib import Path
from pytz import timezone
from gmail_helpers import GmailMessage
from email.utils import parsedate_to_datetime
def print_citi_html(email):
    """Extract the text of the first ``<p>`` element of an HTML email part.

    The paragraph is expected to contain only text nodes separated by
    ``<br>`` tags.  Every text node is NFKD-normalized, and the collected
    pieces are joined with newline characters.  A ``<br>`` that is
    immediately followed by another ``<br>`` contributes a newline entry of
    its own, so runs of ``<br>`` show up as blank lines in the result.

    Args:
        email: object whose ``get_content()`` returns the HTML markup.

    Returns:
        The paragraph text as a single newline-joined string.

    Raises:
        ValueError: if the node right after ``<p>`` is not text, or if the
            node right after one of its ``<br>`` tags is neither text nor
            another ``<br>``.
        AttributeError: if the document contains no ``<p>`` element
            (``find`` yields ``None`` and the attribute access below fails).
    """
    soup = BeautifulSoup(email.get_content(), features="lxml")
    paragraph = soup.find('p')

    # The very first node inside the paragraph must be plain text.
    first = paragraph.next_element
    if not isinstance(first, NavigableString):
        raise ValueError("weird email")
    lines = [unicodedata.normalize("NFKD", first)]

    # Walk the <br> tags and look at whatever was parsed right after each.
    for br in paragraph.find_all('br'):
        following = br.next_element
        if isinstance(following, NavigableString):
            lines.append(unicodedata.normalize("NFKD", following))
        elif isinstance(following, Tag) and following.name == 'br':
            lines.append('\n')
        else:
            raise ValueError("weird email")
    return "\n".join(lines)
def save_emails(update=True):
    """Download new emails that were labeled swaptions.

    Each message is written to ``$DATA_DIR/swaptions`` in a file named
    ``"<YYYY-mm-dd HH-MM-SS>_<message id>"``, where the timestamp is the
    message date converted to America/New_York (a date without a timezone
    is taken to be UTC).  The file holds the subject, a ``\\r\\n`` and then
    the plain-text body; when the message has no plain-text body the HTML
    body is converted with ``print_citi_html``.

    After the loop, the history id of the last message that was fetched is
    stored in ``$DATA_DIR/.lastHistoryId``.

    Args:
        update: when true, read the last history id from
            ``$DATA_DIR/.lastHistoryId`` and pass it to
            ``GmailMessage.list_msg_ids``.  When false, pass ``None`` and
            skip every message whose id already appears in a file name
            under ``$DATA_DIR/swaptions``.

    Messages that raise ``KeyError``, ``UnicodeDecodeError`` or
    ``AttributeError`` while being fetched or decoded are logged and skipped.
    """
    data_dir = Path(os.getenv("DATA_DIR"))
    swaptions_dir = data_dir / "swaptions"
    history_file = data_dir / ".lastHistoryId"

    if update:
        last_history_id = int(history_file.read_text())
        existing_msgs = set()
    else:
        # Parse the id from the file *name* only: splitting the full path
        # breaks as soon as any parent directory contains an underscore.
        existing_msgs = {
            x.name.partition("_")[2]
            for x in swaptions_dir.iterdir()
            if x.is_file()
        }
        last_history_id = None

    # Stays None when no message could be fetched, in which case the stored
    # history id is left untouched.
    message = None
    for msg in GmailMessage.list_msg_ids('swaptions', last_history_id):
        if msg['id'] in existing_msgs:
            continue
        try:
            message = GmailMessage.from_id(msg['id'])
            logging.info(message.history_id)
            subject = message['subject']
            date = parsedate_to_datetime(message['date'])
            if date.tzinfo is None:
                date = date.replace(tzinfo=timezone('utc'))
            date = date.astimezone(timezone('America/New_York'))
            body = message.get_body('plain')
            if body is None:
                content = print_citi_html(message.get_body('html'))
            else:
                content = body.get_content()
        except (KeyError, UnicodeDecodeError, AttributeError):
            logging.error("error decoding %s", msg['id'])
            continue

        email = swaptions_dir / f"{date:%Y-%m-%d %H-%M-%S}_{msg['id']}"
        with email.open("w") as fh:
            fh.write(subject + "\r\n")
            fh.write(content)

    if message is not None:
        # write_text only accepts str; coerce in case the id is numeric.
        history_file.write_text(str(message.history_id))
if __name__ == "__main__":
    # Try the incremental download first; on an API error fall back to a
    # full listing that skips messages already saved on disk.
    # NOTE(review): presumably the HttpError means the stored history id is
    # no longer accepted by the API; confirm against GmailMessage.list_msg_ids.
    try:
        save_emails(update=True)
    except errors.HttpError as http_error:
        logging.error(http_error)
        save_emails(update=False)
|