python/parse_emails.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

import pandas as pd
import re
from pathlib import Path
import pdb
from download_emails import update_emails

def makedf(r, indextype, ref):
    """Build a numeric quote table, indexed by strike, from parsed text rows.

    Parameters
    ----------
    r : sequence of sequences of str
        One entry per quote row; each row holds one string per column, in
        the column order listed below.
    indextype : str
        ``'IG'`` selects the 9-column layout; any other value selects the
        10-column layout, which has an extra ``PxVol`` column before
        ``Gamma``.
    ref : str or number
        Reference level, broadcast to every row in a ``ref`` column.

    Returns
    -------
    pandas.DataFrame
        All-numeric frame indexed by ``strike``.  The columns ``DeltaRec``,
        ``DeltaPay``, ``Vol``, ``PxVol`` (when present) and ``Gamma`` have any
        ``%`` stripped and are divided by 100.

    Raises
    ------
    ValueError
        If a row does not match the expected number of columns or a cell
        cannot be converted to a number.
    """
    cols = ['Strike', 'RecBid', 'RecOffer', 'DeltaRec', 'PayBid',
            'PayOffer', 'DeltaPay', 'Vol']
    if indextype != 'IG':
        cols.append('PxVol')
    cols.append('Gamma')

    df = pd.DataFrame.from_records(r, columns=cols)
    df['ref'] = ref

    # Percent-quoted columns: drop the "%" sign and rescale to a fraction.
    for col in ['DeltaRec', 'DeltaPay', 'Vol', 'PxVol', 'Gamma']:
        if col in df:
            df[col] = df[col].str.strip("%").astype('float') / 100

    # Everything still held as text (prices, strike, ref) becomes numeric.
    # Testing for "not numeric" rather than "== object" also covers the
    # dedicated string dtype used by newer pandas versions.
    for k in df:
        if not pd.api.types.is_numeric_dtype(df[k]):
            df[k] = pd.to_numeric(df[k])

    # The column is created as 'Strike'; rename it so the index carries the
    # lower-case 'strike' name (indexing on 'strike' directly is a KeyError).
    df = df.rename(columns={'Strike': 'strike'}).set_index('strike')
    return df

if __name__=="__main__":
    # Script entry point: refresh the local email dump, parse every file in
    # ../../data/swaptions into per-expiry quote tables, and store the
    # combined result in swaptions.hdf under the key 'swaptions'.

    # Subject line: optional "Fwd:" prefix, a two-character index type and a
    # one- or two-digit series number followed by whitespace.
    subject_re = re.compile(r"(?:Fwd:)?(\w{2})([0-9]{1,2})\s")
    # "Ref" line: group 1 is the reference level, group 5 the expiry date.
    # Group 2 ("Fwd Px") is optional and is None when the field is absent.
    ref_re = re.compile(r"Ref:(\S+)\s+(?:Fwd Px:(\S+)\s+)?Fwd(?: Spd)?:(\S+)\s+"
                        r"Fwd Bpv:(\S+)\s+Expiry:(\S+)")

    update_emails()
    emails = [f for f in Path("../../data/swaptions").iterdir() if f.is_file()]
    masterdf = {}
    for f in emails:
        with f.open("rb") as fh:
            subject = fh.readline().decode('utf-8')
            m = subject_re.match(subject)
            if not m:
                print("can't parse subject line for {0}".format(f))
                print(subject)
                continue
            indextype, series = m.groups()
            series = int(series)

            # Per-file state; reset here so nothing leaks in from the
            # previously processed email.
            quotedate = None
            ref = None
            expiry = None
            flag = False       # True while reading the rows of a quote table
            r = []
            allexpiriesdf = {}
            for line in fh:
                line = line.decode('utf-8', 'ignore').rstrip()
                if line.startswith("At"):
                    # Two timestamp layouts are tried in turn.
                    for p in ['%m/%d  %H:%M:%S', '%b  %d %Y %H:%M:%S']:
                        try:
                            parsed = pd.to_datetime(line, format=p, exact=False)
                        except ValueError:
                            continue
                        # A format without a year parses as 1900; such
                        # quotes are assigned to 2015.
                        if parsed.year == 1900:
                            parsed = parsed.replace(year=2015)
                        quotedate = parsed
                        break
                    else:
                        # Report instead of dropping into the debugger so the
                        # script can run unattended.
                        print("can't parse quote date for {0}".format(f))
                        print(line)
                if line.startswith("Ref"):
                    m = ref_re.match(line)
                    if m:
                        ref = m.group(1)
                        expiry = pd.to_datetime(m.group(5), format='%d-%b-%y')
                    else:
                        print("something wrong with {0}".format(f))
                        # Forget the previous header so the next table is not
                        # filed under a stale reference/expiry.
                        ref = None
                        expiry = None
                    continue
                if line.startswith("Strike"):
                    # The table header decides the column layout.
                    indextype = 'HY' if "Px Vol" in line else 'IG'
                    flag = True
                    r = []
                    continue
                if flag:
                    if line:
                        # "/" and "|" separate bid/offer pairs and column
                        # groups; treat them as blanks, then split on runs
                        # of spaces.
                        line = re.sub("[/|]", " ", line)
                        vals = re.sub(" +", " ", line).rstrip().split(" ")
                        r.append(vals)
                    else:
                        # A blank line terminates the current table.
                        if expiry is None:
                            print("no expiry for quote table in {0}".format(f))
                        else:
                            allexpiriesdf[expiry] = makedf(r, indextype, ref)
                        flag = False
                        r = []
                    continue
        # The file may end while still inside a table.
        if flag:
            if expiry is None:
                print("no expiry for quote table in {0}".format(f))
            else:
                allexpiriesdf[expiry] = makedf(r, indextype, ref)
        if allexpiriesdf:
            if quotedate is None:
                print("no quote date found in {0}".format(f))
                continue
            masterdf[(quotedate, indextype, series)] = pd.concat(allexpiriesdf, names=['expiry', 'strike'])
    if masterdf:
        masterdf = pd.concat(masterdf, names=['quotedate', 'indextype', 'series'])
        masterdf.to_hdf('swaptions.hdf', key='swaptions')
    else:
        # pd.concat raises on an empty mapping; there is nothing to store.
        print("no swaption quotes parsed; nothing written")