diff options
Diffstat (limited to 'python/parse_emails.py')
| -rw-r--r-- | python/parse_emails.py | 225 |
1 files changed, 148 insertions, 77 deletions
diff --git a/python/parse_emails.py b/python/parse_emails.py index 79d54a81..0f359a6c 100644 --- a/python/parse_emails.py +++ b/python/parse_emails.py @@ -6,13 +6,15 @@ from download_emails import update_emails import datetime import sys -def makedf(r, indextype): +def makedf(r, indextype, quote_source): if indextype=='IG': cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid', - 'pay_offer', 'delta_pay', 'vol', 'gamma'] + 'pay_offer', 'delta_pay', 'vol'] else: cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid', - 'pay_offer', 'delta_pay', 'vol', 'price_vol', 'gamma'] + 'pay_offer', 'delta_pay', 'vol', 'price_vol'] + if quote_source == "BAML": + cols.append('gamma') df = pd.DataFrame.from_records(r, columns = cols) for col in ['delta_rec', 'delta_pay', 'vol', 'price_vol', 'gamma']: if col in df: @@ -23,77 +25,144 @@ def makedf(r, indextype): df[k] = pd.to_numeric(df[k]) except ValueError: pdb.set_trace() + df['quote_source'] = quote_source df.set_index('strike', inplace=True) return df -def parse_email(email_path): - with email_path.open("rb") as fh: - date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000) - subject = fh.readline() - m = re.match("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8')) - if m: - indextype, series = m.groups() - series = int(series) - else: - raise RuntimeError("can't parse subject line: {0} for email {1}".format( - subject.decode("utf-8"), email_path.name)) - flag = False - option_stack = {} - fwd_index = [] - for line in fh: - line = line.decode('utf-8', 'ignore') - line = line.rstrip() - if line.startswith("At"): - for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']: - try: - quotedate = pd.to_datetime(line, format=p, exact=False) - except ValueError: - continue - else: - if quotedate.year == 1900: - quotedate = quotedate.replace(year=date_received.year) - break - else: - raise RuntimeError("can't parse date") - if line.startswith("Ref"): - regex = "Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \ - "Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \ - "\s+Expiry:(?P<expiry>\S+)" - m = re.match(regex, line) +def parse_quotedate(fh, date_received): + for line in fh: + line = line.rstrip() + if line.startswith("At"): + for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']: try: - d = m.groupdict() - d['quotedate'] = quotedate - d['index'] = indextype - d['series'] = series - d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y') - except AttributeError: - print("something wrong with {0}".format(email_path.name)) - continue - if line.startswith("Strike"): - flag = True - r = [] - continue - if flag: - if line: - line = re.sub("[/|]", " ", line) - vals = re.sub(" +", " ", line).rstrip().split(" ") - r.append(vals) + quotedate = pd.to_datetime(line, format=p, exact=False) + except ValueError: continue else: - option_stack[d['expiry']] = makedf(r, indextype) - fwd_index.append(d) - flag = False - r = [] - continue - if flag: - option_stack[d['expiry']] = makedf(r, indextype) + if quotedate.year == 1900: + quotedate = quotedate.replace(year=date_received.year) + break + else: + raise RuntimeError("can't parse date") + return quotedate + +def parse_refline(line): + regex = "Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \ + "Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \ + "\s+Expiry:(?P<expiry>\S+)" + m = re.match(regex, line) + try: + d = m.groupdict() + d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y') + except AttributeError: + print("something wrong with " + fh.name) + return d + +def parse_baml(fh, indextype, series, quotedate): + option_stack = {} + fwd_index = [] + line = "" + while True: + if line == "": + try: + line = next(fh) + except StopIteration: + break + if line.startswith("Ref"): + d = parse_refline(line) + d.update({'quotedate': quotedate, 'index': indextype, 'series': series}) + df, line = parse_baml_block(fh, indextype) + option_stack[d['expiry']] = df fwd_index.append(d) - if option_stack: - fwd_index = pd.DataFrame.from_records(fwd_index, - index='quotedate') - return (quotedate, indextype, series), option_stack, fwd_index else: - raise RuntimeError("empty email: {0}".format(email_path.name)) + line = "" + if option_stack: + fwd_index = pd.DataFrame.from_records(fwd_index, + index='quotedate') + return option_stack, fwd_index + else: + raise RuntimeError("empty email: " + fh.name) + + +def parse_baml_block(fh, indextype): + next(fh) ## skip header + r = [] + line = "" + for line in fh: + line = line.strip() + if line.startswith("Ref") or line == "": + break + line = re.sub("[/|]", " ", line) + vals = re.sub(" +", " ", line).rstrip().split(" ") + if len(vals) < 10: + line = "" + break + r.append(vals) + return makedf(r, indextype, "BAML"), line + +def parse_ms_block(fh, indextype): + next(fh) ## skip header + r = [] + for line in fh: + line = line.rstrip() + if line == "": + break + strike, payer, receiver, vol = line.split("|") + strike = strike.strip() + if indextype == "HY": + strike = strike.split()[0] + pay_bid, pay_offer, delta_pay = payer.strip().split() + rec_bid, rec_offer, rec_pay = receiver.strip().split() + vol = vol.strip() + if indextype == "HY": + vol, price_vol = vol.replace("[","").replace("]","").split() + r.append([strike, pay_bid, pay_offer, delta_pay, + rec_bid, rec_offer, rec_pay, vol, price_vol]) + else: + r.append([strike, pay_bid, pay_offer, delta_pay, + rec_bid, rec_offer, rec_pay, vol]) + return makedf(r, indextype, "MS") + + +def parse_ms(fh, indextype): + option_stack = {} + for line in fh: + line = line.rstrip() + if "EXPIRY" in line: + expiry = line.split(" ")[1] + expiry = pd.to_datetime(expiry, format="%d-%b-%Y") + option_stack[expiry] = parse_ms_block(fh, indextype) + return option_stack + +subject_BAML = re.compile("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s") +subject_MS = re.compile("\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^\d]*(\d.(?:\.\d*)?)") + +def parse_email(email_path): + with email_path.open("rt") as fh: + date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000) + subject = next(fh) + m = subject_BAML.match(subject) + if m: + indextype, series = m.groups() + series = int(series) + quotedate = parse_quotedate(fh, date_received) + return (quotedate, indextype, series), parse_baml(fh, indextype, series, quotedate) + m = subject_MS.match(subject) + if m: + indextype, series, ref = m.groups() + series = int(series) + ref = float(series) + quotedate = parse_quotedate(fh, date_received) + option_stack = parse_ms(fh, indextype) + fwd_index = pd.DataFrame({'quotedate': quotedate, + 'ref': ref, + 'index': indextype, + 'series': series, + 'expiry': list(option_stack.keys())}) + fwd_index.set_index('quotedate', inplace = True) + return (quotedate, indextype, series), (option_stack, fwd_index) + raise RuntimeError("can't parse subject line: {0} for email {1}".format( + subject, email_path.name)) if __name__=="__main__": import pickle @@ -107,14 +176,14 @@ if __name__=="__main__": if f.name in already_uploaded: continue else: - already_uploaded.add(f.name) - try: - key, option_stack, fwd_index = parse_email(f) - except RuntimeError as e: - print(e) - else: - swaption_stack[key] = pd.concat(option_stack, names=['expiry', 'strike']) - index_data = index_data.append(fwd_index) + try: + key, (option_stack, fwd_index) = parse_email(f) + except RuntimeError as e: + print(e) + else: + swaption_stack[key] = pd.concat(option_stack, names=['expiry', 'strike']) + index_data = index_data.append(fwd_index) + already_uploaded.add(f.name) if index_data.empty: sys.exit() for col in ['fwdbpv', 'fwdprice', 'fwdspread', 'ref']: @@ -122,15 +191,17 @@ if __name__=="__main__": index_data['index'] = index_data['index'].astype('category') swaption_stack = pd.concat(swaption_stack, names=['quotedate', 'index', 'series']) - import feather - feather.write_dataframe(swaption_stack, '../../data/swaptions.fth') - feather.write_dataframe(index_data, '../../data/index_data.fth') + # import feather + # feather.write_dataframe(swaption_stack, '../../data/swaptions.fth') + # feather.write_dataframe(index_data, '../../data/index_data.fth') swaption_stack = swaption_stack.drop_duplicates() swaption_stack = swaption_stack.reset_index() index_data = index_data.drop_duplicates() from db import dbengine + import psyscopg2 serenitasdb = dbengine('serenitasdb') + psycopg2.extensions.register_adapter(float, nan_to_null) from sqlalchemy import MetaData, Table meta = MetaData(bind=serenitasdb) swaption_quotes = Table('swaption_quotes', meta, autoload=True) |
