aboutsummaryrefslogtreecommitdiffstats
path: root/python/parse_emails.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/parse_emails.py')
-rw-r--r--python/parse_emails.py225
1 files changed, 148 insertions, 77 deletions
diff --git a/python/parse_emails.py b/python/parse_emails.py
index 79d54a81..0f359a6c 100644
--- a/python/parse_emails.py
+++ b/python/parse_emails.py
@@ -6,13 +6,15 @@ from download_emails import update_emails
import datetime
import sys
-def makedf(r, indextype):
+def makedf(r, indextype, quote_source):
if indextype=='IG':
cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid',
- 'pay_offer', 'delta_pay', 'vol', 'gamma']
+ 'pay_offer', 'delta_pay', 'vol']
else:
cols = ['strike', 'rec_bid', 'rec_offer', 'delta_rec', 'pay_bid',
- 'pay_offer', 'delta_pay', 'vol', 'price_vol', 'gamma']
+ 'pay_offer', 'delta_pay', 'vol', 'price_vol']
+ if quote_source == "BAML":
+ cols.append('gamma')
df = pd.DataFrame.from_records(r, columns = cols)
for col in ['delta_rec', 'delta_pay', 'vol', 'price_vol', 'gamma']:
if col in df:
@@ -23,77 +25,144 @@ def makedf(r, indextype):
df[k] = pd.to_numeric(df[k])
except ValueError:
pdb.set_trace()
+ df['quote_source'] = quote_source
df.set_index('strike', inplace=True)
return df
-def parse_email(email_path):
- with email_path.open("rb") as fh:
- date_received = datetime.datetime.fromtimestamp(int(fh.readline())/1000)
- subject = fh.readline()
- m = re.match("(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s", subject.decode('utf-8'))
- if m:
- indextype, series = m.groups()
- series = int(series)
- else:
- raise RuntimeError("can't parse subject line: {0} for email {1}".format(
- subject.decode("utf-8"), email_path.name))
- flag = False
- option_stack = {}
- fwd_index = []
- for line in fh:
- line = line.decode('utf-8', 'ignore')
- line = line.rstrip()
- if line.startswith("At"):
- for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']:
- try:
- quotedate = pd.to_datetime(line, format=p, exact=False)
- except ValueError:
- continue
- else:
- if quotedate.year == 1900:
- quotedate = quotedate.replace(year=date_received.year)
- break
- else:
- raise RuntimeError("can't parse date")
- if line.startswith("Ref"):
- regex = "Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \
- "Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \
- "\s+Expiry:(?P<expiry>\S+)"
- m = re.match(regex, line)
def parse_quotedate(fh, date_received):
    """Scan *fh* until a line starting with "At" and parse it as the quote time.

    Leaves *fh* positioned on the line after the date line, so the caller can
    keep parsing the email body.  Two timestamp layouts are tried; the short
    one omits the year, in which case the year of *date_received* is used.

    Raises RuntimeError when a date line is found but unparseable, or when no
    "At" line exists at all (previously this fell off the loop and returned
    None, producing confusing downstream failures).
    """
    for line in fh:
        line = line.rstrip()
        if not line.startswith("At"):
            continue
        for p in ['%m/%d %H:%M:%S', '%b %d %Y %H:%M:%S']:
            try:
                quotedate = pd.to_datetime(line, format=p, exact=False)
            except ValueError:
                continue
            if quotedate.year == 1900:
                # short format carries no year; borrow it from the email
                quotedate = quotedate.replace(year=date_received.year)
            return quotedate
        raise RuntimeError("can't parse date")
    raise RuntimeError("can't find quote date line")
+
def parse_refline(line):
    """Parse a BAML "Ref:" forward-quote line into a dict.

    Expected shape: ``Ref:<ref> [Fwd Px:<px>] Fwd[ Spd]:<spd> Fwd Bpv:<bpv>
    Expiry:<dd-Mon-yy>``.  The 'expiry' value is converted to a Timestamp;
    all other values stay as strings.  Returns None when the line does not
    match (previously the AttributeError handler referenced an undefined
    ``fh`` — a NameError — and then returned an unbound ``d``).
    """
    regex = r"Ref:(?P<ref>\S+)\s+(?:Fwd Px:(?P<fwdprice>\S+)\s+)?" \
            r"Fwd(?: Spd)?:(?P<fwdspread>\S+)\s+Fwd Bpv:(?P<fwdbpv>\S+)" \
            r"\s+Expiry:(?P<expiry>\S+)"
    m = re.match(regex, line)
    if m is None:
        print("something wrong with line: " + line)
        return None
    d = m.groupdict()
    d['expiry'] = pd.to_datetime(d['expiry'], format='%d-%b-%y')
    return d
+
def parse_baml(fh, indextype, series, quotedate):
    """Parse the body of a BAML quote email.

    Walks *fh* looking for "Ref:" lines; each one introduces a strike table
    read by parse_baml_block.  Returns (option_stack, fwd_index): a dict
    mapping expiry -> strike DataFrame, and a DataFrame of the forward/ref
    quotes indexed by quotedate.  Raises RuntimeError when no table is found.
    """
    option_stack = {}
    fwd_index = []
    pending = ""
    while True:
        if pending == "":
            try:
                pending = next(fh)
            except StopIteration:
                break
        if not pending.startswith("Ref"):
            # not a section header; drop it and read on
            pending = ""
            continue
        header = parse_refline(pending)
        header.update({'quotedate': quotedate,
                       'index': indextype,
                       'series': series})
        # the block parser hands back the lookahead line that ended the table
        table, pending = parse_baml_block(fh, indextype)
        option_stack[header['expiry']] = table
        fwd_index.append(header)
    if not option_stack:
        raise RuntimeError("empty email: " + fh.name)
    fwd_index = pd.DataFrame.from_records(fwd_index,
                                          index='quotedate')
    return option_stack, fwd_index
+
+
def parse_baml_block(fh, indextype):
    """Read one BAML strike table from *fh*.

    Skips the column-header row, then collects data rows until a blank line,
    a new "Ref" line, or a row with fewer than 10 fields.  Returns
    (DataFrame, lookahead) where lookahead is the line that terminated the
    table ("" when the table simply ended).
    """
    next(fh)  # discard the column-header row
    rows = []
    line = ""
    for line in fh:
        line = line.strip()
        if line == "" or line.startswith("Ref"):
            break
        line = re.sub("[/|]", " ", line)
        fields = re.sub(" +", " ", line).rstrip().split(" ")
        if len(fields) < 10:
            # short row: end of table, nothing to hand back to the caller
            line = ""
            break
        rows.append(fields)
    return makedf(rows, indextype, "BAML"), line
+
def parse_ms_block(fh, indextype):
    """Parse one Morgan Stanley expiry block into a strike DataFrame.

    Rows look like ``strike | payer | receiver | vol`` with each quote cell
    holding "bid offer delta".  HY strikes carry a trailing token that is
    dropped, and the HY vol cell is "[vol price_vol]".

    BUG FIX: rows were appended payer-first, but makedf's column list is
    receiver-first ('rec_bid', 'rec_offer', 'delta_rec', then 'pay_*'), so
    payer quotes ended up under the rec_* columns.  Rows are now appended in
    the order makedf labels them; 'rec_pay' was also renamed 'delta_rec'.
    """
    next(fh)  # skip the column-header row
    r = []
    for line in fh:
        line = line.rstrip()
        if line == "":
            break
        strike, payer, receiver, vol = line.split("|")
        strike = strike.strip()
        if indextype == "HY":
            strike = strike.split()[0]
        pay_bid, pay_offer, delta_pay = payer.strip().split()
        rec_bid, rec_offer, delta_rec = receiver.strip().split()
        vol = vol.strip()
        if indextype == "HY":
            vol, price_vol = vol.replace("[", "").replace("]", "").split()
            r.append([strike, rec_bid, rec_offer, delta_rec,
                      pay_bid, pay_offer, delta_pay, vol, price_vol])
        else:
            r.append([strike, rec_bid, rec_offer, delta_rec,
                      pay_bid, pay_offer, delta_pay, vol])
    return makedf(r, indextype, "MS")
+
+
def parse_ms(fh, indextype):
    """Parse the body of a Morgan Stanley quote email.

    Each expiry section is introduced by a line containing "EXPIRY"; the
    table that follows is read by parse_ms_block.  Returns a dict mapping
    expiry Timestamp -> strike DataFrame.
    """
    option_stack = {}
    for raw in fh:
        raw = raw.rstrip()
        if "EXPIRY" not in raw:
            continue
        expiry = pd.to_datetime(raw.split(" ")[1], format="%d-%b-%Y")
        option_stack[expiry] = parse_ms_block(fh, indextype)
    return option_stack
+
# Subject-line patterns identifying the quote source (BAML vs Morgan Stanley).
subject_BAML = re.compile(r"(?:Fwd:){0,2}(?:BAML )?(\w{2})([0-9]{1,2})\s")
# BUG FIX: the ref group was (\d.(?:\.\d*)?) — the unescaped '.' matched any
# character, so "105.5" captured as "10".  \d+ matches the full integer part.
subject_MS = re.compile(r"\$\$ MS CDX OPTIONS: (IG|HY)(\d{2})[^\d]*(\d+(?:\.\d*)?)")
+
def parse_email(email_path):
    """Parse one saved quote email at *email_path*.

    File layout: first line is the receive time in epoch milliseconds, second
    line is the subject, then the quote body.  Dispatches on the subject to
    the BAML or MS body parser.

    Returns ((quotedate, indextype, series), (option_stack, fwd_index)).
    Raises RuntimeError when the subject matches neither pattern.
    """
    with email_path.open("rt") as fh:
        date_received = datetime.datetime.fromtimestamp(int(fh.readline()) / 1000)
        subject = next(fh)
        m = subject_BAML.match(subject)
        if m:
            indextype, series = m.groups()
            series = int(series)
            quotedate = parse_quotedate(fh, date_received)
            return (quotedate, indextype, series), parse_baml(fh, indextype, series, quotedate)
        m = subject_MS.match(subject)
        if m:
            indextype, series, ref = m.groups()
            series = int(series)
            # BUG FIX: was float(series), which clobbered the parsed ref price
            ref = float(ref)
            quotedate = parse_quotedate(fh, date_received)
            option_stack = parse_ms(fh, indextype)
            fwd_index = pd.DataFrame({'quotedate': quotedate,
                                      'ref': ref,
                                      'index': indextype,
                                      'series': series,
                                      'expiry': list(option_stack.keys())})
            fwd_index.set_index('quotedate', inplace=True)
            return (quotedate, indextype, series), (option_stack, fwd_index)
        raise RuntimeError("can't parse subject line: {0} for email {1}".format(
            subject, email_path.name))
if __name__=="__main__":
import pickle
@@ -107,14 +176,14 @@ if __name__=="__main__":
if f.name in already_uploaded:
continue
else:
- already_uploaded.add(f.name)
- try:
- key, option_stack, fwd_index = parse_email(f)
- except RuntimeError as e:
- print(e)
- else:
- swaption_stack[key] = pd.concat(option_stack, names=['expiry', 'strike'])
- index_data = index_data.append(fwd_index)
+ try:
+ key, (option_stack, fwd_index) = parse_email(f)
+ except RuntimeError as e:
+ print(e)
+ else:
+ swaption_stack[key] = pd.concat(option_stack, names=['expiry', 'strike'])
+ index_data = index_data.append(fwd_index)
+ already_uploaded.add(f.name)
if index_data.empty:
sys.exit()
for col in ['fwdbpv', 'fwdprice', 'fwdspread', 'ref']:
@@ -122,15 +191,17 @@ if __name__=="__main__":
index_data['index'] = index_data['index'].astype('category')
swaption_stack = pd.concat(swaption_stack, names=['quotedate', 'index', 'series'])
- import feather
- feather.write_dataframe(swaption_stack, '../../data/swaptions.fth')
- feather.write_dataframe(index_data, '../../data/index_data.fth')
+ # import feather
+ # feather.write_dataframe(swaption_stack, '../../data/swaptions.fth')
+ # feather.write_dataframe(index_data, '../../data/index_data.fth')
swaption_stack = swaption_stack.drop_duplicates()
swaption_stack = swaption_stack.reset_index()
index_data = index_data.drop_duplicates()
from db import dbengine
import psycopg2  # BUG FIX: was misspelled 'psyscopg2' (ImportError at runtime)
serenitasdb = dbengine('serenitasdb')
# NOTE(review): nan_to_null is not defined or imported in this hunk — confirm
# it is defined elsewhere in this file before relying on this adapter.
psycopg2.extensions.register_adapter(float, nan_to_null)
from sqlalchemy import MetaData, Table
meta = MetaData(bind=serenitasdb)
swaption_quotes = Table('swaption_quotes', meta, autoload=True)