-rw-r--r--  python/murano.py | 133
1 file changed, 133 insertions(+), 0 deletions(-)
diff --git a/python/murano.py b/python/murano.py
new file mode 100644
index 00000000..6ea78e70
--- /dev/null
+++ b/python/murano.py
@@ -0,0 +1,133 @@
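+## One-off import script: clean up the "murano leads" export and load it into
+## the investors table of the investorsdb Postgres database.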
+import pandas as pd
+from db import dbengine
+import db
+import datetime
+
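+## load the raw leads export: tab-separated and UTF-16-LE encoded despite the .csv.txt name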
+df = pd.read_table("/home/share/guillaume/murano leads.csv.txt", encoding="utf-16-le", low_memory=False)
+## first drop columns that are entirely NA
+df = df.dropna(axis=1, how='all')
+## then drop columns that only contain a single value
+todrop = []
+for col in df:
+ if df[col].value_counts(dropna=False).size == 1:
+ todrop.append(col)
+df = df.drop(todrop, axis=1)
+## drop columns with ID in it
+df = df.drop([col for col in df if 'ID' in col], axis=1)
+## cleanup url: blank cells that contain only "http://", then prefix www-style entries with the scheme
+df.Website = df.Website.replace("http://", "")
+df.loc[df.Website.str.startswith('www').fillna(False),'Website'] = 'http://' + \
+ df.Website[df.Website.str.startswith('www').fillna(False)]
+
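+## drop columns that are not used downstream and coerce the remaining ones to sensible dtypes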
+df = df.drop(['Teams', 'Do Not Call', 'Assigned User Name', 'Hit Status'], axis=1)
+df['Minimum Track Record'] = pd.to_numeric(df['Minimum Track Record'], errors='coerce')
+for col in ['Date Created', 'Date Modified', 'Last Spoke Date']:
+ df[col] = pd.to_datetime(df[col])
+for col in df:
+ if df[col].dtypes == 'object':
+ df.loc[df[col].isnull(),col] = None
+
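+## map the Active/Inactive/Unknown flags to True/False/None so they load into boolean columns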
+for col in ['Allocating', 'Allocating to FoHFs']:
+ df.loc[df[col] == 'Active', col] = True
+ df.loc[df[col] == 'Inactive', col] = False
+ df.loc[df[col] == 'Unknown', col] = None
+## keep only the first of the comma-separated non-primary emails
+df.loc[df['Non-primary emails'].notnull(),'Non-primary emails'] = \
+ df.loc[df['Non-primary emails'].notnull(),'Non-primary emails'].str.extract("([^,]*)", expand=False)
+df.Continent = df.Continent.str.title()
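+## strip the "__SugarMassUpdateClearField__," sentinel (a CRM mass-update artefact) from the strategy fields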
+for col in ['General Strategy Preferences',
+ 'Specific Strategy Preferences']:
+ df.loc[df[col].notnull(),col] = (df.loc[df[col].notnull(),col].
+ str.replace("__SugarMassUpdateClearField__,",""))
+for col in ['Specific Strategy Preferences']:
+ df.loc[df[col].notnull(),col] = (df.loc[df[col].notnull(),col].str.
+ replace("HF_Corporate_Bonds","HF: Corporate Bonds").str.
+ replace("HF_Multi_Asset","HF: Multi Asset").str.
+ replace("OA_Direct_Lending", "OA: Direct Lending").str.
+ replace("PE_Energy", "PE: Energy").str.
+ replace("lo_multi_asset", "LO: Multi Asset").str.
+ replace("PE_Oil_and_Gas", "PE: Oil and Gas"))
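+## split the multi-valued preference fields into Python lists so psycopg2 can pass them as Postgres arrays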
+for col in ['Investor Type', 'Investment Geography', 'General Strategy Preferences',
+ 'Specific Strategy Preferences', 'Volatility Preference',
+ 'Location Preferences']:
+ df.loc[df[col].notnull(),col] = df.loc[df[col].notnull(),col].str.split(",")
+df.loc[df['FuM Currency'] == "Unknown", 'FuM Currency'] = None
+
+## cleanup country names
+country = df['Primary Address Country'].str.strip().str.title()
+df['Primary Address Country'] = country.replace(
+    ["Republic Of Ireland", "United State Of America", "The Netherlands"],
+    ["Ireland", "United States Of America", "Netherlands"])
+
+
+def generate_table(df):
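+    """Print a CREATE TABLE statement for the investors table, mapping the
+    known multi-select/enum columns to custom Postgres types and falling back
+    on text, timestamp or smallint based on the pandas dtype."""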
+ print("CREATE TABLE investors(ID serial PRIMARY KEY,")
+ r = []
+    for col, t in df.dtypes.items():
+ if col in ['Allocating', 'Allocating to FoHFs']:
+ sql_type = 'boolean'
+ elif col == 'Continent':
+ sql_type = 'Region'
+ elif col == 'Investor Type':
+ sql_type = 'InvestorType[]'
+ elif col == 'FuM Currency':
+ sql_type = 'Currency'
+ elif col in ['Investment Geography', 'Location Preferences']:
+ sql_type = 'Geography[]'
+ elif col == 'Mifid':
+ sql_type = 'Mifid'
+ elif col == 'Specific Strategy Preferences':
+ sql_type = 'Strategy_detailed[]'
+ elif col == 'General Strategy Preferences':
+ sql_type = 'Strategy[]'
+ elif col == 'Volatility Preference':
+ sql_type = 'Volatility_tolerance[]'
+ elif col == 'Last Spoke Date':
+ sql_type = 'date'
+ elif t.name == "object":
+ sql_type = 'text'
+ elif t.name == "datetime64[ns]":
+ sql_type = 'timestamp'
+ elif t.name == 'float64':
+ sql_type = 'smallint'
+ else:
+            raise ValueError("No SQL type mapping for column {!r} (dtype {})".format(col, t))
+ r.append('"{}" {}'.format(col, sql_type))
+    print(",\n".join(r) + ")")
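+## generate_table(df) is not called below; presumably it was run once by hand
+## to produce the DDL for the investors table.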
+
+def list_placeholders(df):
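+    """Return one psycopg2 placeholder per column, plus a leading one for the
+    ID column (filled by the row index), casting multi-select columns to
+    their Postgres array types."""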
+ r = ['%s'] # ID
+ for col in df:
+ if col == 'Investor Type':
+ r.append("%s::InvestorType[]")
+ elif col in ['Investment Geography', 'Location Preferences']:
+ r.append("%s::Geography[]")
+ elif col == 'Specific Strategy Preferences':
+ r.append('%s::Strategy_detailed[]')
+ elif col == 'General Strategy Preferences':
+ r.append('%s::Strategy[]')
+ elif col == 'Volatility Preference':
+ r.append('%s::Volatility_tolerance[]')
+ else:
+ r.append('%s')
+ return r
+
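+## psycopg2 has no adapter for pandas NaT, so register one that sends NULL for
+## NaT and a normal timestamp for real datetimes/Timestamps.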
+from psycopg2.extensions import register_adapter, AsIs
+import psycopg2
+def nat_to_null(d, _NULL=AsIs('NULL'),
+                _Timestamp=psycopg2.extensions.TimestampFromPy):
+    ## NaT is a datetime subclass, so it reaches this adapter too; send NULL for it
+    if pd.isnull(d):
+        return _NULL
+    return _Timestamp(d)
+
+register_adapter(datetime.datetime, nat_to_null)
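+## insert the cleaned rows one by one; itertuples() yields the index first,
+## which fills the leading ID placeholder.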
+investorsdb = dbengine('investorsdb')
+sql_str = "INSERT INTO investors VALUES({})".format(",".join(list_placeholders(df)))
+conn = investorsdb.raw_connection()
+with conn.cursor() as c:
+ for r in df.itertuples():
+ try:
+ c.execute(sql_str, r)
+ except db.DataError as e:
+ print(e)
+conn.commit()