diff options
Diffstat (limited to 'python/murano.py')
| -rw-r--r-- | python/murano.py | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/python/murano.py b/python/murano.py new file mode 100644 index 00000000..6ea78e70 --- /dev/null +++ b/python/murano.py @@ -0,0 +1,133 @@ +import pandas as pd +from db import dbengine +import db +import datetime + +df = pd.read_table("/home/share/guillaume/murano leads.csv.txt", encoding="utf-16-le", low_memory=False) +## first drop all NA values +df = df.dropna(1, 'all') +## then drop columns with only one values +todrop = [] +for col in df: + if df[col].value_counts(dropna=False).size == 1: + todrop.append(col) +df = df.drop(todrop, axis=1) +## drop columns with ID in it +df = df.drop([col for col in df if 'ID' in col], axis=1) +## cleanup url +df.Website.replace("http://","") +df.loc[df.Website.str.startswith('www').fillna(False),'Website'] = 'http://' + \ + df.Website[df.Website.str.startswith('www').fillna(False)] + +df = df.drop(['Teams', 'Do Not Call', 'Assigned User Name', 'Hit Status'], axis=1) +df['Minimum Track Record'] = pd.to_numeric(df['Minimum Track Record'], errors='coerce') +for col in ['Date Created', 'Date Modified', 'Last Spoke Date']: + df[col] = pd.to_datetime(df[col]) +for col in df: + if df[col].dtypes == 'object': + df.loc[df[col].isnull(),col] = None + +for col in ['Allocating', 'Allocating to FoHFs']: + df.loc[df[col] == 'Active', col] = True + df.loc[df[col] == 'Inactive', col] = False + df.loc[df[col] == 'Unknown', col] = None +## strip out commas +df.loc[df['Non-primary emails'].notnull(),'Non-primary emails'] = \ + df.loc[df['Non-primary emails'].notnull(),'Non-primary emails'].str.extract("([^,]*)", expand=False) +df.Continent = df.Continent.str.title() +for col in ['General Strategy Preferences', + 'Specific Strategy Preferences']: + df.loc[df[col].notnull(),col] = (df.loc[df[col].notnull(),col]. + str.replace("__SugarMassUpdateClearField__,","")) +for col in ['Specific Strategy Preferences']: + df.loc[df[col].notnull(),col] = (df.loc[df[col].notnull(),col].str. + replace("HF_Corporate_Bonds","HF: Corporate Bonds").str. + replace("HF_Multi_Asset","HF: Multi Asset").str. + replace("OA_Direct_Lending", "OA: Direct Lending").str. + replace("PE_Energy", "PE: Energy").str. + replace("lo_multi_asset", "LO: Multi Asset").str. + replace("PE_Oil_and_Gas", "PE: Oil and Gas")) +for col in ['Investor Type', 'Investment Geography', 'General Strategy Preferences', + 'Specific Strategy Preferences', 'Volatility Preference', + 'Location Preferences']: + df.loc[df[col].notnull(),col] = df.loc[df[col].notnull(),col].str.split(",") +df['FuM Currency'] = df['FuM Currency'].replace("Unknown", None) + +## cleanup country names +col = df['Primary Address Country'] +col = col.str.strip().str.title() +col = col.replace(["Republic Of Ireland", "United State Of America", "The Netherlands"], + ["Ireland", "United States Of America", "Netherlands"]) + + +def generate_table(df): + print("CREATE TABLE investors(ID serial PRIMARY KEY,") + r = [] + for col, t in df.dtypes.iteritems(): + if col in ['Allocating', 'Allocating to FoHFs']: + sql_type = 'boolean' + elif col == 'Continent': + sql_type = 'Region' + elif col == 'Investor Type': + sql_type = 'InvestorType[]' + elif col == 'FuM Currency': + sql_type = 'Currency' + elif col in ['Investment Geography', 'Location Preferences']: + sql_type = 'Geography[]' + elif col == 'Mifid': + sql_type = 'Mifid' + elif col == 'Specific Strategy Preferences': + sql_type = 'Strategy_detailed[]' + elif col == 'General Strategy Preferences': + sql_type = 'Strategy[]' + elif col == 'Volatility Preference': + sql_type = 'Volatility_tolerance[]' + elif col == 'Last Spoke Date': + sql_type = 'date' + elif t.name == "object": + sql_type = 'text' + elif t.name == "datetime64[ns]": + sql_type = 'timestamp' + elif t.name == 'float64': + sql_type = 'smallint' + else: + raise ValueError("Something fell through the crack") + r.append('"{}" {}'.format(col, sql_type)) + print(",\n".join(r),")") + +def list_placeholders(df): + r = ['%s'] # ID + for col in df: + if col == 'Investor Type': + r.append("%s::InvestorType[]") + elif col in ['Investment Geography', 'Location Preferences']: + r.append("%s::Geography[]") + elif col == 'Specific Strategy Preferences': + r.append('%s::Strategy_detailed[]') + elif col == 'General Strategy Preferences': + r.append('%s::Strategy[]') + elif col == 'Volatility Preference': + r.append('%s::Volatility_tolerance[]') + else: + r.append('%s') + return r + +from psycopg2.extensions import register_adapter, AsIs +import psycopg2 +def nat_to_null(d, _NULL=AsIs('NULL'), + _Timestamp=psycopg2.extensions.TimestampFromPy): + if not type(d) == pd.tslib.NaTType: + return _Timestamp(d) + return _NULL + +register_adapter(datetime.datetime, nat_to_null) +investorsdb = dbengine('investorsdb') +sql_str = "INSERT INTO investors VALUES({})".format(",".join(list_placeholders(df))) +conn = investorsdb.raw_connection() +with conn.cursor() as c: + for r in df.itertuples(): + try: + c.execute(sql_str, r) + except db.DataError as e: + print(e) +conn.commit() |
