"""L2-regularised logistic regression on the spam dataset, fitted with
L-BFGS-B: finite-difference gradient check, 10-fold cross-validation over
the prior variance sigma^2, and error-curve plotting."""
import sys
from math import exp

import numpy as np
from scipy.special import expit
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed
from sklearn import preprocessing
import seaborn
import matplotlib.pyplot as plt

seaborn.set_style("white")


class LogReg(BaseEstimator, ClassifierMixin):
    """Logistic regression with an L2 penalty ||w||^2 / (2 * sigmas),
    i.e. a Gaussian prior on the weights with variance sigmas."""

    def __init__(self, sigmas=1.):
        self.sigmas = sigmas

    def fit(self, x, y):
        w = np.zeros(x.shape[1])

        def aux(w):
            return l_gradient(w, x, y, self.sigmas)

        # jac=True: aux returns (loss, gradient) in one call.
        self.w = minimize(aux, w, jac=True, method='L-BFGS-B',
                          options={'disp': False, 'gtol': 1e-10}).x
        return self

    def predict(self, x):
        # The decision boundary is x . w = 0.
        pred = np.dot(x, self.w)
        return (pred > 0).astype(int)


def l(w, x, y, sigmas):
    """Negative penalised log-likelihood."""
    x2 = np.dot(x, w)
    # -log sigmoid(z) = log(1 + exp(-z)); logaddexp avoids the underflow
    # that np.log(expit(x2)) hits for large negative logits.
    loss = (np.logaddexp(0, -x2).sum() + np.inner(1 - y, x2)
            + 1 / (2 * sigmas) * np.inner(w, w))
    return loss


def l_gradient(w, x, y, sigmas):
    """Negative penalised log-likelihood and its gradient."""
    x2 = np.dot(x, w)
    loss = (np.logaddexp(0, -x2).sum() + np.inner(x2, 1 - y)
            + 1 / (2 * sigmas) * np.inner(w, w))
    # Per-sample gradient is (sigmoid(x.w) - y) * x, written here as
    # -sigmoid(-x.w) * x + (1 - y) * x, plus the penalty term w / sigmas.
    grad = (np.sum(-x * expit(-x2)[:, None], axis=0)
            + np.sum(x * (1 - y)[:, None], axis=0)
            + 1 / sigmas * w)
    return loss, grad


def test_gradient(w, x, y, sigmas):
    """Compare each analytic gradient component with a central difference."""
    # eps = 1e-10 is swamped by cancellation in double precision;
    # ~1e-6 is close to optimal for central differences.
    eps = 1e-6
    _, g = l_gradient(w, x, y, sigmas)
    for i in range(len(g)):
        z = np.zeros(len(g))
        z[i] = eps
        l1 = l(w + z, x, y, sigmas)
        z = np.zeros(len(g))
        z[i] = -eps
        l2 = l(w + z, x, y, sigmas)
        print((l1 - l2) / (2 * eps), g[i])


def cross_validation(x, y, x_test, y_test):
    """Sweep sigma^2 over 10^-8 .. 10^8; print mean CV and test accuracy."""
    for sigmas in [10 ** i for i in range(-8, 9)]:
        cv_scores = []    # accuracy on the held-out fold
        test_scores = []  # accuracy on the separate test set
        for train, test in KFold(n_splits=10, shuffle=True).split(x):
            mod = LogReg(sigmas=sigmas)
            x_train_f = x[train]
            y_train_f = y[train]
            x_test_f = x[test]
            y_test_f = y[test]
            mod = mod.fit(x_train_f, y_train_f)
            cv_scores.append(accuracy_score(y_test_f, mod.predict(x_test_f)))
            test_scores.append(accuracy_score(y_test, mod.predict(x_test)))
        print(np.mean(cv_scores), np.mean(test_scores))


def plot_scores():
    """Plot CV and test error from the two-column output of cross_validation."""
    plt.figure(figsize=(9, 6))
    with open(sys.argv[1]) as f:
        values = [[float(v) for v in line.split()] for line in f]
    y1, y2 = zip(*values)
    x = range(-8, 9)
    y1 = 1 - np.array(y1)  # accuracy -> error
    y2 = 1 - np.array(y2)
    plt.plot(x, y1, label='CV error')
    plt.plot(x, y2, label='Test error')
    plt.legend()
    plt.xlabel("log(sigma^2)")
    plt.ylabel("Error")
    plt.savefig(sys.argv[1] + ".pdf")


def three(x, y, x_test, y_test):
    """Fit at sigma^2 = e^2; report train error, test error, and the weights."""
    sigmas = exp(2)
    mod = LogReg(sigmas=sigmas)
    mod.fit(x, y)
    print(1 - accuracy_score(y, mod.predict(x)))
    print(1 - accuracy_score(y_test, mod.predict(x_test)))
    print(mod.w)


def norm(x):
    """Log-transform three heavy-tailed feature columns."""
    for i in [54, 55, 56]:
        x[:, i] = np.log(1 + x[:, i])
    return x


if __name__ == "__main__":
    data_train = np.loadtxt("spam.train.dat")
    x_train = data_train[:, :-1]
    y_train = data_train[:, -1].astype(int)
    data_test = np.loadtxt("spam.test.dat")
    x_test = data_test[:, :-1]
    y_test = data_test[:, -1].astype(int)
    #x_train = preprocessing.scale(x_train)
    #x_test = preprocessing.scale(x_test)
    #x_train = norm(x_train)
    #x_test = norm(x_test)
    cross_validation(x_train, y_train, x_test, y_test)
    #plot_scores()
    #three(x_train, y_train, x_test, y_test)
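
# Hypothetical usage sketch (not part of the original run): test_gradient is
# defined above but never invoked. To check the analytic gradient against
# central differences before cross-validating, one could add inside the
# __main__ block, e.g.:
#
#     w0 = np.random.randn(x_train.shape[1]) * 0.01  # small random start
#     test_gradient(w0, x_train, y_train, sigmas=1.0)
#
# Each printed pair should agree to several significant digits.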