diff options
Diffstat (limited to 'hw2/p3.py')
| -rw-r--r-- | hw2/p3.py | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/hw2/p3.py b/hw2/p3.py new file mode 100644 index 0000000..f4a3ffa --- /dev/null +++ b/hw2/p3.py @@ -0,0 +1,125 @@ +import numpy as np +from scipy.special import expit +from scipy.optimize import minimize +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score +from sklearn.cross_validation import KFold +from sklearn import preprocessing +import seaborn +import matplotlib.pyplot as plt +from math import exp +import sys + +seaborn.set_style("white") + + +class LogReg(BaseEstimator, ClassifierMixin): + + def __init__(self, sigmas=1.): + self.sigmas = sigmas + + def fit(self, x, y): + w = np.zeros(x.shape[1]) + + def aux(w): + return l_gradient(w, x, y, self.sigmas) + + self.w = minimize(aux, w, jac=True, method='L-BFGS-B', + options={'disp': False, 'gtol': 1e-10}).x + return self + + def predict(self, x): + pred = np.dot(x, self.w) + return (pred > 0).astype(int) + + +def l(w, x, y, sigmas): + x2 = np.dot(x, w) + l = (-np.log(expit(x2)).sum() + np.inner(1-y, x2) + + 1 / (2 * sigmas) * np.inner(w, w)) + return l + + +def l_gradient(w, x, y, sigmas): + x2 = np.dot(x, w) + l = (-np.log(expit(x2)).sum() + np.inner(x2, (1 - y)) + + 1 / (2 * sigmas) * np.inner(w, w)) + grad = (np.sum(-x * expit(-x2)[:, None], axis=0) + + np.sum(x * (1 - y)[:, None], axis=0) + 1 / sigmas * w) + return l, grad + + +def test_gradient(w, x, y, sigmas): + eps = 1e-10 + _, g = l_gradient(w, x, y, sigmas) + for i in xrange(len(g)): + z = np.zeros(len(g)) + z[i] = eps + l1 = l(w + z, x, y, sigmas) + z = np.zeros(len(g)) + z[i] = -eps + l2 = l(w + z, x, y, sigmas) + print (l1 - l2) / (2 * eps), g[i] + + +def cross_validation(x, y): + for sigmas in [10 ** i for i in xrange(-8, 9)]: + train_scores = [] + test_scores = [] + for train, test in KFold(n=x.shape[0], n_folds=10, shuffle=True): + mod = LogReg(sigmas=sigmas) + x_train_f = x[train] + y_train_f = y[train] + x_test_f = x[test] + y_test_f = y[test] + mod = mod.fit(x_train_f, y_train_f) + train_scores.append(accuracy_score(y_test_f, + mod.predict(x_test_f))) + test_scores.append(accuracy_score(y_test, mod.predict(x_test))) + print np.mean(train_scores), np.mean(test_scores) + + +def plot_scores(): + plt.figure(figsize=(9, 6)) + values = [map(float, line.strip().split()) + for line in open(sys.argv[1])] + y1, y2 = zip(*values) + x = xrange(-8, 9) + y1 = 1 - np.array(y1) + y2 = 1 - np.array(y2) + plt.plot(x, y1, label='CV error') + plt.plot(x, y2, label='Test error') + plt.legend() + plt.xlabel("log(sigma^2)") + plt.ylabel("Error") + plt.savefig(sys.argv[1] + ".pdf") + + +def three(x, y): + sigmas = exp(2) + mod = LogReg(sigmas=sigmas) + mod.fit(x, y) + print 1 - accuracy_score(y, mod.predict(x)) + print 1 - accuracy_score(y_test, mod.predict(x_test)) + print mod.w + + +def norm(x): + for i in [54, 55, 56]: + x[:, i] = np.log(1 + x[:, i]) + return x + +if __name__ == "__main__": + data_train = np.loadtxt("spam.train.dat") + x_train = data_train[:, :-1] + y_train = data_train[:, -1].astype(int) + data_test = np.loadtxt("spam.test.dat") + x_test = data_test[:, :-1] + y_test = data_test[:, -1].astype(int) + #x_train = preprocessing.scale(x_train) + #x_test = preprocessing.scale(x_test) + #x_train = norm(x_train) + #x_test = norm(x_test) + cross_validation(x_train, y_train) + #plot_scores() + #three(x_train, y_train) |
