path: root/hw2/p3.py
author    Thibaut Horel <thibaut.horel@gmail.com>    2015-10-08 22:39:22 -0400
committer Thibaut Horel <thibaut.horel@gmail.com>    2015-10-08 22:39:22 -0400
commit    c1e4e0e41442c087284f34d7585cf9f551e52c2f (patch)
tree      5154724e4f018128edaffc3b58392bf22188d613 /hw2/p3.py
parent    0f9fc1bdb1755f5677e3201bcd35706e09235844 (diff)
download  cs281-c1e4e0e41442c087284f34d7585cf9f551e52c2f.tar.gz
[hw2]
Diffstat (limited to 'hw2/p3.py')
-rw-r--r--  hw2/p3.py  151
1 file changed, 151 insertions, 0 deletions
diff --git a/hw2/p3.py b/hw2/p3.py
new file mode 100644
index 0000000..f4a3ffa
--- /dev/null
+++ b/hw2/p3.py
@@ -0,0 +1,151 @@
+import numpy as np
+from scipy.special import expit
+from scipy.optimize import minimize
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.metrics import accuracy_score
+from sklearn.cross_validation import KFold
+from sklearn import preprocessing
+import seaborn
+import matplotlib.pyplot as plt
+from math import exp
+import sys
+
+seaborn.set_style("white")
+
+
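+# L2-regularized logistic regression fit by MAP estimation: `sigmas` is
+# the variance sigma^2 of the Gaussian prior on the weights, so larger
+# values mean weaker regularization.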
+class LogReg(BaseEstimator, ClassifierMixin):
+
+    def __init__(self, sigmas=1.):
+        self.sigmas = sigmas
+
+    def fit(self, x, y):
+        w = np.zeros(x.shape[1])
+
+        def aux(w):
+            return l_gradient(w, x, y, self.sigmas)
+
+        # jac=True tells scipy that aux returns the pair (loss, gradient).
+        self.w = minimize(aux, w, jac=True, method='L-BFGS-B',
+                          options={'disp': False, 'gtol': 1e-10}).x
+        return self
+
+    def predict(self, x):
+        # A positive margin w.x corresponds to expit(w.x) > 1/2.
+        pred = np.dot(x, self.w)
+        return (pred > 0).astype(int)
+
+
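+# Penalized negative log-likelihood for labels y in {0, 1}:
+#   l(w) = sum_i log(1 + exp(-w.x_i)) + sum_i (1 - y_i) w.x_i
+#          + |w|^2 / (2 * sigmas)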
+def l(w, x, y, sigmas):
+    x2 = np.dot(x, w)
+    # np.logaddexp(0, -z) equals -log(expit(z)) but does not underflow
+    # to log(0) = -inf when z is very negative.
+    loss = (np.logaddexp(0, -x2).sum() + np.inner(1 - y, x2)
+            + 1 / (2 * sigmas) * np.inner(w, w))
+    return loss
+
+
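+# The same loss together with its gradient,
+#   grad l(w) = -sum_i expit(-w.x_i) x_i + sum_i (1 - y_i) x_i + w / sigmas,
+# in the (loss, grad) form expected by minimize(..., jac=True).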
+def l_gradient(w, x, y, sigmas):
+    x2 = np.dot(x, w)
+    loss = (np.logaddexp(0, -x2).sum() + np.inner(1 - y, x2)
+            + 1 / (2 * sigmas) * np.inner(w, w))
+    grad = (np.sum(-x * expit(-x2)[:, None], axis=0)
+            + np.sum(x * (1 - y)[:, None], axis=0) + 1 / sigmas * w)
+    return loss, grad
+
+
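+# Sanity check: compare the analytic gradient with a central finite
+# difference of l, one coordinate at a time.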
+def test_gradient(w, x, y, sigmas):
+    # 1e-10 is below float64 resolution for a central difference; 1e-6
+    # balances truncation against cancellation error.
+    eps = 1e-6
+    _, g = l_gradient(w, x, y, sigmas)
+    for i in xrange(len(g)):
+        z = np.zeros(len(g))
+        z[i] = eps
+        l1 = l(w + z, x, y, sigmas)
+        z[i] = -eps
+        l2 = l(w + z, x, y, sigmas)
+        print (l1 - l2) / (2 * eps), g[i]
+
+
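+# 10-fold cross-validation over a logarithmic grid of prior variances
+# sigma^2 = 10^-8, ..., 10^8; prints the mean validation accuracy and the
+# mean test accuracy for each value (x_test, y_test are module globals).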
+def cross_validation(x, y):
+    for sigmas in [10 ** i for i in xrange(-8, 9)]:
+        cv_scores = []
+        test_scores = []
+        for train, test in KFold(n=x.shape[0], n_folds=10, shuffle=True):
+            mod = LogReg(sigmas=sigmas).fit(x[train], y[train])
+            # Score on the held-out fold (the CV score, not the training
+            # score) and on the full test set.
+            cv_scores.append(accuracy_score(y[test], mod.predict(x[test])))
+            test_scores.append(accuracy_score(y_test, mod.predict(x_test)))
+        print np.mean(cv_scores), np.mean(test_scores)
+
+
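+# Plot CV and test error against log10(sigma^2), read back from a file
+# (argv[1]) holding the two columns printed by cross_validation.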
+def plot_scores():
+    plt.figure(figsize=(9, 6))
+    values = [map(float, line.strip().split())
+              for line in open(sys.argv[1])]
+    y1, y2 = zip(*values)
+    x = xrange(-8, 9)
+    # The file holds accuracies; plot errors instead.
+    y1 = 1 - np.array(y1)
+    y2 = 1 - np.array(y2)
+    plt.plot(x, y1, label='CV error')
+    plt.plot(x, y2, label='Test error')
+    plt.legend()
+    plt.xlabel("log10(sigma^2)")
+    plt.ylabel("Error")
+    plt.savefig(sys.argv[1] + ".pdf")
+
+
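+# Refit on the full training set at a fixed prior variance and report
+# the training error, the test error and the learned weights.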
+def three(x, y):
+    sigmas = exp(2)
+    mod = LogReg(sigmas=sigmas)
+    mod.fit(x, y)
+    print 1 - accuracy_score(y, mod.predict(x))
+    print 1 - accuracy_score(y_test, mod.predict(x_test))
+    print mod.w
+
+
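+# Log-transform the heavy-tailed columns (likely the capital-run-length
+# features of the spambase data) to bring them onto a scale comparable
+# to the word- and character-frequency features.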
+def norm(x):
+    for i in [54, 55, 56]:
+        x[:, i] = np.log(1 + x[:, i])
+    return x
+
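+# Load the spam data (last column of each row is the 0/1 label), then run
+# one of the experiments; the commented-out lines toggle the preprocessing
+# variants and the other entry points.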
+if __name__ == "__main__":
+    data_train = np.loadtxt("spam.train.dat")
+    x_train = data_train[:, :-1]
+    y_train = data_train[:, -1].astype(int)
+    data_test = np.loadtxt("spam.test.dat")
+    x_test = data_test[:, :-1]
+    y_test = data_test[:, -1].astype(int)
+    #x_train = preprocessing.scale(x_train)
+    #x_test = preprocessing.scale(x_test)
+    #x_train = norm(x_train)
+    #x_test = norm(x_test)
+    cross_validation(x_train, y_train)
+    #plot_scores()
+    #three(x_train, y_train)