path: root/hw2/p3.py
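
"""hw2/p3: logistic regression with a Gaussian prior on the weights (MAP
estimation), evaluated on the spam train/test data, with 10-fold
cross-validation over the prior variance."""
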
import sys
from math import exp

import numpy as np
import matplotlib.pyplot as plt
import seaborn
from scipy.special import expit
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn import preprocessing

seaborn.set_style("white")


class LogReg(BaseEstimator, ClassifierMixin):
    """Logistic regression with a Gaussian prior N(0, sigmas * I) on the
    weights; `sigmas` is the prior variance sigma^2."""

    def __init__(self, sigmas=1.):
        self.sigmas = sigmas

    def fit(self, x, y):
        w = np.zeros(x.shape[1])

        def aux(w):
            # Returns (objective, gradient), as expected with jac=True.
            return l_gradient(w, x, y, self.sigmas)

        self.w = minimize(aux, w, jac=True, method='L-BFGS-B',
                          options={'disp': False, 'gtol': 1e-10}).x
        return self

    def predict(self, x):
        # sigmoid(x . w) > 1/2 is equivalent to x . w > 0.
        pred = np.dot(x, self.w)
        return (pred > 0).astype(int)

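# A minimal usage sketch (illustrative only; `x` is any feature matrix and
# `y` the matching 0/1 label vector):
#
#     mod = LogReg(sigmas=1.0).fit(x, y)
#     acc = accuracy_score(y, mod.predict(x))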

def l(w, x, y, sigmas):
    """Penalized negative log-likelihood. Since -log(sigmoid(z)) + (1 - y) * z
    = log(1 + exp(z)) - y * z, this is the usual logistic loss plus the
    Gaussian-prior penalty ||w||^2 / (2 * sigmas)."""
    x2 = np.dot(x, w)
    # np.logaddexp(0, -x2) is -log(sigmoid(x2)) computed without underflow.
    return (np.logaddexp(0, -x2).sum() + np.inner(1 - y, x2)
            + 1 / (2 * sigmas) * np.inner(w, w))


def l_gradient(w, x, y, sigmas):
    """The objective of l() together with its gradient, for jac=True."""
    x2 = np.dot(x, w)
    val = (np.logaddexp(0, -x2).sum() + np.inner(x2, 1 - y)
           + 1 / (2 * sigmas) * np.inner(w, w))
    # d/dw[-log sigmoid(x . w)] = -x * sigmoid(-x . w); the prior adds w / sigmas.
    grad = (np.sum(-x * expit(-x2)[:, None], axis=0)
            + np.sum(x * (1 - y)[:, None], axis=0) + 1 / sigmas * w)
    return val, grad


def test_gradient(w, x, y, sigmas):
    """Compare the analytic gradient against central finite differences."""
    eps = 1e-6  # 1e-10 is below float64 resolution for central differences
    _, g = l_gradient(w, x, y, sigmas)
    for i in range(len(g)):
        z = np.zeros(len(g))
        z[i] = eps
        l1 = l(w + z, x, y, sigmas)
        z[i] = -eps
        l2 = l(w + z, x, y, sigmas)
        print((l1 - l2) / (2 * eps), g[i])

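# Illustrative check on random data (hypothetical shapes; any small problem
# works):
#
#     rng = np.random.RandomState(0)
#     test_gradient(rng.randn(3), rng.randn(5, 3), rng.randint(0, 2, 5), 1.0)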

def cross_validation(x, y):
    # For each prior variance, print mean accuracy on the held-out CV folds
    # and on the test set (x_test, y_test are module-level globals).
    for sigmas in [10 ** i for i in range(-8, 9)]:
        cv_scores = []
        test_scores = []
        for train, test in KFold(n_splits=10, shuffle=True).split(x):
            mod = LogReg(sigmas=sigmas).fit(x[train], y[train])
            cv_scores.append(accuracy_score(y[test], mod.predict(x[test])))
            test_scores.append(accuracy_score(y_test, mod.predict(x_test)))
        print(np.mean(cv_scores), np.mean(test_scores))

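# Roughly the same CV scores can come from scikit-learn directly (a sketch,
# assuming a version where model_selection provides cross_val_score; note it
# defaults to stratified folds for classifiers):
#
#     from sklearn.model_selection import cross_val_score
#     scores = cross_val_score(LogReg(sigmas=1.0), x, y, cv=10)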

def plot_scores():
    # Read "cv_score test_score" pairs (one line per sigma, produced by
    # cross_validation) from the file named on the command line.
    plt.figure(figsize=(9, 6))
    with open(sys.argv[1]) as f:
        values = [[float(v) for v in line.split()] for line in f]
    y1, y2 = zip(*values)
    x = range(-8, 9)
    plt.plot(x, 1 - np.array(y1), label='CV error')
    plt.plot(x, 1 - np.array(y2), label='Test error')
    plt.legend()
    plt.xlabel("log10(sigma^2)")
    plt.ylabel("Error")
    plt.savefig(sys.argv[1] + ".pdf")


def three(x, y):
    # Fit with sigma^2 = e^2 and report training error, test error, and the
    # fitted weights (x_test, y_test are module-level globals).
    mod = LogReg(sigmas=exp(2))
    mod.fit(x, y)
    print(1 - accuracy_score(y, mod.predict(x)))
    print(1 - accuracy_score(y_test, mod.predict(x_test)))
    print(mod.w)


def norm(x):
    # Log-transform the heavy-tailed capital-run-length columns (the last
    # three Spambase features); np.log1p(v) = log(1 + v).
    for i in [54, 55, 56]:
        x[:, i] = np.log1p(x[:, i])
    return x

if __name__ == "__main__":
    data_train = np.loadtxt("spam.train.dat")
    x_train = data_train[:, :-1]
    y_train = data_train[:, -1].astype(int)
    data_test = np.loadtxt("spam.test.dat")
    x_test = data_test[:, :-1]
    y_test = data_test[:, -1].astype(int)
    # Preprocessing variants left in for experimentation:
    #x_train = preprocessing.scale(x_train)
    #x_test = preprocessing.scale(x_test)
    #x_train = norm(x_train)
    #x_test = norm(x_test)
    cross_validation(x_train, y_train)
    #plot_scores()
    #three(x_train, y_train)