From c1e4e0e41442c087284f34d7585cf9f551e52c2f Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Thu, 8 Oct 2015 22:39:22 -0400 Subject: [hw2] --- hw2/harvardml.cls | 97 ++++++++++++++++ hw2/main.tex | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw2/p1.py | 37 +++++++ hw2/p2.py | 102 +++++++++++++++++ hw2/p3.py | 125 +++++++++++++++++++++ hw2/w.txt.pdf | 16 +++ 6 files changed, 703 insertions(+) create mode 100644 hw2/harvardml.cls create mode 100644 hw2/main.tex create mode 100644 hw2/p1.py create mode 100644 hw2/p2.py create mode 100644 hw2/p3.py create mode 100644 hw2/w.txt.pdf diff --git a/hw2/harvardml.cls b/hw2/harvardml.cls new file mode 100644 index 0000000..e285173 --- /dev/null +++ b/hw2/harvardml.cls @@ -0,0 +1,97 @@ +% Ryan Adams +% School of Engineering and Applied Sciences +% Harvard University +% v0.01, 31 August 2013 +% Based on HMC Math Dept. template by Eric J. Malm. +\NeedsTeXFormat{LaTeX2e}[1995/01/01] +\ProvidesClass{harvardml} +[2013/08/31 v0.01 Harvard ML Assignment Class] + +\RequirePackage{ifpdf} + +\newif\ifhmlset@submit +\DeclareOption{submit}{% + \hmlset@submittrue% +} +\DeclareOption{nosubmit}{% + \hmlset@submitfalse% +} + +\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} +\ExecuteOptions{nosubmit} +\ProcessOptions\relax + +\LoadClass[10pt,letterpaper]{article} + +\newif\ifhmlset@header + +\hmlset@headertrue + +\RequirePackage{mathpazo} +\RequirePackage{palatino} +\RequirePackage{amsmath} +\RequirePackage{amssymb} +\RequirePackage{amsthm} +\RequirePackage{fullpage} +\RequirePackage{mdframed} + +\newtheoremstyle{hmlplain} + {3pt}% Space above + {3pt}% Space below + {}% Body font + {}% Indent amount + {\bfseries}% Theorem head font + {\\*[3pt]}% Punctuation after theorem head + {.5em}% Space after theorem head + {}% Theorem head spec (can be left empty, meaning `normal') + +\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip} + +\newcommand{\headerblock}{% + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushleft} + \ifhmlset@submit + \mbox{\hmlset@name}\\ + \mbox{\tt \hmlset@email}\\ + \mbox{\hmlset@course} + \fi + \end{flushleft} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{center} + \mbox{\Large\hmlset@assignment}\protect\\ + Due: \hmlset@duedate + \end{center} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushright} + \ifhmlset@submit + Collaborators: \hmlset@collaborators + \fi + \end{flushright} + \end{minipage} + \vspace{0.1cm} + \titlebar +} + +\ifhmlset@header\AtBeginDocument{\headerblock}\fi + +\def\hmlset@name{} +\def\hmlset@email{} +\def\hmlset@course{} +\def\hmlset@assignment{} +\def\hmlset@duedate{} +\def\hmlset@collaborators{} +\def\hmlset@extraline{} + +% commands to set header block info +\newcommand{\name}[1]{\def\hmlset@name{#1}} +\newcommand{\email}[1]{\def\hmlset@email{#1}} +\newcommand{\course}[1]{\def\hmlset@course{#1}} +\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}} +\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}} +\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}} +\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}} + +\theoremstyle{hmlplain} +\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem} diff --git a/hw2/main.tex b/hw2/main.tex new file mode 100644 index 0000000..f4a1067 --- /dev/null +++ b/hw2/main.tex @@ -0,0 +1,326 @@ +% No 'submit' option for the problems by themselves. +\documentclass[submit]{harvardml} +% Use the 'submit' option when you submit your solutions. 
+%\documentclass[submit]{harvardml} + +% Put in your full name and email address. +\name{Thibaut Horel} +\email{thorel@seas.harvard.edu} +\usepackage[utf8x]{inputenc} + +% List any people you worked with. +\collaborators{% +} + +% You don't need to change these. +\course{CS281-F13} +\assignment{Assignment \#2} +\duedate{23:59pm October 7, 2015} + +\usepackage{graphicx} +% Useful macros. +\DeclareMathOperator*{\argmax}{argmax} +\DeclareMathOperator*{\argmin}{argmin} +\newcommand{\given}{\,|\,} +\newcommand{\prob}{\mathbb{P}} +\newcommand{\trans}{\mathsf{T}} +\newcommand{\bx}{\mathbf{x}} +\newcommand{\by}{\mathbf{y}} +\newcommand{\bw}{\mathbf{w}} +\newcommand{\distNorm}{\mathcal{N}} +\newcommand{\bzero}{\mathbf{0}} +\newcommand{\ident}{\mathbb{I}} + +\begin{document} + + +\begin{problem}[10pts] +One intuitive way to summarize a probability density is via the mode, +as this is the ``most likely'' value in some sense. A common example +of this is using the maximum \textit{a posteriori} (MAP) estimate of a +model's parameters. In high dimensions, however, the mode becomes +less and less representative of typical samples. Consider variates +from a~$D$-dimensional zero mean spherical Gaussian with unit +variance: +\begin{align*} + \bx &\sim \distNorm(\bzero_D, \ident_D), +\end{align*} +where~$\bzero_D$ indicates a column vector of~$D$ zeros and~$\ident_D$ +is a~${D\times D}$ identity matrix. +\begin{enumerate} + \item Compute the distribution that this implies over the distance + of these points from the origin. That is, compute the + distribution over~$\sqrt{\bx^\trans\bx}$, if~$\bx$ is a + realization from~$\distNorm(\bzero_D, \ident_D)$. (Hint: Consider + transformations of a Gamma distribution.) + \item Make a plot that shows this probability density function for + several different values of~$D$, up to ~${D=100}$. + + \item Make a plot of the cumulative distribution function (CDF) over + this distance distribution for~${D=100}$. A closed-form solution + may be difficult to compute, so you can do this numerically.) + + \item From examining the CDF we can think about where most of the + mass lives as a function of radius. For example, most of the mass + for~${D=100}$ is within a thin spherical shell. From eyeballing + the plot, what are the inner and outer radii for the shell that + contains 90\% of the mass in this case? +\end{enumerate} +\end{problem} + +\paragraph{Solution.} The code for this problem can be found in file +\textsf{p1.py}. + +1. By definition of $\bx$, its coordinates are +independent univariate Gaussian variables of mean 0 and variance one. Hence, +$\bx^T\bx = \sum_{i=1}^D\bx_i^2$ is exactly a chi-squared distribution with $D$ +degrees of freedom. So for all $a, b$ we have: +\begin{align*} + p(a\leq \sqrt{\bx^T\bx}\leq b) + &= p(a^2\leq \bx^T\bx \leq b) + =\frac{1}{2^{D/2}\Gamma(D/2)}\int_{a^2}^{b^2}x^{D/2-1}e^{-x/2}dx\\ + &= \frac{1}{2^{D/2-1}\Gamma(D/2)} u^{D-1}e^{-u^2/2} +\end{align*} +where we used the formula of the chi-squared distribution for the second +equality and the change of variable $u=\sqrt{x}$ for the last equality. Since +this is true for all $a,b$, the distribution of the distance is given by: +\begin{displaymath} + p(u) = \frac{1}{2^{D/2-1}\Gamma(D/2)} u^{D-1}e^{-u^2/2} +\end{displaymath} + +2. The pdf for different values of $D$ can be found below: +\begin{center} +\includegraphics[width=0.8\textwidth]{dist1.pdf} +\end{center} + +3. The cdf plot is below for $D=100$ +\begin{center} +\includegraphics[width=0.8\textwidth]{cumul.pdf} +\end{center} + +4. 
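+As a quick numerical cross-check of the estimate read off the CDF plot (a
+sketch that assumes SciPy's built-in \textsf{chi} distribution rather than
+the hand-rolled density in \textsf{p1.py}), the 5\% and 95\% quantiles of
+the distance distribution can be queried directly:
+\begin{verbatim}
+from scipy.stats import chi
+
+# ||x|| for x ~ N(0, I_D) follows a chi distribution with D degrees of
+# freedom, so a central 90% interval is bounded by these two quantiles.
+inner, outer = chi.ppf([0.05, 0.95], df=100)
+print inner, outer  # roughly 8.8 and 11.2
+\end{verbatim}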
Eyeballing the plot, 90\% is in the shell whose inner and outer radii are + $9$ and $11$ respectively. + +\begin{problem}[Bayesian Linear Regression, 30pts] +Here are some one-dimensional data to regress: +\begin{verbatim} +x = [-1.87 -1.76 -1.67 -1.22 -0.07 0.11 0.67 1.60 2.22 2.51]' +y = [0.06 1.67 0.54 -1.45 -0.18 -0.67 0.92 2.95 5.13 5.18]' +\end{verbatim} +Construct a Bayesian linear regression model using Gaussian observation noise with fixed variance $\sigma^2$, and a basis of your choosing (e.g., polynomial, sinusoids, radial basis functions). +For the prior on the regression weights, use a normal distribution: +$$p(\bw) = \distNorm(\bw | \bw_0, \mathbf V_0)$$ + + +\begin{enumerate} + \item Write down: + \begin{enumerate} + \item your basis + \item the posterior distribution on $\bw$ given the data: $p(\bw | \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$. + \item the posterior predictive distribution $p(y^\star | x^\star, \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$ of a new function value $y^\star = f(x^\star)$ obtained by integrating out $\bw$. + \item The marginal likelihood, $p(\by | \bx, \bw_0,\boldsymbol V_0, \sigma^2)$ obtained by integrating out the regression weights. + \end{enumerate} + \item Choose values of $\bw_0, \mathbf V_0$ and $\sigma^2$ that seem sensible for the regression weight prior and the noise. + Plot several random functions drawn from your prior. + + \item Plot the data, as well as several typical posterior samples of the function given the data. + + \item Plot the 95\% central credible interval region of the + predictive density as a function of~$x$. That is, produce a plot + that shows the ``tube'' containing most of the functions that are + consistent with the data under your model. + + \item Try varying the basis functions used. For example, you could change the + order of polynomial, or how many radial basis functions to put + down. Produce a bar plot of the marginal likelihood for several of these hypotheses. + Which hypotheses are supported by the data? + \end{enumerate} +\end{problem} + +\paragraph{Solution.} 1. (a) I chose the Gaussian radial basis functions as my +basis: +\begin{displaymath} + \phi_j(x) = \exp\left(\frac{-(x-\mu_j)^2}{2\epsilon^2}\right) +\end{displaymath} +For the plots I chose 10 basis functions whose $\mu_j$ were evenly spread over +the interval $[-2, 3]$ and $\epsilon = 1$. + +(b) Let us denote by $\Phi(\bx)$ the matrix of the observations after applying +the basis functions to them. That is, the $i$th row of $\Phi(\bx)$ is $\phi(x) += [\phi_1(x_i),\ldots,\phi_N(x_i)]$ where $N$ is the number of basis functions. +Then the posterior is given by: +\begin{displaymath} + p(\bw \given \by) \propto \distNorm(\Phi(\bx)\bw, \sigma^2Id)\distNorm(\bw_0, + \mathbf V_0) +\end{displaymath} +Using standard results for Gaussian variables (e.g. 
Theorem 4.4.1 from Murphy)
+we get that $p(\bw \given \bx, \by) \sim \distNorm(\mu, \Sigma)$ with:
+\begin{displaymath}
+  \Sigma^{-1} = V_0^{-1} + \frac{1}{\sigma^2}\Phi(\bx)^T\Phi(\bx)
+  \quad\text{and}\quad \mu = \Sigma V_0^{-1}\bw_0 +
+\frac{1}{\sigma^2}\Sigma\Phi(\bx)^Ty
+\end{displaymath}
+
+(c) Again using Theorem 4.4.1 we compute the posterior predictive:
+\begin{displaymath}
+  p(y^*\given x^*) \sim \distNorm\left(\phi(x^*)^T\mu, \sigma^2 + \phi(x^*)^T
+  \Sigma \phi(x^*)\right)
+\end{displaymath}
+
+(d) And the marginal likelihood:
+\begin{displaymath}
+  p(\by\given \bx)
+  = \distNorm(\Phi(\bx)\bw_0, \sigma^2Id + \Phi(\bx)V_0\Phi(\bx)^T)
+\end{displaymath}
+I am using a slight abuse of notation here: in (c) I meant that the posterior
+predictive is a normal distribution whose parameters were indicated as the
+arguments of the $\distNorm$ function. Here I mean that the marginal likelihood
+is computed by evaluating the density of the normal distribution whose
+parameters are indicated at the point $\by$.
+
+2. I chose $w_0$ to be the zero vector, $V_0$ to be $Id$ and $\sigma^2$ to be
+   1. Several functions drawn from the prior are shown below:
+\begin{center}
+\includegraphics[width=0.7\textwidth]{prior.pdf}
+\end{center}
+
+3. The data and several functions drawn from the posterior are shown below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{posterior.pdf}
+\end{center}
+
+4. The data, the mean of the posterior predictive, and the 95\% interval of the
+posterior predictive (blue-grey tube) can be found below. For any point $x$,
+the posterior predictive distribution at this point is a univariate Gaussian
+whose mean and variance were computed in part 1. The 95\% interval is computed
+as the interval of width $2\times 1.96$ standard deviations centered at the
+mean.
+\begin{center}
+\includegraphics[width=0.8\textwidth]{predictive.pdf}
+\end{center}
+
+5. The plot below shows the marginal likelihood as a function of the number of
+   basis functions in my RBF basis.
+\begin{center}
+\includegraphics[width=0.8\textwidth]{ml.pdf}
+\end{center}
+It seems that a basis of about 25 functions is a reasonable hypothesis.
+
+
+
+\begin{problem}[30pts, Hastie et al., Murphy]
+In this problem, we'll apply logistic regression to a data set of spam
+email. These data consist of 4601 email messages, from which 57
+features have been extracted. These are as follows:
+\begin{itemize}
+  \item $48$ features in $[0,100]$, giving the percentage of words in
+    a given message which match a given word on a list containing,
+    e.g., ``business'', ``free'', etc.
+  \item $6$ features in $[0,100]$, giving the percentage of characters
+    in the email that match characters on a list containing, e.g.,
+    ``\$'', ``\#'', etc.
+  \item Feature 55: The average length of an uninterrupted sequence of
+    capital letters.
+  \item Feature 56: The length of the longest uninterrupted sequence
+    of capital letters.
+  \item Feature 57: The sum of the lengths of uninterrupted sequences
+    of capital letters.
+\end{itemize}
+There are files \texttt{spam.train.dat} and \texttt{spam.test.dat}
+(available on the course website) in which each row is an email.
+There are 3000 training and 1601 test examples. The final column in
+each file indicates whether the email was spam.
+
+\begin{enumerate}
+  \item Code up~$\ell_2$-regularized logistic regression.
+ That is, code up a model of the form: + \begin{align} + p(\mathbf{w}) & = \mathcal{N}(0, \sigma^2 I) \\ + p(y_i = 1 | \mathbf{x}, \mathbf{w}) & = \frac{1}{1 + \exp(-\mathbf{w}^T \mathbf{x})} + \end{align} + + and use it to compute the MAP estimate of $\mathbf{w}$, given the training examples. + Compute the gradients of the posterior probability of the data with respect to $\mathbf{w}$, and check that they're correct numerically. + Show the code you used for computing the derivatives. + + \item Use + 10-fold cross-validation on the training set to determine an appropriate regularization + penalty $\sigma^2$. Plot different values of $\log_e \sigma^2$ ranging from -8 to 8 against the cross-validation error and test error. Which is the best? + \item There are different ways one might preprocess the data. One + typical thing to do is to linearly ``standardize'' each input feature so + that it has mean zero and variance one. Do this standardization + and evaluate the model again. Report the discovered $\bw$, and the training and test error after performing cross-validation as in part 2. + \item It may also be the case that some dimensions are better represented a binary variables, or by taking the logarithm, than the given values. + Hypothesize whether any of the variables in this data sets might be better coded as binary (or using their log), and explain your reasoning. + Then do the transform and apply the same cross-validation as before. Do your results improve? Conjecture why or why not. +\end{enumerate} + +\end{problem} + +\paragraph{Solution.} The code for this problem can be found in \textsf{p3.py} + +1. The MAP estimate of $w$ is given by: +\begin{displaymath} + w^* = \argmax_{w} \sum_{i=1}^n \log p(x_i, y_i \given w) + \log p(w) +\end{displaymath} +For $\ell$-2 regularized logistic regression: +\begin{align*} + \log p(x_i, y_i\given w) &= - y_i\log\left(1+\exp(-w^Tx_i)\right) + + (1- y_i)\log\left(\frac{\exp(-w^Tx_i)}{1+\exp(-w^Tx_i)}\right)\\ + &= -(1-y_i) w^Tx_i -\log\left(1+\exp(-w^T x_i)\right) +\end{align*} +and: +\begin{displaymath} + \log p(w) = -\frac{1}{2\sigma^2}\|w\|^2 + C +\end{displaymath} +where $C$ is the normalization constant which does not depend on $w$. So the +MAP optimization problem can be written like this: +\begin{displaymath} + w^* = \argmin_w \sum_{i=1}^n \log\left(1+\exp(-w^Tx_i)\right) + + \sum_{i=1}^n(1-y_i)w^Tx_i + \frac{1}{2\sigma^2}\|w\|^2 +\end{displaymath} +Denoting by $L(w)$ the objective function of the above optimization problem. +The gradient is given by: +\begin{displaymath} + \nabla L(w) = \sum_{i=1}^n \frac{-x_i}{1+\exp(w^Tx_i)} + + \sum_{i=1}^n (1-y_i)x_i + \frac{1}{\sigma^2}w +\end{displaymath} +The code to compute $L$ and its gradient and to test the gradient can be found +in \textsf{p3.py}. + +2. The plot of the cross-validation error and test error as a function of $\log + \sigma^2$ can be found below: +\begin{center} +\includegraphics[width=0.8\textwidth]{scores_txt.pdf} +\end{center} +I used fraction of misclassified samples are a measure of error. Strangely, +the plot doesn't exhibit the standard ``sweetspot behavior''. The test error +should be increasing for large values of $\sigma^2$ because at this point we +are simply over-fitting the data, but I haven't been able to understand why +I am not observing it. In practice, I would use $\log\sigma^2 = -2$. + +3. 
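+Each feature is first standardized to have zero mean and unit variance.
+A minimal sketch of this preprocessing step is given below; it assumes
+scikit-learn's \textsf{StandardScaler} (the commented-out lines in
+\textsf{p3.py} instead call \textsf{preprocessing.scale} on each set
+separately) and fits the scaling statistics on the training set before
+applying the same transform to both sets:
+\begin{verbatim}
+from sklearn import preprocessing
+
+# Learn per-feature mean and standard deviation on the training data,
+# then apply that same affine transform to the training and test sets.
+scaler = preprocessing.StandardScaler().fit(x_train)
+x_train = scaler.transform(x_train)
+x_test = scaler.transform(x_test)
+\end{verbatim}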
The cross-validation plot for the normalized data can be found below: +\begin{center} +\includegraphics[width=0.8\textwidth]{scores2_txt.pdf} +\end{center} +The error is much smaller in this case. For $\log\sigma^2 = 2$, I obtained +a cross-validation error of 0.073 and a test error of 0.086. The value of $w$ +can be found in file \textsf{w.txt.pdf}. + +4. I believe feature 55, 56 and 57 would be better represented by taking their + logarithms. My reasoning is the following: if for example an email is + written entirely in capital letters the variable will simply be equal to the + length of the email. But two emails of different lengths written entirely in + capital should be considered equally bad. In other words, as long as there + are many capital letters, whether it is 100 or 1000, it seems to me that the + emails are almost equally likely to be spam. Taking the logarithm will favor + the order of magnitude rather than the actual count. + + The plot of the CV and test error after this normalization can be found + here: +\begin{center} +\includegraphics[width=0.8\textwidth]{scores3_txt.pdf} +\end{center} +Despite the normalization the error stays the same. + +\end{document} diff --git a/hw2/p1.py b/hw2/p1.py new file mode 100644 index 0000000..682abc1 --- /dev/null +++ b/hw2/p1.py @@ -0,0 +1,37 @@ +from scipy.special import gamma +import numpy as np +from math import exp +import matplotlib.pyplot as plt +from scipy.stats import rv_continuous + + +def pdf(u, D): + return 1 / (2 ** ((D / 2.) - 1.) + * gamma(D / 2.)) * u ** (D - 1.) * exp(-u ** 2. / 2.) + + +class my(rv_continuous): + + def _pdf(self, u, D): + return pdf(u, D) + + +def dist(): + a = np.linspace(0, 20, 1000) + plt.figure(figsize=(9, 6)) + for D in [1, 2, 5, 10, 20, 50, 100]: + b = [pdf(u, D) for u in a] + plt.plot(a, b, label="D=" + str(D)) + plt.legend() + plt.savefig("dist1.pdf") + + +def cdf(): + a = my(a=0, b=float("inf")) + px = np.linspace(5, 15, 100) + plt.figure(figsize=(9, 6)) + plt.plot(px, a.cdf(px, 100)) + plt.savefig("cumul.pdf") + +if __name__ == "__main__": + cdf() diff --git a/hw2/p2.py b/hw2/p2.py new file mode 100644 index 0000000..1659112 --- /dev/null +++ b/hw2/p2.py @@ -0,0 +1,102 @@ +import numpy as np +import seaborn +import matplotlib.pyplot as plt +from math import sqrt +from scipy.stats import multivariate_normal + +seaborn.set_style("white") + +x = np.array([-1.87, -1.76, -1.67, -1.22, -0.07, 0.11, 0.67, 1.60, 2.22, 2.51]) +y = np.array([0.06, 1.67, 0.54, -1.45, -0.18, -0.67, 0.92, 2.95, 5.13, 5.18]) + +D = 20 +eps = 1 +base = np.linspace(-2, 3, num=D) + + +def map_features(x, base=base): + x = x.reshape(len(x), 1) + phi = np.exp(-(x - base) ** 2 / (2 * eps ** 2)) + phi = np.concatenate((phi, np.ones((phi.shape[0], 1))), 1) + return phi + + +def draw_prior(x): + phi = map_features(x) + D = phi.shape[1] + w = np.random.normal(0, 1, D) + return np.dot(phi, w) + + +def posterior(): + phi = map_features(x) + D = phi.shape[1] + invsigma = np.identity(D) + np.dot(phi.T, phi) + sigma = np.linalg.inv(invsigma) + mu = np.dot(sigma, np.dot(phi.T, y)) + return mu, sigma + + +def plot_posteriors(): + plt.figure(figsize=(9, 6)) + mu, sigma = posterior() + px = np.linspace(-2, 3, num=200) + phi = map_features(px) + plt.plot(x, y, "o", label="data") + for i in xrange(5): + w = np.random.multivariate_normal(mu, sigma) + plt.plot(px, np.dot(phi, w), label="post. 
" + str(i)) + plt.legend(loc="lower right") + plt.savefig("posterior.pdf") + + +def plot_tube(): + fig = plt.figure(figsize=(9, 6)) + mu, sigma = posterior() + px = np.linspace(-2, 3, num=200) + phi = map_features(px) + plt.plot(px, np.dot(phi, mu), label="post. pred. mean") + ax = fig.get_axes()[0] + inf = [np.dot(row, mu) - 1.96 * sqrt(1 + np.dot(row, np.dot(sigma, row))) + for row in phi] + sup = [np.dot(row, mu) + 1.96 * sqrt(1 + np.dot(row, np.dot(sigma, row))) + for row in phi] + ax.fill_between(px, inf, sup, alpha=0.2) + plt.plot(x, y, "o", label="data") + plt.legend(loc="lower right") + plt.savefig("predictive.pdf") + + +def plot_priors(): + plt.figure(figsize=(9, 6)) + px = np.linspace(-2, 3, num=200) + for i in xrange(5): + plt.plot(px, draw_prior(px), label="prior " + str(i)) + plt.legend(loc="lower right") + plt.savefig("prior.pdf") + + +def marginal_likelihood(): + eps = 1 + ml = [] + px = np.linspace(1, 50, num=10) + for D in px: + base = np.linspace(-2, 3, num=D) + phi = map_features(x, base) + z = np.zeros(phi.shape[0]) + rv = multivariate_normal(mean=z, cov=np.identity(phi.shape[0]) + + np.dot(phi, phi.T)) + ml.append(rv.pdf(y)) + plt.figure(figsize=(9, 6)) + plt.bar(px, ml) + plt.savefig("ml.pdf") + + + +if __name__ == "__main__": + plot_priors() + plot_posteriors() + plot_tube() + marginal_likelihood() +#plt.plot(x, y, "o") +#plt.show() diff --git a/hw2/p3.py b/hw2/p3.py new file mode 100644 index 0000000..f4a3ffa --- /dev/null +++ b/hw2/p3.py @@ -0,0 +1,125 @@ +import numpy as np +from scipy.special import expit +from scipy.optimize import minimize +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score +from sklearn.cross_validation import KFold +from sklearn import preprocessing +import seaborn +import matplotlib.pyplot as plt +from math import exp +import sys + +seaborn.set_style("white") + + +class LogReg(BaseEstimator, ClassifierMixin): + + def __init__(self, sigmas=1.): + self.sigmas = sigmas + + def fit(self, x, y): + w = np.zeros(x.shape[1]) + + def aux(w): + return l_gradient(w, x, y, self.sigmas) + + self.w = minimize(aux, w, jac=True, method='L-BFGS-B', + options={'disp': False, 'gtol': 1e-10}).x + return self + + def predict(self, x): + pred = np.dot(x, self.w) + return (pred > 0).astype(int) + + +def l(w, x, y, sigmas): + x2 = np.dot(x, w) + l = (-np.log(expit(x2)).sum() + np.inner(1-y, x2) + + 1 / (2 * sigmas) * np.inner(w, w)) + return l + + +def l_gradient(w, x, y, sigmas): + x2 = np.dot(x, w) + l = (-np.log(expit(x2)).sum() + np.inner(x2, (1 - y)) + + 1 / (2 * sigmas) * np.inner(w, w)) + grad = (np.sum(-x * expit(-x2)[:, None], axis=0) + + np.sum(x * (1 - y)[:, None], axis=0) + 1 / sigmas * w) + return l, grad + + +def test_gradient(w, x, y, sigmas): + eps = 1e-10 + _, g = l_gradient(w, x, y, sigmas) + for i in xrange(len(g)): + z = np.zeros(len(g)) + z[i] = eps + l1 = l(w + z, x, y, sigmas) + z = np.zeros(len(g)) + z[i] = -eps + l2 = l(w + z, x, y, sigmas) + print (l1 - l2) / (2 * eps), g[i] + + +def cross_validation(x, y): + for sigmas in [10 ** i for i in xrange(-8, 9)]: + train_scores = [] + test_scores = [] + for train, test in KFold(n=x.shape[0], n_folds=10, shuffle=True): + mod = LogReg(sigmas=sigmas) + x_train_f = x[train] + y_train_f = y[train] + x_test_f = x[test] + y_test_f = y[test] + mod = mod.fit(x_train_f, y_train_f) + train_scores.append(accuracy_score(y_test_f, + mod.predict(x_test_f))) + test_scores.append(accuracy_score(y_test, mod.predict(x_test))) + print np.mean(train_scores), 
np.mean(test_scores) + + +def plot_scores(): + plt.figure(figsize=(9, 6)) + values = [map(float, line.strip().split()) + for line in open(sys.argv[1])] + y1, y2 = zip(*values) + x = xrange(-8, 9) + y1 = 1 - np.array(y1) + y2 = 1 - np.array(y2) + plt.plot(x, y1, label='CV error') + plt.plot(x, y2, label='Test error') + plt.legend() + plt.xlabel("log(sigma^2)") + plt.ylabel("Error") + plt.savefig(sys.argv[1] + ".pdf") + + +def three(x, y): + sigmas = exp(2) + mod = LogReg(sigmas=sigmas) + mod.fit(x, y) + print 1 - accuracy_score(y, mod.predict(x)) + print 1 - accuracy_score(y_test, mod.predict(x_test)) + print mod.w + + +def norm(x): + for i in [54, 55, 56]: + x[:, i] = np.log(1 + x[:, i]) + return x + +if __name__ == "__main__": + data_train = np.loadtxt("spam.train.dat") + x_train = data_train[:, :-1] + y_train = data_train[:, -1].astype(int) + data_test = np.loadtxt("spam.test.dat") + x_test = data_test[:, :-1] + y_test = data_test[:, -1].astype(int) + #x_train = preprocessing.scale(x_train) + #x_test = preprocessing.scale(x_test) + #x_train = norm(x_train) + #x_test = norm(x_test) + cross_validation(x_train, y_train) + #plot_scores() + #three(x_train, y_train) diff --git a/hw2/w.txt.pdf b/hw2/w.txt.pdf new file mode 100644 index 0000000..10e02fa --- /dev/null +++ b/hw2/w.txt.pdf @@ -0,0 +1,16 @@ +[ -1.16313019e-01 -1.86665945e-01 6.97073481e-02 1.08220421e+01 + 3.79967800e-01 2.55576319e-01 8.97469186e-01 2.24521456e-01 + 1.96371840e-01 9.65848716e-03 -1.84768134e-03 -2.10674040e-02 + 7.25815774e-03 -1.73726085e-03 3.78833303e-01 7.27626576e-01 + 3.49140543e-01 7.38916828e-02 1.89606569e-01 5.01057133e-01 + 1.51101469e-01 2.66337989e-01 1.76604724e+00 3.85382385e-01 + -1.92398194e+00 -6.65178155e-01 -2.41555043e+00 1.63766156e-01 + -2.31952937e-01 -1.37873458e-01 -3.86235131e-02 1.88487314e+00 + -4.75460995e-01 -1.63762752e+00 -3.51540891e-01 3.48965393e-01 + -2.85733418e-01 -1.09756723e-01 -2.27348768e-01 -6.50708679e-01 + -3.71495595e-01 -8.24659275e-01 -9.70155511e-02 -5.92524802e-01 + -7.01454079e-01 -7.61876779e-01 -1.52936023e-01 -3.05077963e-01 + -3.85494128e-01 -2.82554055e-02 -1.80762491e-01 3.03846017e-01 + 1.39227907e+00 1.09831231e+00 6.43297348e+00 1.19357017e+00 + 1.66187736e-01] + -- cgit v1.2.3-70-g09d2