From c1e4e0e41442c087284f34d7585cf9f551e52c2f Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Thu, 8 Oct 2015 22:39:22 -0400 Subject: [hw2] --- hw2/harvardml.cls | 97 ++++++++++++++++ hw2/main.tex | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw2/p1.py | 37 +++++++ hw2/p2.py | 102 +++++++++++++++++ hw2/p3.py | 125 +++++++++++++++++++++ hw2/w.txt.pdf | 16 +++ 6 files changed, 703 insertions(+) create mode 100644 hw2/harvardml.cls create mode 100644 hw2/main.tex create mode 100644 hw2/p1.py create mode 100644 hw2/p2.py create mode 100644 hw2/p3.py create mode 100644 hw2/w.txt.pdf diff --git a/hw2/harvardml.cls b/hw2/harvardml.cls new file mode 100644 index 0000000..e285173 --- /dev/null +++ b/hw2/harvardml.cls @@ -0,0 +1,97 @@ +% Ryan Adams +% School of Engineering and Applied Sciences +% Harvard University +% v0.01, 31 August 2013 +% Based on HMC Math Dept. template by Eric J. Malm. +\NeedsTeXFormat{LaTeX2e}[1995/01/01] +\ProvidesClass{harvardml} +[2013/08/31 v0.01 Harvard ML Assignment Class] + +\RequirePackage{ifpdf} + +\newif\ifhmlset@submit +\DeclareOption{submit}{% + \hmlset@submittrue% +} +\DeclareOption{nosubmit}{% + \hmlset@submitfalse% +} + +\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} +\ExecuteOptions{nosubmit} +\ProcessOptions\relax + +\LoadClass[10pt,letterpaper]{article} + +\newif\ifhmlset@header + +\hmlset@headertrue + +\RequirePackage{mathpazo} +\RequirePackage{palatino} +\RequirePackage{amsmath} +\RequirePackage{amssymb} +\RequirePackage{amsthm} +\RequirePackage{fullpage} +\RequirePackage{mdframed} + +\newtheoremstyle{hmlplain} + {3pt}% Space above + {3pt}% Space below + {}% Body font + {}% Indent amount + {\bfseries}% Theorem head font + {\\*[3pt]}% Punctuation after theorem head + {.5em}% Space after theorem head + {}% Theorem head spec (can be left empty, meaning `normal') + +\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip} + +\newcommand{\headerblock}{% + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushleft} + \ifhmlset@submit + \mbox{\hmlset@name}\\ + \mbox{\tt \hmlset@email}\\ + \mbox{\hmlset@course} + \fi + \end{flushleft} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{center} + \mbox{\Large\hmlset@assignment}\protect\\ + Due: \hmlset@duedate + \end{center} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushright} + \ifhmlset@submit + Collaborators: \hmlset@collaborators + \fi + \end{flushright} + \end{minipage} + \vspace{0.1cm} + \titlebar +} + +\ifhmlset@header\AtBeginDocument{\headerblock}\fi + +\def\hmlset@name{} +\def\hmlset@email{} +\def\hmlset@course{} +\def\hmlset@assignment{} +\def\hmlset@duedate{} +\def\hmlset@collaborators{} +\def\hmlset@extraline{} + +% commands to set header block info +\newcommand{\name}[1]{\def\hmlset@name{#1}} +\newcommand{\email}[1]{\def\hmlset@email{#1}} +\newcommand{\course}[1]{\def\hmlset@course{#1}} +\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}} +\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}} +\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}} +\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}} + +\theoremstyle{hmlplain} +\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem} diff --git a/hw2/main.tex b/hw2/main.tex new file mode 100644 index 0000000..f4a1067 --- /dev/null +++ b/hw2/main.tex @@ -0,0 +1,326 @@ +% No 'submit' option for the problems by themselves. +\documentclass[submit]{harvardml} +% Use the 'submit' option when you submit your solutions. 
+%\documentclass[submit]{harvardml} + +% Put in your full name and email address. +\name{Thibaut Horel} +\email{thorel@seas.harvard.edu} +\usepackage[utf8x]{inputenc} + +% List any people you worked with. +\collaborators{% +} + +% You don't need to change these. +\course{CS281-F13} +\assignment{Assignment \#2} +\duedate{23:59pm October 7, 2015} + +\usepackage{graphicx} +% Useful macros. +\DeclareMathOperator*{\argmax}{argmax} +\DeclareMathOperator*{\argmin}{argmin} +\newcommand{\given}{\,|\,} +\newcommand{\prob}{\mathbb{P}} +\newcommand{\trans}{\mathsf{T}} +\newcommand{\bx}{\mathbf{x}} +\newcommand{\by}{\mathbf{y}} +\newcommand{\bw}{\mathbf{w}} +\newcommand{\distNorm}{\mathcal{N}} +\newcommand{\bzero}{\mathbf{0}} +\newcommand{\ident}{\mathbb{I}} + +\begin{document} + + +\begin{problem}[10pts] +One intuitive way to summarize a probability density is via the mode, +as this is the ``most likely'' value in some sense. A common example +of this is using the maximum \textit{a posteriori} (MAP) estimate of a +model's parameters. In high dimensions, however, the mode becomes +less and less representative of typical samples. Consider variates +from a~$D$-dimensional zero mean spherical Gaussian with unit +variance: +\begin{align*} + \bx &\sim \distNorm(\bzero_D, \ident_D), +\end{align*} +where~$\bzero_D$ indicates a column vector of~$D$ zeros and~$\ident_D$ +is a~${D\times D}$ identity matrix. +\begin{enumerate} + \item Compute the distribution that this implies over the distance + of these points from the origin. That is, compute the + distribution over~$\sqrt{\bx^\trans\bx}$, if~$\bx$ is a + realization from~$\distNorm(\bzero_D, \ident_D)$. (Hint: Consider + transformations of a Gamma distribution.) + \item Make a plot that shows this probability density function for + several different values of~$D$, up to ~${D=100}$. + + \item Make a plot of the cumulative distribution function (CDF) over + this distance distribution for~${D=100}$. A closed-form solution + may be difficult to compute, so you can do this numerically.) + + \item From examining the CDF we can think about where most of the + mass lives as a function of radius. For example, most of the mass + for~${D=100}$ is within a thin spherical shell. From eyeballing + the plot, what are the inner and outer radii for the shell that + contains 90\% of the mass in this case? +\end{enumerate} +\end{problem} + +\paragraph{Solution.} The code for this problem can be found in file +\textsf{p1.py}. + +1. By definition of $\bx$, its coordinates are +independent univariate Gaussian variables of mean 0 and variance one. Hence, +$\bx^T\bx = \sum_{i=1}^D\bx_i^2$ is exactly a chi-squared distribution with $D$ +degrees of freedom. So for all $a, b$ we have: +\begin{align*} + p(a\leq \sqrt{\bx^T\bx}\leq b) + &= p(a^2\leq \bx^T\bx \leq b) + =\frac{1}{2^{D/2}\Gamma(D/2)}\int_{a^2}^{b^2}x^{D/2-1}e^{-x/2}dx\\ + &= \frac{1}{2^{D/2-1}\Gamma(D/2)} u^{D-1}e^{-u^2/2} +\end{align*} +where we used the formula of the chi-squared distribution for the second +equality and the change of variable $u=\sqrt{x}$ for the last equality. Since +this is true for all $a,b$, the distribution of the distance is given by: +\begin{displaymath} + p(u) = \frac{1}{2^{D/2-1}\Gamma(D/2)} u^{D-1}e^{-u^2/2} +\end{displaymath} + +2. The pdf for different values of $D$ can be found below: +\begin{center} +\includegraphics[width=0.8\textwidth]{dist1.pdf} +\end{center} + +3. The cdf plot is below for $D=100$ +\begin{center} +\includegraphics[width=0.8\textwidth]{cumul.pdf} +\end{center} + +4. 
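+As a quick numerical cross-check of the estimate read off the CDF plot (a
+sketch that assumes SciPy's built-in \textsf{chi} distribution rather than
+the hand-rolled density in \textsf{p1.py}), the 5\% and 95\% quantiles of
+the distance distribution can be queried directly:
+\begin{verbatim}
+from scipy.stats import chi
+
+# ||x|| for x ~ N(0, I_D) follows a chi distribution with D degrees of
+# freedom, so a central 90% interval is bounded by these two quantiles.
+inner, outer = chi.ppf([0.05, 0.95], df=100)
+print inner, outer  # roughly 8.8 and 11.2
+\end{verbatim}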
Eyeballing the plot, 90\% is in the shell whose inner and outer radii are + $9$ and $11$ respectively. + +\begin{problem}[Bayesian Linear Regression, 30pts] +Here are some one-dimensional data to regress: +\begin{verbatim} +x = [-1.87 -1.76 -1.67 -1.22 -0.07 0.11 0.67 1.60 2.22 2.51]' +y = [0.06 1.67 0.54 -1.45 -0.18 -0.67 0.92 2.95 5.13 5.18]' +\end{verbatim} +Construct a Bayesian linear regression model using Gaussian observation noise with fixed variance $\sigma^2$, and a basis of your choosing (e.g., polynomial, sinusoids, radial basis functions). +For the prior on the regression weights, use a normal distribution: +$$p(\bw) = \distNorm(\bw | \bw_0, \mathbf V_0)$$ + + +\begin{enumerate} + \item Write down: + \begin{enumerate} + \item your basis + \item the posterior distribution on $\bw$ given the data: $p(\bw | \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$. + \item the posterior predictive distribution $p(y^\star | x^\star, \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$ of a new function value $y^\star = f(x^\star)$ obtained by integrating out $\bw$. + \item The marginal likelihood, $p(\by | \bx, \bw_0,\boldsymbol V_0, \sigma^2)$ obtained by integrating out the regression weights. + \end{enumerate} + \item Choose values of $\bw_0, \mathbf V_0$ and $\sigma^2$ that seem sensible for the regression weight prior and the noise. + Plot several random functions drawn from your prior. + + \item Plot the data, as well as several typical posterior samples of the function given the data. + + \item Plot the 95\% central credible interval region of the + predictive density as a function of~$x$. That is, produce a plot + that shows the ``tube'' containing most of the functions that are + consistent with the data under your model. + + \item Try varying the basis functions used. For example, you could change the + order of polynomial, or how many radial basis functions to put + down. Produce a bar plot of the marginal likelihood for several of these hypotheses. + Which hypotheses are supported by the data? + \end{enumerate} +\end{problem} + +\paragraph{Solution.} 1. (a) I chose the Gaussian radial basis functions as my +basis: +\begin{displaymath} + \phi_j(x) = \exp\left(\frac{-(x-\mu_j)^2}{2\epsilon^2}\right) +\end{displaymath} +For the plots I chose 10 basis functions whose $\mu_j$ were evenly spread over +the interval $[-2, 3]$ and $\epsilon = 1$. + +(b) Let us denote by $\Phi(\bx)$ the matrix of the observations after applying +the basis functions to them. That is, the $i$th row of $\Phi(\bx)$ is $\phi(x) += [\phi_1(x_i),\ldots,\phi_N(x_i)]$ where $N$ is the number of basis functions. +Then the posterior is given by: +\begin{displaymath} + p(\bw \given \by) \propto \distNorm(\Phi(\bx)\bw, \sigma^2Id)\distNorm(\bw_0, + \mathbf V_0) +\end{displaymath} +Using standard results for Gaussian variables (e.g. 
Theorem 4.4.1 from Murphy)
+we get that $p(\bw \given \bx, \by) \sim \distNorm(\mu, \Sigma)$ with:
+\begin{displaymath}
+  \Sigma^{-1} = V_0^{-1} + \frac{1}{\sigma^2}\Phi(\bx)^T\Phi(\bx)
+  \quad\text{and}\quad \mu = \Sigma V_0^{-1}\bw_0 +
+\frac{1}{\sigma^2}\Sigma\Phi(\bx)^Ty
+\end{displaymath}
+
+(c) Again using Theorem 4.4.1 we compute the posterior predictive:
+\begin{displaymath}
+  p(y^*\given x^*) \sim \distNorm\left(\phi(x^*)^T\mu, \sigma^2 + \phi(x^*)^T
+  \Sigma \phi(x^*)\right)
+\end{displaymath}
+
+(d) And the marginal likelihood:
+\begin{displaymath}
+  p(\by\given \bx)
+  = \distNorm(\Phi(\bx)\bw_0, \sigma^2Id + \Phi(\bx)V_0\Phi(\bx)^T)
+\end{displaymath}
+I am using a slight abuse of notation here: in (c) I meant that the posterior
+predictive is a normal distribution whose parameters were indicated as the
+arguments of the $\distNorm$ function. Here I mean that the marginal likelihood
+is computed by evaluating the density of the normal distribution whose
+parameters are indicated at the point $\by$.
+
+2. I chose $w_0$ to be the zero vector, $V_0$ to be $Id$ and $\sigma^2$ to be
+   1. Several functions drawn from the prior are shown below:
+\begin{center}
+\includegraphics[width=0.7\textwidth]{prior.pdf}
+\end{center}
+
+3. The data and several functions drawn from the posterior are shown below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{posterior.pdf}
+\end{center}
+
+4. The data, the mean of the posterior predictive, and the 95\% interval of the
+posterior predictive (blue-grey tube) can be found below. For any point $x$,
+the posterior predictive distribution at this point is a univariate Gaussian
+whose mean and variance were computed in part 1. The 95\% interval is computed
+as the interval of width $2\times 1.96$ standard deviations centered at the
+mean.
+\begin{center}
+\includegraphics[width=0.8\textwidth]{predictive.pdf}
+\end{center}
+
+5. The plot below shows the marginal likelihood as a function of the number of
+   basis functions in my RBF basis.
+\begin{center}
+\includegraphics[width=0.8\textwidth]{ml.pdf}
+\end{center}
+It seems that a basis of about 25 functions is a reasonable hypothesis.
+
+
+
+\begin{problem}[30pts, Hastie et al., Murphy]
+In this problem, we'll apply logistic regression to a data set of spam
+email. These data consist of 4601 email messages, from which 57
+features have been extracted. These are as follows:
+\begin{itemize}
+  \item $48$ features in $[0,100]$, giving the percentage of words in
+    a given message which match a given word on a list containing,
+    e.g., ``business'', ``free'', etc.
+  \item $6$ features in $[0,100]$, giving the percentage of characters
+    in the email that match characters on a list containing, e.g.,
+    ``\$'', ``\#'', etc.
+  \item Feature 55: The average length of an uninterrupted sequence of
+    capital letters.
+  \item Feature 56: The length of the longest uninterrupted sequence
+    of capital letters.
+  \item Feature 57: The sum of the lengths of uninterrupted sequences
+    of capital letters.
+\end{itemize}
+There are files \texttt{spam.train.dat} and \texttt{spam.test.dat}
+(available on the course website) in which each row is an email.
+There are 3000 training and 1601 test examples. The final column in
+each file indicates whether the email was spam.
+
+\begin{enumerate}
+  \item Code up~$\ell_2$-regularized logistic regression.
+ That is, code up a model of the form: + \begin{align} + p(\mathbf{w}) & = \mathcal{N}(0, \sigma^2 I) \\ + p(y_i = 1 | \mathbf{x}, \mathbf{w}) & = \frac{1}{1 + \exp(-\mathbf{w}^T \mathbf{x})} + \end{align} + + and use it to compute the MAP estimate of $\mathbf{w}$, given the training examples. + Compute the gradients of the posterior probability of the data with respect to $\mathbf{w}$, and check that they're correct numerically. + Show the code you used for computing the derivatives. + + \item Use + 10-fold cross-validation on the training set to determine an appropriate regularization + penalty $\sigma^2$. Plot different values of $\log_e \sigma^2$ ranging from -8 to 8 against the cross-validation error and test error. Which is the best? + \item There are different ways one might preprocess the data. One + typical thing to do is to linearly ``standardize'' each input feature so + that it has mean zero and variance one. Do this standardization + and evaluate the model again. Report the discovered $\bw$, and the training and test error after performing cross-validation as in part 2. + \item It may also be the case that some dimensions are better represented a binary variables, or by taking the logarithm, than the given values. + Hypothesize whether any of the variables in this data sets might be better coded as binary (or using their log), and explain your reasoning. + Then do the transform and apply the same cross-validation as before. Do your results improve? Conjecture why or why not. +\end{enumerate} + +\end{problem} + +\paragraph{Solution.} The code for this problem can be found in \textsf{p3.py} + +1. The MAP estimate of $w$ is given by: +\begin{displaymath} + w^* = \argmax_{w} \sum_{i=1}^n \log p(x_i, y_i \given w) + \log p(w) +\end{displaymath} +For $\ell$-2 regularized logistic regression: +\begin{align*} + \log p(x_i, y_i\given w) &= - y_i\log\left(1+\exp(-w^Tx_i)\right) + + (1- y_i)\log\left(\frac{\exp(-w^Tx_i)}{1+\exp(-w^Tx_i)}\right)\\ + &= -(1-y_i) w^Tx_i -\log\left(1+\exp(-w^T x_i)\right) +\end{align*} +and: +\begin{displaymath} + \log p(w) = -\frac{1}{2\sigma^2}\|w\|^2 + C +\end{displaymath} +where $C$ is the normalization constant which does not depend on $w$. So the +MAP optimization problem can be written like this: +\begin{displaymath} + w^* = \argmin_w \sum_{i=1}^n \log\left(1+\exp(-w^Tx_i)\right) + + \sum_{i=1}^n(1-y_i)w^Tx_i + \frac{1}{2\sigma^2}\|w\|^2 +\end{displaymath} +Denoting by $L(w)$ the objective function of the above optimization problem. +The gradient is given by: +\begin{displaymath} + \nabla L(w) = \sum_{i=1}^n \frac{-x_i}{1+\exp(w^Tx_i)} + + \sum_{i=1}^n (1-y_i)x_i + \frac{1}{\sigma^2}w +\end{displaymath} +The code to compute $L$ and its gradient and to test the gradient can be found +in \textsf{p3.py}. + +2. The plot of the cross-validation error and test error as a function of $\log + \sigma^2$ can be found below: +\begin{center} +\includegraphics[width=0.8\textwidth]{scores_txt.pdf} +\end{center} +I used fraction of misclassified samples are a measure of error. Strangely, +the plot doesn't exhibit the standard ``sweetspot behavior''. The test error +should be increasing for large values of $\sigma^2$ because at this point we +are simply over-fitting the data, but I haven't been able to understand why +I am not observing it. In practice, I would use $\log\sigma^2 = -2$. + +3. 
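+Each feature is first standardized to have zero mean and unit variance.
+A minimal sketch of this preprocessing step is given below; it assumes
+scikit-learn's \textsf{StandardScaler} (the commented-out lines in
+\textsf{p3.py} instead call \textsf{preprocessing.scale} on each set
+separately) and fits the scaling statistics on the training set before
+applying the same transform to both sets:
+\begin{verbatim}
+from sklearn import preprocessing
+
+# Learn per-feature mean and standard deviation on the training data,
+# then apply that same affine transform to the training and test sets.
+scaler = preprocessing.StandardScaler().fit(x_train)
+x_train = scaler.transform(x_train)
+x_test = scaler.transform(x_test)
+\end{verbatim}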
The cross-validation plot for the normalized data can be found below: +\begin{center} +\includegraphics[width=0.8\textwidth]{scores2_txt.pdf} +\end{center} +The error is much smaller in this case. For $\log\sigma^2 = 2$, I obtained +a cross-validation error of 0.073 and a test error of 0.086. The value of $w$ +can be found in file \textsf{w.txt.pdf}. + +4. I believe feature 55, 56 and 57 would be better represented by taking their + logarithms. My reasoning is the following: if for example an email is + written entirely in capital letters the variable will simply be equal to the + length of the email. But two emails of different lengths written entirely in + capital should be considered equally bad. In other words, as long as there + are many capital letters, whether it is 100 or 1000, it seems to me that the + emails are almost equally likely to be spam. Taking the logarithm will favor + the order of magnitude rather than the actual count. + + The plot of the CV and test error after this normalization can be found + here: +\begin{center} +\includegraphics[width=0.8\textwidth]{scores3_txt.pdf} +\end{center} +Despite the normalization the error stays the same. + +\end{document} diff --git a/hw2/p1.py b/hw2/p1.py new file mode 100644 index 0000000..682abc1 --- /dev/null +++ b/hw2/p1.py @@ -0,0 +1,37 @@ +from scipy.special import gamma +import numpy as np +from math import exp +import matplotlib.pyplot as plt +from scipy.stats import rv_continuous + + +def pdf(u, D): + return 1 / (2 ** ((D / 2.) - 1.) + * gamma(D / 2.)) * u ** (D - 1.) * exp(-u ** 2. / 2.) + + +class my(rv_continuous): + + def _pdf(self, u, D): + return pdf(u, D) + + +def dist(): + a = np.linspace(0, 20, 1000) + plt.figure(figsize=(9, 6)) + for D in [1, 2, 5, 10, 20, 50, 100]: + b = [pdf(u, D) for u in a] + plt.plot(a, b, label="D=" + str(D)) + plt.legend() + plt.savefig("dist1.pdf") + + +def cdf(): + a = my(a=0, b=float("inf")) + px = np.linspace(5, 15, 100) + plt.figure(figsize=(9, 6)) + plt.plot(px, a.cdf(px, 100)) + plt.savefig("cumul.pdf") + +if __name__ == "__main__": + cdf() diff --git a/hw2/p2.py b/hw2/p2.py new file mode 100644 index 0000000..1659112 --- /dev/null +++ b/hw2/p2.py @@ -0,0 +1,102 @@ +import numpy as np +import seaborn +import matplotlib.pyplot as plt +from math import sqrt +from scipy.stats import multivariate_normal + +seaborn.set_style("white") + +x = np.array([-1.87, -1.76, -1.67, -1.22, -0.07, 0.11, 0.67, 1.60, 2.22, 2.51]) +y = np.array([0.06, 1.67, 0.54, -1.45, -0.18, -0.67, 0.92, 2.95, 5.13, 5.18]) + +D = 20 +eps = 1 +base = np.linspace(-2, 3, num=D) + + +def map_features(x, base=base): + x = x.reshape(len(x), 1) + phi = np.exp(-(x - base) ** 2 / (2 * eps ** 2)) + phi = np.concatenate((phi, np.ones((phi.shape[0], 1))), 1) + return phi + + +def draw_prior(x): + phi = map_features(x) + D = phi.shape[1] + w = np.random.normal(0, 1, D) + return np.dot(phi, w) + + +def posterior(): + phi = map_features(x) + D = phi.shape[1] + invsigma = np.identity(D) + np.dot(phi.T, phi) + sigma = np.linalg.inv(invsigma) + mu = np.dot(sigma, np.dot(phi.T, y)) + return mu, sigma + + +def plot_posteriors(): + plt.figure(figsize=(9, 6)) + mu, sigma = posterior() + px = np.linspace(-2, 3, num=200) + phi = map_features(px) + plt.plot(x, y, "o", label="data") + for i in xrange(5): + w = np.random.multivariate_normal(mu, sigma) + plt.plot(px, np.dot(phi, w), label="post. 
" + str(i)) + plt.legend(loc="lower right") + plt.savefig("posterior.pdf") + + +def plot_tube(): + fig = plt.figure(figsize=(9, 6)) + mu, sigma = posterior() + px = np.linspace(-2, 3, num=200) + phi = map_features(px) + plt.plot(px, np.dot(phi, mu), label="post. pred. mean") + ax = fig.get_axes()[0] + inf = [np.dot(row, mu) - 1.96 * sqrt(1 + np.dot(row, np.dot(sigma, row))) + for row in phi] + sup = [np.dot(row, mu) + 1.96 * sqrt(1 + np.dot(row, np.dot(sigma, row))) + for row in phi] + ax.fill_between(px, inf, sup, alpha=0.2) + plt.plot(x, y, "o", label="data") + plt.legend(loc="lower right") + plt.savefig("predictive.pdf") + + +def plot_priors(): + plt.figure(figsize=(9, 6)) + px = np.linspace(-2, 3, num=200) + for i in xrange(5): + plt.plot(px, draw_prior(px), label="prior " + str(i)) + plt.legend(loc="lower right") + plt.savefig("prior.pdf") + + +def marginal_likelihood(): + eps = 1 + ml = [] + px = np.linspace(1, 50, num=10) + for D in px: + base = np.linspace(-2, 3, num=D) + phi = map_features(x, base) + z = np.zeros(phi.shape[0]) + rv = multivariate_normal(mean=z, cov=np.identity(phi.shape[0]) + + np.dot(phi, phi.T)) + ml.append(rv.pdf(y)) + plt.figure(figsize=(9, 6)) + plt.bar(px, ml) + plt.savefig("ml.pdf") + + + +if __name__ == "__main__": + plot_priors() + plot_posteriors() + plot_tube() + marginal_likelihood() +#plt.plot(x, y, "o") +#plt.show() diff --git a/hw2/p3.py b/hw2/p3.py new file mode 100644 index 0000000..f4a3ffa --- /dev/null +++ b/hw2/p3.py @@ -0,0 +1,125 @@ +import numpy as np +from scipy.special import expit +from scipy.optimize import minimize +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score +from sklearn.cross_validation import KFold +from sklearn import preprocessing +import seaborn +import matplotlib.pyplot as plt +from math import exp +import sys + +seaborn.set_style("white") + + +class LogReg(BaseEstimator, ClassifierMixin): + + def __init__(self, sigmas=1.): + self.sigmas = sigmas + + def fit(self, x, y): + w = np.zeros(x.shape[1]) + + def aux(w): + return l_gradient(w, x, y, self.sigmas) + + self.w = minimize(aux, w, jac=True, method='L-BFGS-B', + options={'disp': False, 'gtol': 1e-10}).x + return self + + def predict(self, x): + pred = np.dot(x, self.w) + return (pred > 0).astype(int) + + +def l(w, x, y, sigmas): + x2 = np.dot(x, w) + l = (-np.log(expit(x2)).sum() + np.inner(1-y, x2) + + 1 / (2 * sigmas) * np.inner(w, w)) + return l + + +def l_gradient(w, x, y, sigmas): + x2 = np.dot(x, w) + l = (-np.log(expit(x2)).sum() + np.inner(x2, (1 - y)) + + 1 / (2 * sigmas) * np.inner(w, w)) + grad = (np.sum(-x * expit(-x2)[:, None], axis=0) + + np.sum(x * (1 - y)[:, None], axis=0) + 1 / sigmas * w) + return l, grad + + +def test_gradient(w, x, y, sigmas): + eps = 1e-10 + _, g = l_gradient(w, x, y, sigmas) + for i in xrange(len(g)): + z = np.zeros(len(g)) + z[i] = eps + l1 = l(w + z, x, y, sigmas) + z = np.zeros(len(g)) + z[i] = -eps + l2 = l(w + z, x, y, sigmas) + print (l1 - l2) / (2 * eps), g[i] + + +def cross_validation(x, y): + for sigmas in [10 ** i for i in xrange(-8, 9)]: + train_scores = [] + test_scores = [] + for train, test in KFold(n=x.shape[0], n_folds=10, shuffle=True): + mod = LogReg(sigmas=sigmas) + x_train_f = x[train] + y_train_f = y[train] + x_test_f = x[test] + y_test_f = y[test] + mod = mod.fit(x_train_f, y_train_f) + train_scores.append(accuracy_score(y_test_f, + mod.predict(x_test_f))) + test_scores.append(accuracy_score(y_test, mod.predict(x_test))) + print np.mean(train_scores), 
np.mean(test_scores) + + +def plot_scores(): + plt.figure(figsize=(9, 6)) + values = [map(float, line.strip().split()) + for line in open(sys.argv[1])] + y1, y2 = zip(*values) + x = xrange(-8, 9) + y1 = 1 - np.array(y1) + y2 = 1 - np.array(y2) + plt.plot(x, y1, label='CV error') + plt.plot(x, y2, label='Test error') + plt.legend() + plt.xlabel("log(sigma^2)") + plt.ylabel("Error") + plt.savefig(sys.argv[1] + ".pdf") + + +def three(x, y): + sigmas = exp(2) + mod = LogReg(sigmas=sigmas) + mod.fit(x, y) + print 1 - accuracy_score(y, mod.predict(x)) + print 1 - accuracy_score(y_test, mod.predict(x_test)) + print mod.w + + +def norm(x): + for i in [54, 55, 56]: + x[:, i] = np.log(1 + x[:, i]) + return x + +if __name__ == "__main__": + data_train = np.loadtxt("spam.train.dat") + x_train = data_train[:, :-1] + y_train = data_train[:, -1].astype(int) + data_test = np.loadtxt("spam.test.dat") + x_test = data_test[:, :-1] + y_test = data_test[:, -1].astype(int) + #x_train = preprocessing.scale(x_train) + #x_test = preprocessing.scale(x_test) + #x_train = norm(x_train) + #x_test = norm(x_test) + cross_validation(x_train, y_train) + #plot_scores() + #three(x_train, y_train) diff --git a/hw2/w.txt.pdf b/hw2/w.txt.pdf new file mode 100644 index 0000000..10e02fa --- /dev/null +++ b/hw2/w.txt.pdf @@ -0,0 +1,16 @@ +[ -1.16313019e-01 -1.86665945e-01 6.97073481e-02 1.08220421e+01 + 3.79967800e-01 2.55576319e-01 8.97469186e-01 2.24521456e-01 + 1.96371840e-01 9.65848716e-03 -1.84768134e-03 -2.10674040e-02 + 7.25815774e-03 -1.73726085e-03 3.78833303e-01 7.27626576e-01 + 3.49140543e-01 7.38916828e-02 1.89606569e-01 5.01057133e-01 + 1.51101469e-01 2.66337989e-01 1.76604724e+00 3.85382385e-01 + -1.92398194e+00 -6.65178155e-01 -2.41555043e+00 1.63766156e-01 + -2.31952937e-01 -1.37873458e-01 -3.86235131e-02 1.88487314e+00 + -4.75460995e-01 -1.63762752e+00 -3.51540891e-01 3.48965393e-01 + -2.85733418e-01 -1.09756723e-01 -2.27348768e-01 -6.50708679e-01 + -3.71495595e-01 -8.24659275e-01 -9.70155511e-02 -5.92524802e-01 + -7.01454079e-01 -7.61876779e-01 -1.52936023e-01 -3.05077963e-01 + -3.85494128e-01 -2.82554055e-02 -1.80762491e-01 3.03846017e-01 + 1.39227907e+00 1.09831231e+00 6.43297348e+00 1.19357017e+00 + 1.66187736e-01] + -- cgit v1.2.3-70-g09d2