path: root/hw2
authorThibaut Horel <thibaut.horel@gmail.com>2015-10-08 22:39:22 -0400
committerThibaut Horel <thibaut.horel@gmail.com>2015-10-08 22:39:22 -0400
commitc1e4e0e41442c087284f34d7585cf9f551e52c2f (patch)
tree5154724e4f018128edaffc3b58392bf22188d613 /hw2
parent0f9fc1bdb1755f5677e3201bcd35706e09235844 (diff)
downloadcs281-c1e4e0e41442c087284f34d7585cf9f551e52c2f.tar.gz
[hw2]
Diffstat (limited to 'hw2')
-rw-r--r--  hw2/harvardml.cls   97
-rw-r--r--  hw2/main.tex       326
-rw-r--r--  hw2/p1.py           37
-rw-r--r--  hw2/p2.py          102
-rw-r--r--  hw2/p3.py          125
-rw-r--r--  hw2/w.txt.pdf       16
6 files changed, 703 insertions, 0 deletions
diff --git a/hw2/harvardml.cls b/hw2/harvardml.cls
new file mode 100644
index 0000000..e285173
--- /dev/null
+++ b/hw2/harvardml.cls
@@ -0,0 +1,97 @@
+% Ryan Adams
+% School of Engineering and Applied Sciences
+% Harvard University
+% v0.01, 31 August 2013
+% Based on HMC Math Dept. template by Eric J. Malm.
+\NeedsTeXFormat{LaTeX2e}[1995/01/01]
+\ProvidesClass{harvardml}
+[2013/08/31 v0.01 Harvard ML Assignment Class]
+
+\RequirePackage{ifpdf}
+
+\newif\ifhmlset@submit
+\DeclareOption{submit}{%
+ \hmlset@submittrue%
+}
+\DeclareOption{nosubmit}{%
+ \hmlset@submitfalse%
+}
+
+\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}}
+\ExecuteOptions{nosubmit}
+\ProcessOptions\relax
+
+\LoadClass[10pt,letterpaper]{article}
+
+\newif\ifhmlset@header
+
+\hmlset@headertrue
+
+\RequirePackage{mathpazo}
+\RequirePackage{palatino}
+\RequirePackage{amsmath}
+\RequirePackage{amssymb}
+\RequirePackage{amsthm}
+\RequirePackage{fullpage}
+\RequirePackage{mdframed}
+
+\newtheoremstyle{hmlplain}
+ {3pt}% Space above
+ {3pt}% Space below
+ {}% Body font
+ {}% Indent amount
+ {\bfseries}% Theorem head font
+ {\\*[3pt]}% Punctuation after theorem head
+ {.5em}% Space after theorem head
+ {}% Theorem head spec (can be left empty, meaning `normal')
+
+\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip}
+
+\newcommand{\headerblock}{%
+ \noindent\begin{minipage}{0.33\textwidth}
+ \begin{flushleft}
+ \ifhmlset@submit
+ \mbox{\hmlset@name}\\
+ \mbox{\tt \hmlset@email}\\
+ \mbox{\hmlset@course}
+ \fi
+ \end{flushleft}
+ \end{minipage}
+ \noindent\begin{minipage}{0.33\textwidth}
+ \begin{center}
+ \mbox{\Large\hmlset@assignment}\protect\\
+ Due: \hmlset@duedate
+ \end{center}
+ \end{minipage}
+ \noindent\begin{minipage}{0.33\textwidth}
+ \begin{flushright}
+ \ifhmlset@submit
+ Collaborators: \hmlset@collaborators
+ \fi
+ \end{flushright}
+ \end{minipage}
+ \vspace{0.1cm}
+ \titlebar
+}
+
+\ifhmlset@header\AtBeginDocument{\headerblock}\fi
+
+\def\hmlset@name{}
+\def\hmlset@email{}
+\def\hmlset@course{}
+\def\hmlset@assignment{}
+\def\hmlset@duedate{}
+\def\hmlset@collaborators{}
+\def\hmlset@extraline{}
+
+% commands to set header block info
+\newcommand{\name}[1]{\def\hmlset@name{#1}}
+\newcommand{\email}[1]{\def\hmlset@email{#1}}
+\newcommand{\course}[1]{\def\hmlset@course{#1}}
+\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}}
+\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}}
+\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}}
+\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}}
+
+\theoremstyle{hmlplain}
+\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem}
diff --git a/hw2/main.tex b/hw2/main.tex
new file mode 100644
index 0000000..f4a1067
--- /dev/null
+++ b/hw2/main.tex
@@ -0,0 +1,326 @@
+% Use the 'submit' option when you submit your solutions.
+\documentclass[submit]{harvardml}
+% Use no 'submit' option for the problems by themselves.
+%\documentclass{harvardml}
+
+% Put in your full name and email address.
+\name{Thibaut Horel}
+\email{thorel@seas.harvard.edu}
+\usepackage[utf8x]{inputenc}
+
+% List any people you worked with.
+\collaborators{%
+}
+
+% You don't need to change these.
+\course{CS281-F13}
+\assignment{Assignment \#2}
+\duedate{23:59pm October 7, 2015}
+
+\usepackage{graphicx}
+% Useful macros.
+\DeclareMathOperator*{\argmax}{argmax}
+\DeclareMathOperator*{\argmin}{argmin}
+\newcommand{\given}{\,|\,}
+\newcommand{\prob}{\mathbb{P}}
+\newcommand{\trans}{\mathsf{T}}
+\newcommand{\bx}{\mathbf{x}}
+\newcommand{\by}{\mathbf{y}}
+\newcommand{\bw}{\mathbf{w}}
+\newcommand{\distNorm}{\mathcal{N}}
+\newcommand{\bzero}{\mathbf{0}}
+\newcommand{\ident}{\mathbb{I}}
+
+\begin{document}
+
+
+\begin{problem}[10pts]
+One intuitive way to summarize a probability density is via the mode,
+as this is the ``most likely'' value in some sense. A common example
+of this is using the maximum \textit{a posteriori} (MAP) estimate of a
+model's parameters. In high dimensions, however, the mode becomes
+less and less representative of typical samples. Consider variates
+from a~$D$-dimensional zero mean spherical Gaussian with unit
+variance:
+\begin{align*}
+ \bx &\sim \distNorm(\bzero_D, \ident_D),
+\end{align*}
+where~$\bzero_D$ indicates a column vector of~$D$ zeros and~$\ident_D$
+is a~${D\times D}$ identity matrix.
+\begin{enumerate}
+ \item Compute the distribution that this implies over the distance
+ of these points from the origin. That is, compute the
+ distribution over~$\sqrt{\bx^\trans\bx}$, if~$\bx$ is a
+ realization from~$\distNorm(\bzero_D, \ident_D)$. (Hint: Consider
+ transformations of a Gamma distribution.)
+ \item Make a plot that shows this probability density function for
+ several different values of~$D$, up to ~${D=100}$.
+
+ \item Make a plot of the cumulative distribution function (CDF) over
+ this distance distribution for~${D=100}$. A closed-form solution
+    may be difficult to compute, so you can do this numerically.
+
+ \item From examining the CDF we can think about where most of the
+ mass lives as a function of radius. For example, most of the mass
+ for~${D=100}$ is within a thin spherical shell. From eyeballing
+ the plot, what are the inner and outer radii for the shell that
+ contains 90\% of the mass in this case?
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} The code for this problem can be found in file
+\textsf{p1.py}.
+
+1. By definition of $\bx$, its coordinates are
+independent univariate Gaussian variables with mean 0 and variance 1. Hence,
+$\bx^T\bx = \sum_{i=1}^D\bx_i^2$ follows exactly a chi-squared distribution
+with $D$ degrees of freedom. So for all $0\leq a\leq b$ we have:
+\begin{align*}
+    p(a\leq \sqrt{\bx^T\bx}\leq b)
+    &= p(a^2\leq \bx^T\bx \leq b^2)
+    =\frac{1}{2^{D/2}\Gamma(D/2)}\int_{a^2}^{b^2}x^{D/2-1}e^{-x/2}dx\\
+    &= \frac{1}{2^{D/2-1}\Gamma(D/2)} \int_a^b u^{D-1}e^{-u^2/2}\,du
+\end{align*}
+where we used the density of the chi-squared distribution for the second
+equality and the change of variable $u=\sqrt{x}$ for the last one. Since this
+holds for all $a, b$, the density of the distance is given by:
+\begin{displaymath}
+ p(u) = \frac{1}{2^{D/2-1}\Gamma(D/2)} u^{D-1}e^{-u^2/2}
+\end{displaymath}
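+This is exactly the density of the chi distribution with $D$ degrees of
+freedom. A minimal numerical check (a sketch using \textsf{numpy} and
+\textsf{scipy}, separate from the submitted \textsf{p1.py}):
+\begin{verbatim}
+import numpy as np
+from scipy.stats import chi, kstest
+
+D = 10
+# Distances from the origin of standard normal samples in dimension D
+r = np.linalg.norm(np.random.randn(100000, D), axis=1)
+# The derived density is the chi distribution with D degrees of freedom,
+# so the KS test should not reject it.
+print(kstest(r, chi(df=D).cdf))
+\end{verbatim}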
+
+2. The pdf for different values of $D$ can be found below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{dist1.pdf}
+\end{center}
+
+3. The CDF plot for $D=100$ is shown below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{cumul.pdf}
+\end{center}
+
+4. Eyeballing the plot, 90\% of the mass lies in the shell whose inner and
+   outer radii are approximately $9$ and $11$ respectively.
+
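+This eyeballed estimate can be cross-checked against the quantiles of the
+distance distribution (a short sketch using \textsf{scipy}'s chi
+distribution, which matches the density derived in part 1):
+\begin{verbatim}
+from scipy.stats import chi
+
+# Inner and outer radii of the shell containing the central 90% of the
+# mass for D = 100, i.e. the 5th and 95th percentiles of the distance.
+print(chi(df=100).ppf([0.05, 0.95]))
+\end{verbatim}
+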
+\begin{problem}[Bayesian Linear Regression, 30pts]
+Here are some one-dimensional data to regress:
+\begin{verbatim}
+x = [-1.87 -1.76 -1.67 -1.22 -0.07 0.11 0.67 1.60 2.22 2.51]'
+y = [0.06 1.67 0.54 -1.45 -0.18 -0.67 0.92 2.95 5.13 5.18]'
+\end{verbatim}
+Construct a Bayesian linear regression model using Gaussian observation noise with fixed variance $\sigma^2$, and a basis of your choosing (e.g., polynomial, sinusoids, radial basis functions).
+For the prior on the regression weights, use a normal distribution:
+$$p(\bw) = \distNorm(\bw | \bw_0, \mathbf V_0)$$
+
+
+\begin{enumerate}
+ \item Write down:
+ \begin{enumerate}
+ \item your basis
+ \item the posterior distribution on $\bw$ given the data: $p(\bw | \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$.
+ \item the posterior predictive distribution $p(y^\star | x^\star, \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$ of a new function value $y^\star = f(x^\star)$ obtained by integrating out $\bw$.
+ \item The marginal likelihood, $p(\by | \bx, \bw_0,\boldsymbol V_0, \sigma^2)$ obtained by integrating out the regression weights.
+ \end{enumerate}
+ \item Choose values of $\bw_0, \mathbf V_0$ and $\sigma^2$ that seem sensible for the regression weight prior and the noise.
+ Plot several random functions drawn from your prior.
+
+ \item Plot the data, as well as several typical posterior samples of the function given the data.
+
+ \item Plot the 95\% central credible interval region of the
+ predictive density as a function of~$x$. That is, produce a plot
+ that shows the ``tube'' containing most of the functions that are
+ consistent with the data under your model.
+
+ \item Try varying the basis functions used. For example, you could change the
+ order of polynomial, or how many radial basis functions to put
+ down. Produce a bar plot of the marginal likelihood for several of these hypotheses.
+ Which hypotheses are supported by the data?
+ \end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} 1. (a) I chose the Gaussian radial basis functions as my
+basis:
+\begin{displaymath}
+ \phi_j(x) = \exp\left(\frac{-(x-\mu_j)^2}{2\epsilon^2}\right)
+\end{displaymath}
+For the plots I chose 10 basis functions whose $\mu_j$ were evenly spread over
+the interval $[-2, 3]$ and $\epsilon = 1$.
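+A minimal sketch of this feature map (mirroring \textsf{map\_features} in
+\textsf{p2.py}, which additionally appends a constant feature):
+\begin{verbatim}
+import numpy as np
+
+mus = np.linspace(-2, 3, num=10)   # basis function centers
+eps = 1.0                          # bandwidth
+
+def phi(x):
+    # One row per input point, one column per basis function
+    return np.exp(-(np.asarray(x)[:, None] - mus) ** 2 / (2 * eps ** 2))
+\end{verbatim}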
+
+(b) Let us denote by $\Phi(\bx)$ the matrix obtained by applying the basis
+functions to the observations. That is, the $i$th row of $\Phi(\bx)$ is
+$\phi(x_i) = [\phi_1(x_i),\ldots,\phi_N(x_i)]$ where $N$ is the number of
+basis functions.
+Then the posterior is given by:
+\begin{displaymath}
+  p(\bw \given \bx, \by) \propto \distNorm(\by \given \Phi(\bx)\bw, \sigma^2 Id)
+  \,\distNorm(\bw \given \bw_0, \mathbf V_0)
+\end{displaymath}
+Using standard results for Gaussian variables (e.g. Theorem 4.4.1 from Murphy)
+we get that $\bw \given \bx, \by \sim \distNorm(\mu, \Sigma)$ with:
+\begin{displaymath}
+  \Sigma^{-1} = V_0^{-1} + \frac{1}{\sigma^2}\Phi(\bx)^T\Phi(\bx)
+  \quad\text{and}\quad \mu = \Sigma\left(V_0^{-1}\bw_0
+  + \frac{1}{\sigma^2}\Phi(\bx)^T\by\right)
+\end{displaymath}
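+These formulas translate directly into code; a sketch (assuming a feature
+matrix \texttt{Phi}, targets \texttt{y} and prior parameters are given; the
+version in \textsf{p2.py} hard-codes $\bw_0 = 0$, $V_0 = Id$ and
+$\sigma^2 = 1$):
+\begin{verbatim}
+import numpy as np
+
+def posterior(Phi, y, w0, V0, sigma2):
+    # Sigma^{-1} = V0^{-1} + Phi^T Phi / sigma^2
+    V0inv = np.linalg.inv(V0)
+    Sigma = np.linalg.inv(V0inv + np.dot(Phi.T, Phi) / sigma2)
+    # mu = Sigma (V0^{-1} w0 + Phi^T y / sigma^2)
+    mu = np.dot(Sigma, np.dot(V0inv, w0) + np.dot(Phi.T, y) / sigma2)
+    return mu, Sigma
+\end{verbatim}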
+
+(c) Again using Theorem 4.4.1 we compute the posterior predictive:
+\begin{displaymath}
+ p(y^*\given x^*) \sim \distNorm\left(\phi(x^*)^T\mu, \sigma^2 + \phi(x^*)^T
+ \Sigma \phi(x^*)\right)
+\end{displaymath}
+
+(d) And the marginal likelihood:
+\begin{displaymath}
+  p(\by\given \bx)
+  = \distNorm(\Phi(\bx)\bw_0, \sigma^2 Id + \Phi(\bx)V_0\Phi(\bx)^T)
+\end{displaymath}
+I am using a slight abuse of notation here: in (c) I meant that the posterior
+predictive is a normal distribution whose parameters are given as the
+arguments of $\distNorm$. Here I mean that the marginal likelihood is obtained
+by evaluating the density of the normal distribution with the indicated
+parameters at the point $\by$.
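+In code, the marginal likelihood is a single density evaluation; a sketch (a
+general-parameter version of \textsf{marginal\_likelihood} in \textsf{p2.py},
+with the same assumptions on \texttt{Phi}, \texttt{y} and the prior
+parameters as above):
+\begin{verbatim}
+import numpy as np
+from scipy.stats import multivariate_normal
+
+def log_marginal_likelihood(Phi, y, w0, V0, sigma2):
+    mean = np.dot(Phi, w0)
+    cov = sigma2 * np.eye(len(y)) + np.dot(Phi, np.dot(V0, Phi.T))
+    return multivariate_normal(mean=mean, cov=cov).logpdf(y)
+\end{verbatim}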
+
+2. I chose $\bw_0$ to be the zero vector, $V_0$ to be $Id$ and $\sigma^2$ to be
+   1. Several functions drawn from the prior are shown below:
+\begin{center}
+\includegraphics[width=0.7\textwidth]{prior.pdf}
+\end{center}
+
+3. The data and several functions drawn from the posterior are shown below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{posterior.pdf}
+\end{center}
+
+4. The data, the mean of the posterior predictive and the 95\% interval of the
+posterior predictive (blue-grey tube) can be found below. For any point $x$,
+the posterior predictive distribution at this point is a univariate Gaussian
+whose mean and variance were computed in part 1. The 95\% interval is computed
+by taking the interval of half-width $1.96$ standard deviations around the
+mean.
+\begin{center}
+\includegraphics[width=0.8\textwidth]{predictive.pdf}
+\end{center}
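+In code, the band boundaries are obtained pointwise from these predictive
+parameters; a minimal sketch (\texttt{Phi\_star} denotes the feature matrix of
+the test inputs, \texttt{mu} and \texttt{Sigma} the posterior parameters from
+part 1(b)):
+\begin{verbatim}
+import numpy as np
+
+def credible_band(Phi_star, mu, Sigma, sigma2):
+    mean = np.dot(Phi_star, mu)
+    # Predictive variance at each test point: sigma^2 + phi^T Sigma phi
+    var = sigma2 + np.sum(np.dot(Phi_star, Sigma) * Phi_star, axis=1)
+    sd = np.sqrt(var)
+    return mean - 1.96 * sd, mean + 1.96 * sd
+\end{verbatim}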
+
+5. The plot below shows the marginal likelihood as a function of the number of
+ basis functions in my RBF basis.
+\begin{center}
+\includegraphics[width=0.8\textwidth]{ml.pdf}
+\end{center}
+It seems that about 25 basis functions is a reasonable hypothesis.
+
+
+
+\begin{problem}[30pts, Hastie et al., Murphy]
+In this problem, we'll apply logistic regression to a data set of spam
+email. These data consist of 4601 email messages, from which 57
+features have been extracted. These are as follows:
+\begin{itemize}
+ \item $48$ features in $[0,100]$, giving the percentage of words in
+ a given message which match a given word on a list containing,
+ e.g., ``business'', ``free'', etc.
+ \item $6$ features in $[0,100]$, giving the percentage of characters
+ in the email that match characters on a list containing, e.g.,
+ ``\$'', ``\#'', etc.
+ \item Feature 55: The average length of an uninterrupted sequence of
+ capital letters.
+ \item Feature 56: The length of the longest uninterrupted sequence
+ of capital letters.
+ \item Feature 57: The sum of the lengths of uninterrupted sequences
+ of capital letters.
+\end{itemize}
+There are files \texttt{spam.train.dat} and \texttt{spam.test.dat}
+(available on the course website) in which each row is an email.
+There are 3000 training and 1601 test examples. The final column in
+each file indicates whether the email was spam.
+
+\begin{enumerate}
+ \item Code up ~$\ell_2$-regularized logistic regression.
+ That is, code up a model of the form:
+ \begin{align}
+ p(\mathbf{w}) & = \mathcal{N}(0, \sigma^2 I) \\
+ p(y_i = 1 | \mathbf{x}, \mathbf{w}) & = \frac{1}{1 + \exp(-\mathbf{w}^T \mathbf{x})}
+ \end{align}
+
+ and use it to compute the MAP estimate of $\mathbf{w}$, given the training examples.
+ Compute the gradients of the posterior probability of the data with respect to $\mathbf{w}$, and check that they're correct numerically.
+ Show the code you used for computing the derivatives.
+
+ \item Use
+ 10-fold cross-validation on the training set to determine an appropriate regularization
+ penalty $\sigma^2$. Plot different values of $\log_e \sigma^2$ ranging from -8 to 8 against the cross-validation error and test error. Which is the best?
+ \item There are different ways one might preprocess the data. One
+ typical thing to do is to linearly ``standardize'' each input feature so
+ that it has mean zero and variance one. Do this standardization
+ and evaluate the model again. Report the discovered $\bw$, and the training and test error after performing cross-validation as in part 2.
+  \item It may also be the case that some dimensions are better represented as binary variables, or by taking the logarithm, than by the given values.
+  Hypothesize whether any of the variables in this data set might be better coded as binary (or using their log), and explain your reasoning.
+ Then do the transform and apply the same cross-validation as before. Do your results improve? Conjecture why or why not.
+\end{enumerate}
+
+\end{problem}
+
+\paragraph{Solution.} The code for this problem can be found in \textsf{p3.py}.
+
+1. The MAP estimate of $w$ is given by:
+\begin{displaymath}
+  w^* = \argmax_{w} \sum_{i=1}^n \log p(y_i \given x_i, w) + \log p(w)
+\end{displaymath}
+For $\ell_2$-regularized logistic regression:
+\begin{align*}
+  \log p(y_i\given x_i, w) &= - y_i\log\left(1+\exp(-w^Tx_i)\right)
+  + (1- y_i)\log\left(\frac{\exp(-w^Tx_i)}{1+\exp(-w^Tx_i)}\right)\\
+  &= -(1-y_i) w^Tx_i -\log\left(1+\exp(-w^T x_i)\right)
+\end{align*}
+and:
+\begin{displaymath}
+ \log p(w) = -\frac{1}{2\sigma^2}\|w\|^2 + C
+\end{displaymath}
+where $C$ is a normalization constant which does not depend on $w$. The
+MAP optimization problem can thus be written as:
+\begin{displaymath}
+ w^* = \argmin_w \sum_{i=1}^n \log\left(1+\exp(-w^Tx_i)\right)
+ + \sum_{i=1}^n(1-y_i)w^Tx_i + \frac{1}{2\sigma^2}\|w\|^2
+\end{displaymath}
+Denoting by $L(w)$ the objective function of the above optimization problem,
+the gradient is given by:
+\begin{displaymath}
+ \nabla L(w) = \sum_{i=1}^n \frac{-x_i}{1+\exp(w^Tx_i)}
+ + \sum_{i=1}^n (1-y_i)x_i + \frac{1}{\sigma^2}w
+\end{displaymath}
+The code to compute $L$ and its gradient and to test the gradient can be found
+in \textsf{p3.py}.
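+For reference, the same numerical check can also be written with
+\texttt{scipy.optimize.check\_grad}; a sketch (assuming \textsf{p3.py} is
+importable and the training data file is present, as in the repository):
+\begin{verbatim}
+import numpy as np
+from scipy.optimize import check_grad
+from p3 import l, l_gradient       # objective and gradient from p3.py
+
+data = np.loadtxt("spam.train.dat")
+x, y = data[:, :-1], data[:, -1]
+w0 = np.zeros(x.shape[1])
+# Norm of the difference between the analytical gradient and a
+# finite-difference approximation of it; should be small.
+print(check_grad(lambda w: l(w, x, y, 1.0),
+                 lambda w: l_gradient(w, x, y, 1.0)[1], w0))
+\end{verbatim}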
+
+2. The plot of the cross-validation error and test error as a function of $\log
+ \sigma^2$ can be found below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{scores_txt.pdf}
+\end{center}
+I used the fraction of misclassified samples as the measure of error.
+Strangely, the plot doesn't exhibit the standard ``sweet spot'' behavior: the
+test error should increase for large values of $\sigma^2$, since at that point
+we are simply over-fitting the data, but I haven't been able to understand why
+I am not observing this. In practice, I would use $\log\sigma^2 = -2$.
+
+3. The cross-validation plot for the standardized data can be found below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{scores2_txt.pdf}
+\end{center}
+The error is much smaller in this case. For $\log\sigma^2 = 2$, I obtained
+a cross-validation error of 0.073 and a test error of 0.086. The value of $w$
+can be found in file \textsf{w.txt.pdf}.
+
+4. I believe features 55, 56 and 57 would be better represented by their
+   logarithms. My reasoning is the following: if, for example, an email is
+   written entirely in capital letters, the corresponding variable will simply
+   equal the length of the email. But two emails of different lengths written
+   entirely in capitals should be considered equally bad. In other words, as
+   long as there are many capital letters, whether there are 100 or 1000, it
+   seems to me that the emails are almost equally likely to be spam. Taking
+   the logarithm emphasizes the order of magnitude rather than the raw count.
+
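+In code this transform is a one-liner (a sketch mirroring \textsf{norm} in
+\textsf{p3.py}; columns 54--56 are features 55--57 in zero-based indexing):
+\begin{verbatim}
+import numpy as np
+
+def log_transform(x):
+    x = x.copy()
+    # log(1 + count): keeps zeros at zero and compresses large counts
+    x[:, 54:57] = np.log1p(x[:, 54:57])
+    return x
+\end{verbatim}
+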
+   The plot of the CV and test error after this transformation can be found
+   below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{scores3_txt.pdf}
+\end{center}
+Despite this transformation, the error stays the same.
+
+\end{document}
diff --git a/hw2/p1.py b/hw2/p1.py
new file mode 100644
index 0000000..682abc1
--- /dev/null
+++ b/hw2/p1.py
@@ -0,0 +1,37 @@
+from scipy.special import gamma
+import numpy as np
+from math import exp
+import matplotlib.pyplot as plt
+from scipy.stats import rv_continuous
+
+
+def pdf(u, D):
+ return 1 / (2 ** ((D / 2.) - 1.)
+ * gamma(D / 2.)) * u ** (D - 1.) * exp(-u ** 2. / 2.)
+
+
+class my(rv_continuous):
+
+ def _pdf(self, u, D):
+ return pdf(u, D)
+
+
+def dist():
+ a = np.linspace(0, 20, 1000)
+ plt.figure(figsize=(9, 6))
+ for D in [1, 2, 5, 10, 20, 50, 100]:
+ b = [pdf(u, D) for u in a]
+ plt.plot(a, b, label="D=" + str(D))
+ plt.legend()
+ plt.savefig("dist1.pdf")
+
+
+def cdf():
+ a = my(a=0, b=float("inf"))
+ px = np.linspace(5, 15, 100)
+ plt.figure(figsize=(9, 6))
+ plt.plot(px, a.cdf(px, 100))
+ plt.savefig("cumul.pdf")
+
+if __name__ == "__main__":
+    dist()   # produces dist1.pdf (part 2)
+    cdf()    # produces cumul.pdf (part 3)
diff --git a/hw2/p2.py b/hw2/p2.py
new file mode 100644
index 0000000..1659112
--- /dev/null
+++ b/hw2/p2.py
@@ -0,0 +1,102 @@
+import numpy as np
+import seaborn
+import matplotlib.pyplot as plt
+from math import sqrt
+from scipy.stats import multivariate_normal
+
+seaborn.set_style("white")
+
+x = np.array([-1.87, -1.76, -1.67, -1.22, -0.07, 0.11, 0.67, 1.60, 2.22, 2.51])
+y = np.array([0.06, 1.67, 0.54, -1.45, -0.18, -0.67, 0.92, 2.95, 5.13, 5.18])
+
+D = 20
+eps = 1
+base = np.linspace(-2, 3, num=D)
+
+
+def map_features(x, base=base):
+ x = x.reshape(len(x), 1)
+ phi = np.exp(-(x - base) ** 2 / (2 * eps ** 2))
+ phi = np.concatenate((phi, np.ones((phi.shape[0], 1))), 1)
+ return phi
+
+
+def draw_prior(x):
+ phi = map_features(x)
+ D = phi.shape[1]
+ w = np.random.normal(0, 1, D)
+ return np.dot(phi, w)
+
+
+def posterior():
+ phi = map_features(x)
+ D = phi.shape[1]
+ invsigma = np.identity(D) + np.dot(phi.T, phi)
+ sigma = np.linalg.inv(invsigma)
+ mu = np.dot(sigma, np.dot(phi.T, y))
+ return mu, sigma
+
+
+def plot_posteriors():
+ plt.figure(figsize=(9, 6))
+ mu, sigma = posterior()
+ px = np.linspace(-2, 3, num=200)
+ phi = map_features(px)
+ plt.plot(x, y, "o", label="data")
+ for i in xrange(5):
+ w = np.random.multivariate_normal(mu, sigma)
+ plt.plot(px, np.dot(phi, w), label="post. " + str(i))
+ plt.legend(loc="lower right")
+ plt.savefig("posterior.pdf")
+
+
+def plot_tube():
+ fig = plt.figure(figsize=(9, 6))
+ mu, sigma = posterior()
+ px = np.linspace(-2, 3, num=200)
+ phi = map_features(px)
+ plt.plot(px, np.dot(phi, mu), label="post. pred. mean")
+ ax = fig.get_axes()[0]
+ inf = [np.dot(row, mu) - 1.96 * sqrt(1 + np.dot(row, np.dot(sigma, row)))
+ for row in phi]
+ sup = [np.dot(row, mu) + 1.96 * sqrt(1 + np.dot(row, np.dot(sigma, row)))
+ for row in phi]
+ ax.fill_between(px, inf, sup, alpha=0.2)
+ plt.plot(x, y, "o", label="data")
+ plt.legend(loc="lower right")
+ plt.savefig("predictive.pdf")
+
+
+def plot_priors():
+ plt.figure(figsize=(9, 6))
+ px = np.linspace(-2, 3, num=200)
+ for i in xrange(5):
+ plt.plot(px, draw_prior(px), label="prior " + str(i))
+ plt.legend(loc="lower right")
+ plt.savefig("prior.pdf")
+
+
+def marginal_likelihood():
+ eps = 1
+ ml = []
+ px = np.linspace(1, 50, num=10)
+ for D in px:
+ base = np.linspace(-2, 3, num=D)
+ phi = map_features(x, base)
+ z = np.zeros(phi.shape[0])
+ rv = multivariate_normal(mean=z, cov=np.identity(phi.shape[0])
+ + np.dot(phi, phi.T))
+ ml.append(rv.pdf(y))
+ plt.figure(figsize=(9, 6))
+ plt.bar(px, ml)
+ plt.savefig("ml.pdf")
+
+
+
+if __name__ == "__main__":
+ plot_priors()
+ plot_posteriors()
+ plot_tube()
+ marginal_likelihood()
+#plt.plot(x, y, "o")
+#plt.show()
diff --git a/hw2/p3.py b/hw2/p3.py
new file mode 100644
index 0000000..f4a3ffa
--- /dev/null
+++ b/hw2/p3.py
@@ -0,0 +1,125 @@
+import numpy as np
+from scipy.special import expit
+from scipy.optimize import minimize
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.metrics import accuracy_score
+from sklearn.cross_validation import KFold
+from sklearn import preprocessing
+import seaborn
+import matplotlib.pyplot as plt
+from math import exp
+import sys
+
+seaborn.set_style("white")
+
+
+class LogReg(BaseEstimator, ClassifierMixin):
+
+ def __init__(self, sigmas=1.):
+ self.sigmas = sigmas
+
+ def fit(self, x, y):
+ w = np.zeros(x.shape[1])
+
+ def aux(w):
+ return l_gradient(w, x, y, self.sigmas)
+
+ self.w = minimize(aux, w, jac=True, method='L-BFGS-B',
+ options={'disp': False, 'gtol': 1e-10}).x
+ return self
+
+ def predict(self, x):
+ pred = np.dot(x, self.w)
+ return (pred > 0).astype(int)
+
+
+def l(w, x, y, sigmas):
+    # Negative log-posterior L(w); note -log(expit(z)) = log(1 + exp(-z)).
+    # Float literals avoid Python 2 integer division when sigmas is an int.
+    x2 = np.dot(x, w)
+    l = (-np.log(expit(x2)).sum() + np.inner(1 - y, x2)
+         + 1. / (2 * sigmas) * np.inner(w, w))
+    return l
+
+
+def l_gradient(w, x, y, sigmas):
+    # Returns the objective L(w) and its gradient.
+    x2 = np.dot(x, w)
+    l = (-np.log(expit(x2)).sum() + np.inner(x2, (1 - y))
+         + 1. / (2 * sigmas) * np.inner(w, w))
+    grad = (np.sum(-x * expit(-x2)[:, None], axis=0)
+            + np.sum(x * (1 - y)[:, None], axis=0) + 1. / sigmas * w)
+    return l, grad
+
+
+def test_gradient(w, x, y, sigmas):
+ eps = 1e-10
+ _, g = l_gradient(w, x, y, sigmas)
+ for i in xrange(len(g)):
+ z = np.zeros(len(g))
+ z[i] = eps
+ l1 = l(w + z, x, y, sigmas)
+ z = np.zeros(len(g))
+ z[i] = -eps
+ l2 = l(w + z, x, y, sigmas)
+ print (l1 - l2) / (2 * eps), g[i]
+
+
+def cross_validation(x, y):
+    # For each regularization value, print the mean 10-fold CV accuracy and
+    # the mean test accuracy (x_test, y_test are the globals loaded below).
+    for sigmas in [10 ** i for i in xrange(-8, 9)]:
+        cv_scores = []
+        test_scores = []
+        for train, test in KFold(n=x.shape[0], n_folds=10, shuffle=True):
+            mod = LogReg(sigmas=sigmas)
+            x_train_f = x[train]
+            y_train_f = y[train]
+            x_test_f = x[test]
+            y_test_f = y[test]
+            mod = mod.fit(x_train_f, y_train_f)
+            cv_scores.append(accuracy_score(y_test_f,
+                                            mod.predict(x_test_f)))
+            test_scores.append(accuracy_score(y_test, mod.predict(x_test)))
+        print np.mean(cv_scores), np.mean(test_scores)
+
+
+def plot_scores():
+ plt.figure(figsize=(9, 6))
+ values = [map(float, line.strip().split())
+ for line in open(sys.argv[1])]
+ y1, y2 = zip(*values)
+ x = xrange(-8, 9)
+ y1 = 1 - np.array(y1)
+ y2 = 1 - np.array(y2)
+ plt.plot(x, y1, label='CV error')
+ plt.plot(x, y2, label='Test error')
+ plt.legend()
+ plt.xlabel("log(sigma^2)")
+ plt.ylabel("Error")
+ plt.savefig(sys.argv[1] + ".pdf")
+
+
+def three(x, y):
+ sigmas = exp(2)
+ mod = LogReg(sigmas=sigmas)
+ mod.fit(x, y)
+ print 1 - accuracy_score(y, mod.predict(x))
+ print 1 - accuracy_score(y_test, mod.predict(x_test))
+ print mod.w
+
+
+def norm(x):
+ for i in [54, 55, 56]:
+ x[:, i] = np.log(1 + x[:, i])
+ return x
+
+if __name__ == "__main__":
+ data_train = np.loadtxt("spam.train.dat")
+ x_train = data_train[:, :-1]
+ y_train = data_train[:, -1].astype(int)
+ data_test = np.loadtxt("spam.test.dat")
+ x_test = data_test[:, :-1]
+ y_test = data_test[:, -1].astype(int)
+ #x_train = preprocessing.scale(x_train)
+ #x_test = preprocessing.scale(x_test)
+ #x_train = norm(x_train)
+ #x_test = norm(x_test)
+ cross_validation(x_train, y_train)
+ #plot_scores()
+ #three(x_train, y_train)
diff --git a/hw2/w.txt.pdf b/hw2/w.txt.pdf
new file mode 100644
index 0000000..10e02fa
--- /dev/null
+++ b/hw2/w.txt.pdf
@@ -0,0 +1,16 @@
+[ -1.16313019e-01 -1.86665945e-01 6.97073481e-02 1.08220421e+01
+ 3.79967800e-01 2.55576319e-01 8.97469186e-01 2.24521456e-01
+ 1.96371840e-01 9.65848716e-03 -1.84768134e-03 -2.10674040e-02
+ 7.25815774e-03 -1.73726085e-03 3.78833303e-01 7.27626576e-01
+ 3.49140543e-01 7.38916828e-02 1.89606569e-01 5.01057133e-01
+ 1.51101469e-01 2.66337989e-01 1.76604724e+00 3.85382385e-01
+ -1.92398194e+00 -6.65178155e-01 -2.41555043e+00 1.63766156e-01
+ -2.31952937e-01 -1.37873458e-01 -3.86235131e-02 1.88487314e+00
+ -4.75460995e-01 -1.63762752e+00 -3.51540891e-01 3.48965393e-01
+ -2.85733418e-01 -1.09756723e-01 -2.27348768e-01 -6.50708679e-01
+ -3.71495595e-01 -8.24659275e-01 -9.70155511e-02 -5.92524802e-01
+ -7.01454079e-01 -7.61876779e-01 -1.52936023e-01 -3.05077963e-01
+ -3.85494128e-01 -2.82554055e-02 -1.80762491e-01 3.03846017e-01
+ 1.39227907e+00 1.09831231e+00 6.43297348e+00 1.19357017e+00
+ 1.66187736e-01]
+