% Use the 'submit' option when you submit your solutions.
\documentclass[submit]{harvardml}

% Put in your full name and email address.
\name{Thibaut Horel}
\email{thorel@seas.harvard.edu}

\usepackage[utf8x]{inputenc}

% List any people you worked with.
\collaborators{%
}

% You don't need to change these.
\course{CS281-F13}
\assignment{Assignment \#2}
\duedate{23:59pm October 7, 2015}

\usepackage{graphicx}

% Useful macros.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\newcommand{\given}{\,|\,}
\newcommand{\prob}{\mathbb{P}}
\newcommand{\trans}{\mathsf{T}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\distNorm}{\mathcal{N}}
\newcommand{\bzero}{\mathbf{0}}
\newcommand{\ident}{\mathbb{I}}

\begin{document}

\begin{problem}[10pts]
One intuitive way to summarize a probability density is via the mode, as
this is the ``most likely'' value in some sense. A common example of this
is using the maximum \textit{a posteriori} (MAP) estimate of a model's
parameters. In high dimensions, however, the mode becomes less and less
representative of typical samples. Consider variates from
a~$D$-dimensional zero mean spherical Gaussian with unit variance:
\begin{align*}
  \bx &\sim \distNorm(\bzero_D, \ident_D),
\end{align*}
where~$\bzero_D$ indicates a column vector of~$D$ zeros and~$\ident_D$ is
a~${D\times D}$ identity matrix.
\begin{enumerate}
\item Compute the distribution that this implies over the distance of
  these points from the origin. That is, compute the distribution
  over~$\sqrt{\bx^\trans\bx}$, if~$\bx$ is a realization
  from~$\distNorm(\bzero_D, \ident_D)$. (Hint: Consider transformations
  of a Gamma distribution.)
\item Make a plot that shows this probability density function for
  several different values of~$D$, up to~${D=100}$.
\item Make a plot of the cumulative distribution function (CDF) over this
  distance distribution for~${D=100}$. (A closed-form solution may be
  difficult to compute, so you can do this numerically.)
\item From examining the CDF we can think about where most of the mass
  lives as a function of radius. For example, most of the mass
  for~${D=100}$ is within a thin spherical shell. From eyeballing the
  plot, what are the inner and outer radii for the shell that contains
  90\% of the mass in this case?
\end{enumerate}
\end{problem}

\paragraph{Solution.} The code for this problem can be found in file
\textsf{p1.py}.

1. By definition of $\bx$, its coordinates are independent univariate
Gaussian variables with mean $0$ and variance $1$. Hence,
$\bx^T\bx = \sum_{i=1}^D\bx_i^2$ follows a chi-squared distribution with
$D$ degrees of freedom. So for all $0\leq a\leq b$ we have:
\begin{align*}
  p(a\leq \sqrt{\bx^T\bx}\leq b) &= p(a^2\leq \bx^T\bx \leq b^2)
  = \frac{1}{2^{D/2}\Gamma(D/2)}\int_{a^2}^{b^2}x^{D/2-1}e^{-x/2}\,dx\\
  &= \frac{1}{2^{D/2-1}\Gamma(D/2)}\int_a^b u^{D-1}e^{-u^2/2}\,du
\end{align*}
where we used the density of the chi-squared distribution for the second
equality and the change of variable $u=\sqrt{x}$ for the last equality.
Since this is true for all $a,b$, the density of the distance is given by:
\begin{displaymath}
  p(u) = \frac{1}{2^{D/2-1}\Gamma(D/2)}\, u^{D-1}e^{-u^2/2},
\end{displaymath}
which is the chi distribution with $D$ degrees of freedom.
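The plots in parts 2--4 below were produced with \textsf{p1.py}; a minimal
sketch of the numerical work, assuming \textsf{scipy} and
\textsf{matplotlib} are available (the particular values of $D$ and the
file names are illustrative), is:
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi

# part 2: density of sqrt(x^T x), i.e. the chi distribution derived above
u = np.linspace(0, 15, 1000)
for D in [1, 2, 5, 10, 25, 50, 100]:
    plt.plot(u, chi(D).pdf(u), label="D = %d" % D)
plt.legend()
plt.savefig("dist1.pdf")

# parts 3-4: CDF for D = 100 and the central 90% shell
dist = chi(100)
plt.figure()
plt.plot(u, dist.cdf(u))
plt.savefig("cumul.pdf")
print(dist.ppf(0.05), dist.ppf(0.95))  # compare with the radii eyeballed in part 4
\end{verbatim}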
2. The pdf for different values of $D$ can be found below:
\begin{center}
  \includegraphics[width=0.8\textwidth]{dist1.pdf}
\end{center}

3. The cdf plot for $D=100$ is below:
\begin{center}
  \includegraphics[width=0.8\textwidth]{cumul.pdf}
\end{center}

4. Eyeballing the plot, 90\% of the mass is in the shell whose inner and
outer radii are $9$ and $11$ respectively.

\begin{problem}[Bayesian Linear Regression, 30pts]
Here are some one-dimensional data to regress:
\begin{verbatim}
x = [-1.87 -1.76 -1.67 -1.22 -0.07 0.11 0.67 1.60 2.22 2.51]'
y = [0.06 1.67 0.54 -1.45 -0.18 -0.67 0.92 2.95 5.13 5.18]'
\end{verbatim}
Construct a Bayesian linear regression model using Gaussian observation
noise with fixed variance $\sigma^2$, and a basis of your choosing (e.g.,
polynomial, sinusoids, radial basis functions). For the prior on the
regression weights, use a normal distribution:
$$p(\bw) = \distNorm(\bw | \bw_0, \mathbf V_0)$$
\begin{enumerate}
\item Write down:
  \begin{enumerate}
  \item your basis
  \item the posterior distribution on $\bw$ given the data:
    $p(\bw | \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$.
  \item the posterior predictive distribution
    $p(y^\star | x^\star, \bx, \by, \bw_0, \mathbf V_0, \sigma^2)$ of a
    new function value $y^\star = f(x^\star)$ obtained by integrating out
    $\bw$.
  \item The marginal likelihood,
    $p(\by | \bx, \bw_0, \boldsymbol V_0, \sigma^2)$, obtained by
    integrating out the regression weights.
  \end{enumerate}
\item Choose values of $\bw_0, \mathbf V_0$ and $\sigma^2$ that seem
  sensible for the regression weight prior and the noise. Plot several
  random functions drawn from your prior.
\item Plot the data, as well as several typical posterior samples of the
  function given the data.
\item Plot the 95\% central credible interval region of the predictive
  density as a function of~$x$. That is, produce a plot that shows the
  ``tube'' containing most of the functions that are consistent with the
  data under your model.
\item Try varying the basis functions used. For example, you could
  change the order of polynomial, or how many radial basis functions to
  put down. Produce a bar plot of the marginal likelihood for several of
  these hypotheses. Which hypotheses are supported by the data?
\end{enumerate}
\end{problem}

\paragraph{Solution.}

1. (a) I chose Gaussian radial basis functions as my basis:
\begin{displaymath}
  \phi_j(x) = \exp\left(\frac{-(x-\mu_j)^2}{2\epsilon^2}\right)
\end{displaymath}
For the plots I chose 10 basis functions whose centers $\mu_j$ were evenly
spread over the interval $[-2, 3]$, with $\epsilon = 1$.

(b) Let us denote by $\Phi(\bx)$ the design matrix obtained by applying
the basis functions to the observations. That is, the $i$th row of
$\Phi(\bx)$ is $\phi(x_i) = [\phi_1(x_i),\ldots,\phi_N(x_i)]$ where $N$ is
the number of basis functions. Then the posterior is given by:
\begin{displaymath}
  p(\bw \given \bx, \by) \propto
  \distNorm(\by \given \Phi(\bx)\bw, \sigma^2 Id)\,
  \distNorm(\bw \given \bw_0, \mathbf V_0)
\end{displaymath}
Using standard results for Gaussian variables (e.g.\ Theorem 4.4.1 from
Murphy) we get that $p(\bw \given \bx, \by) = \distNorm(\bw \given \mu,
\Sigma)$ with:
\begin{displaymath}
  \Sigma^{-1} = V_0^{-1} + \frac{1}{\sigma^2}\Phi(\bx)^T\Phi(\bx)
  \quad\text{and}\quad
  \mu = \Sigma\left(V_0^{-1}\bw_0 + \frac{1}{\sigma^2}\Phi(\bx)^T\by\right)
\end{displaymath}
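A minimal sketch of these two steps (the RBF features and the posterior
update), assuming \textsf{numpy}; the variable names are illustrative and
the plotting code used for the figures below is omitted:
\begin{verbatim}
import numpy as np

x = np.array([-1.87, -1.76, -1.67, -1.22, -0.07, 0.11, 0.67, 1.60, 2.22, 2.51])
y = np.array([0.06, 1.67, 0.54, -1.45, -0.18, -0.67, 0.92, 2.95, 5.13, 5.18])

mus = np.linspace(-2, 3, 10)   # centers of the 10 basis functions
eps = 1.0                      # width of the basis functions
sigma2 = 1.0                   # observation noise variance

def Phi(x):
    # design matrix: row i is [phi_1(x_i), ..., phi_N(x_i)]
    return np.exp(-(x[:, None] - mus[None, :]) ** 2 / (2 * eps ** 2))

# prior N(w0, V0) and posterior N(mu, Sigma), using the formulas above
w0, V0 = np.zeros(len(mus)), np.eye(len(mus))
P = Phi(x)
Sigma = np.linalg.inv(np.linalg.inv(V0) + P.T @ P / sigma2)
mu = Sigma @ (np.linalg.inv(V0) @ w0 + P.T @ y / sigma2)
\end{verbatim}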
(c) Again using Theorem 4.4.1, the posterior predictive is:
\begin{displaymath}
  y^* \given x^*, \bx, \by \sim
  \distNorm\left(\phi(x^*)^T\mu,\ \sigma^2 + \phi(x^*)^T \Sigma\, \phi(x^*)\right)
\end{displaymath}

(d) And the marginal likelihood:
\begin{displaymath}
  p(\by\given \bx) = \distNorm(\by \given \Phi(\bx)\bw_0,\
  \sigma^2 Id + \Phi(\bx)V_0\Phi(\bx)^T)
\end{displaymath}
I am using a slight abuse of notation here: the marginal likelihood is
computed by evaluating the density of the normal distribution whose
parameters are indicated at the point $\by$.

2. I chose $\bw_0$ to be the zero vector, $\mathbf V_0$ to be $Id$ and
$\sigma^2$ to be 1. Several functions drawn from the prior are shown
below:
\begin{center}
  \includegraphics[width=0.7\textwidth]{prior.pdf}
\end{center}

3. The data and several functions drawn from the posterior are shown
below:
\begin{center}
  \includegraphics[width=0.8\textwidth]{posterior.pdf}
\end{center}

4. The data, the mean of the posterior predictive and the 95\% interval of
the posterior predictive (blue-grey tube) can be found below. For any
point $x$, the posterior predictive distribution at this point is a
univariate Gaussian whose mean and variance were computed in part 1. The
95\% interval is computed by taking $1.96$ standard deviations on either
side of the mean.
\begin{center}
  \includegraphics[width=0.8\textwidth]{predictive.pdf}
\end{center}

5. The plot below shows the marginal likelihood as a function of the
number of basis functions in my RBF basis.
\begin{center}
  \includegraphics[width=0.8\textwidth]{ml.pdf}
\end{center}
It seems that a basis of about 25 functions is a reasonable hypothesis.
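The bar plot uses the marginal-likelihood formula from part 1(d). A
minimal sketch of how its logarithm could be evaluated for several basis
sizes, reusing \texttt{x} and \texttt{y} from the sketch above and
assuming \textsf{scipy} is available (the names and range of sizes are
illustrative):
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

def log_marginal_likelihood(x, y, n_basis, eps=1.0, sigma2=1.0):
    # log N(y | Phi(x) w0, sigma^2 Id + Phi(x) V0 Phi(x)^T)
    mus = np.linspace(-2, 3, n_basis)
    P = np.exp(-(x[:, None] - mus[None, :]) ** 2 / (2 * eps ** 2))
    w0, V0 = np.zeros(n_basis), np.eye(n_basis)
    mean = P @ w0
    cov = sigma2 * np.eye(len(x)) + P @ V0 @ P.T
    return multivariate_normal(mean, cov).logpdf(y)

ns = list(range(2, 41, 2))
plt.bar(ns, [log_marginal_likelihood(x, y, n) for n in ns])
plt.savefig("ml.pdf")
\end{verbatim}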
\begin{problem}[30pts, Hastie et al., Murphy]
In this problem, we'll apply logistic regression to a data set of spam
email. These data consist of 4601 email messages, from which 57 features
have been extracted. These are as follows:
\begin{itemize}
\item $48$ features in $[0,100]$, giving the percentage of words in a
  given message which match a given word on a list containing, e.g.,
  ``business'', ``free'', etc.
\item $6$ features in $[0,100]$, giving the percentage of characters in
  the email that match characters on a list containing, e.g., ``\$'',
  ``\#'', etc.
\item Feature 55: The average length of an uninterrupted sequence of
  capital letters.
\item Feature 56: The length of the longest uninterrupted sequence of
  capital letters.
\item Feature 57: The sum of the lengths of uninterrupted sequences of
  capital letters.
\end{itemize}
There are files \texttt{spam.train.dat} and \texttt{spam.test.dat}
(available on the course website) in which each row is an email. There
are 3000 training and 1601 test examples. The final column in each file
indicates whether the email was spam.
\begin{enumerate}
\item Code up $\ell_2$-regularized logistic regression. That is, code up
  a model of the form:
  \begin{align}
    p(\mathbf{w}) & = \mathcal{N}(0, \sigma^2 I) \\
    p(y_i = 1 | \mathbf{x}, \mathbf{w}) & = \frac{1}{1 + \exp(-\mathbf{w}^T \mathbf{x})}
  \end{align}
  and use it to compute the MAP estimate of $\mathbf{w}$, given the
  training examples. Compute the gradients of the posterior probability
  of the data with respect to $\mathbf{w}$, and check that they're
  correct numerically. Show the code you used for computing the
  derivatives.
\item Use 10-fold cross-validation on the training set to determine an
  appropriate regularization penalty $\sigma^2$. Plot different values
  of $\log_e \sigma^2$ ranging from -8 to 8 against the cross-validation
  error and test error. Which is the best?
\item There are different ways one might preprocess the data. One
  typical thing to do is to linearly ``standardize'' each input feature
  so that it has mean zero and variance one. Do this standardization and
  evaluate the model again. Report the discovered $\bw$, and the
  training and test error after performing cross-validation as in
  part 2.
\item It may also be the case that some dimensions are better represented
  as binary variables, or by taking the logarithm, than by the given
  values. Hypothesize whether any of the variables in this data set
  might be better coded as binary (or using their log), and explain your
  reasoning. Then do the transform and apply the same cross-validation
  as before. Do your results improve? Conjecture why or why not.
\end{enumerate}
\end{problem}

\paragraph{Solution.} The code for this problem can be found in
\textsf{p3.py}.

1. The MAP estimate of $w$ is given by:
\begin{displaymath}
  w^* = \argmax_{w} \sum_{i=1}^n \log p(y_i \given x_i, w) + \log p(w)
\end{displaymath}
For $\ell_2$-regularized logistic regression:
\begin{align*}
  \log p(y_i\given x_i, w)
  &= - y_i\log\left(1+\exp(-w^Tx_i)\right)
     + (1- y_i)\log\left(\frac{\exp(-w^Tx_i)}{1+\exp(-w^Tx_i)}\right)\\
  &= -(1-y_i) w^Tx_i -\log\left(1+\exp(-w^T x_i)\right)
\end{align*}
and:
\begin{displaymath}
  \log p(w) = -\frac{1}{2\sigma^2}\|w\|^2 + C
\end{displaymath}
where $C$ is a normalization constant which does not depend on $w$. So
the MAP optimization problem can be written as:
\begin{displaymath}
  w^* = \argmin_w \sum_{i=1}^n \log\left(1+\exp(-w^Tx_i)\right)
  + \sum_{i=1}^n(1-y_i)w^Tx_i + \frac{1}{2\sigma^2}\|w\|^2
\end{displaymath}
Denoting by $L(w)$ the objective function of the above minimization
problem, the gradient is given by:
\begin{displaymath}
  \nabla L(w) = \sum_{i=1}^n \frac{-x_i}{1+\exp(w^Tx_i)}
  + \sum_{i=1}^n (1-y_i)x_i + \frac{1}{\sigma^2}w
\end{displaymath}
The code to compute $L$ and its gradient and to test the gradient can be
found in \textsf{p3.py}; a sketch is given below.
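A minimal sketch of the objective $L$, its gradient, and a
finite-difference check, assuming \textsf{numpy} and \textsf{scipy}, with
\texttt{X} the matrix of training features and \texttt{y} the $0/1$
labels (the names are illustrative; the actual implementation is in
\textsf{p3.py}):
\begin{verbatim}
import numpy as np
from scipy.special import expit   # expit(t) = 1 / (1 + exp(-t))

def L(w, X, y, sigma2):
    # sum log(1+exp(-w^T x_i)) + sum (1-y_i) w^T x_i + ||w||^2 / (2 sigma^2)
    z = X @ w
    return (np.logaddexp(0, -z).sum() + ((1 - y) * z).sum()
            + w @ w / (2 * sigma2))

def grad_L(w, X, y, sigma2):
    # sum -x_i / (1+exp(w^T x_i)) + sum (1-y_i) x_i + w / sigma^2
    z = X @ w
    return -X.T @ expit(-z) + X.T @ (1 - y) + w / sigma2

def check_grad(w, X, y, sigma2, h=1e-6):
    # compare the analytic gradient with centered finite differences
    g = grad_L(w, X, y, sigma2)
    for j in range(len(w)):
        e = np.zeros_like(w)
        e[j] = h
        num = (L(w + e, X, y, sigma2) - L(w - e, X, y, sigma2)) / (2 * h)
        assert abs(num - g[j]) < 1e-4 * max(1.0, abs(g[j]))
\end{verbatim}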
2. The plot of the cross-validation error and test error as a function of
$\log \sigma^2$ can be found below:
\begin{center}
  \includegraphics[width=0.8\textwidth]{scores_txt.pdf}
\end{center}
I used the fraction of misclassified samples as the measure of error.
Strangely, the plot does not exhibit the standard ``sweet spot''
behavior: the test error should increase for large values of $\sigma^2$,
because at that point we are simply over-fitting the data, but I have not
been able to understand why I am not observing this. In practice, I
would use $\log\sigma^2 = -2$.

3. The cross-validation plot for the standardized data can be found
below:
\begin{center}
  \includegraphics[width=0.8\textwidth]{scores2_txt.pdf}
\end{center}
The error is much smaller in this case. For $\log\sigma^2 = 2$, I
obtained a cross-validation error of 0.073 and a test error of 0.086.
The value of $w$ can be found in the file \textsf{w.txt.pdf}.

4. I believe features 55, 56 and 57 would be better represented by taking
their logarithms. My reasoning is the following: if, for example, an
email is written entirely in capital letters, these features will simply
be equal to the length of the email. But two emails of different lengths
written entirely in capital letters should be considered equally
suspicious. In other words, as long as there are many capital letters,
whether there are 100 or 1000, it seems to me that the emails are almost
equally likely to be spam. Taking the logarithm emphasizes the order of
magnitude rather than the raw count (a sketch of this transformation is
given below). The plot of the CV and test error after this
transformation can be found here:
\begin{center}
  \includegraphics[width=0.8\textwidth]{scores3_txt.pdf}
\end{center}
Despite the transformation, the error stays essentially the same.
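A minimal sketch of this preprocessing step, assuming the data files are
whitespace-separated and loadable with \textsf{numpy} (the column indices
are illustrative; the actual code is in \textsf{p3.py}):
\begin{verbatim}
import numpy as np

data = np.loadtxt("spam.train.dat")
X, y = data[:, :-1], data[:, -1]

# log-transform features 55-57 (0-indexed columns 54-56);
# log1p(v) = log(1 + v) keeps zero counts finite
X[:, 54:57] = np.log1p(X[:, 54:57])

# then standardize every column as in part 3
X = (X - X.mean(axis=0)) / X.std(axis=0)
\end{verbatim}

\end{document}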