Diffstat (limited to 'hw0/main.tex')
| -rw-r--r-- | hw0/main.tex | 322 |
1 files changed, 322 insertions, 0 deletions
diff --git a/hw0/main.tex b/hw0/main.tex
new file mode 100644
index 0000000..ce693b6
--- /dev/null
+++ b/hw0/main.tex
@@ -0,0 +1,322 @@
% No 'submit' option for the problems by themselves.
%\documentclass{harvardml}
% Use the 'submit' option when you submit your solutions.
\documentclass[submit]{harvardml}

% Put in your full name and email address.
\name{Thibaut Horel}
\email{thorel@seas.harvard.edu}

% List any people you worked with.
\collaborators{%
}

% You don't need to change these.
\course{CS281-F15}
\assignment{Assignment \#0}
\duedate{11:59pm September 11, 2013}

\usepackage{url, enumitem}
\usepackage{amsfonts, amsmath}
\usepackage{listings}

% Some useful macros.
\newcommand{\given}{\,|\,}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\prob}{\mathbb{P}}
\newcommand{\var}{\text{var}}
\newcommand{\cov}{\text{cov}}

\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\noindent Please turn in your solutions \textbf{via Canvas} by the due date listed above.
\\

\noindent This assignment is intended to ensure that you have the background required for CS281. You should be able to answer the problems below without complicated calculations. All questions are worth $70/6 = 11.\bar{6}$ points unless stated otherwise.

\begin{problem}[Variance and covariance]
Let $X$ and~$Y$ be two independent random variables.

\begin{enumerate}[label=(\alph*)]
\item Show that the independence of~$X$ and~$Y$ implies that their
covariance is zero.

\item For a scalar constant~$a$, show the following two properties:
\begin{align*}
  \E(X + aY) &= \E(X) + a\E(Y)\\
  \var(X + aY) &= \var(X) + a^2\var(Y)
\end{align*}
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) If $X$ and $Y$ are independent, we can write:
\begin{align*}
  \cov(X, Y) &= \E\big[(X- \E X)(Y - \E Y)\big]
  = \E(XY) - \E(X)\E(Y) - \E(Y)\E(X) + \E(X)\E(Y)\\
  &= \E(XY) - \E(X)\E(Y) = 0
\end{align*}
where the last equality uses the fact that $\E(XY) = \E(X)\E(Y)$ for
independent variables. The proof of this fact is as follows. Denote by
$f_{X,Y}$ the joint pdf of $(X, Y)$ and by $f_X$ (resp.\ $f_Y$) the pdf of $X$
(resp.\ $Y$). By independence, $f_{X,Y}(x,y) = f_X(x)f_Y(y)$, hence:
\begin{displaymath}
  \E(XY) = \iint xy f_{X,Y}(x, y)\,dx\,dy = \iint xy f_X(x)f_Y(y)\,dx\,dy
  = \int x f_X(x)\,dx \int y f_Y(y)\,dy = \E(X)\E(Y)
\end{displaymath}
where the penultimate equality uses Fubini's theorem. If the variables do not
have pdfs, the same argument can be made to work with slightly more advanced
measure theory (which I assume is beyond the scope of this assignment).

(b) The first property holds even for non-independent variables and follows
directly from the linearity of the integral:
\begin{align*}
  \E(X+aY) &= \iint (x+ay) f_{X,Y}(x,y)\,dx\,dy
  = \int x\left(\int f_{X,Y}(x,y)\,dy\right)dx
  + a\int y\left(\int f_{X,Y}(x,y)\,dx\right)dy \\
  &= \int xf_X(x)\,dx + a\int yf_Y(y)\,dy = \E(X) + a\E(Y)
\end{align*}
where we used linearity of the integral and Fubini's theorem for the second
equality and the definition of the marginal distributions for the penultimate
equality.

For the second property:
\begin{align*}
  \var(X+aY) &= \E\big[(X+aY)^2\big] - \big[\E(X + aY)\big]^2\\
  &= \E(X^2 + a^2Y^2 + 2aXY)
  - \E(X)^2 - a^2\E(Y)^2 - 2a\E(X)\E(Y)\\
  &= \E(X^2) - \E(X)^2 + a^2\big[\E(Y^2) - \E(Y)^2\big]
  + 2a\big[\E(XY) - \E(X)\E(Y)\big] = \var(X) + a^2\var(Y)
\end{align*}
where the second equality uses the first property (linearity of expectation) to
expand the second term, the third equality is algebra, and the last equality
uses independence: by part (a), the term $\E(XY) - \E(X)\E(Y)$ vanishes.
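As a quick numerical sanity check of these identities (a Monte Carlo sketch, not part of the proof; the test distributions and the constant $a$ below are arbitrary choices):

\begin{lstlisting}[language=python]
import numpy as np

# Monte Carlo check of E(X+aY) = E(X) + aE(Y),
# var(X+aY) = var(X) + a^2 var(Y), and cov(X,Y) = 0
# for independent X and Y (arbitrary test distributions).
rng = np.random.default_rng(0)
n, a = 10**6, 3.0
X = rng.normal(1.0, 2.0, size=n)    # X ~ N(1, 4)
Y = rng.exponential(0.5, size=n)    # Y independent of X
print(np.mean(X + a*Y), np.mean(X) + a*np.mean(Y))  # nearly equal
print(np.var(X + a*Y), np.var(X) + a**2*np.var(Y))  # nearly equal
print(np.cov(X, Y)[0, 1])                           # close to 0
\end{lstlisting}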
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Densities]
Answer the following questions.
\begin{enumerate}[label=(\alph*)]
  \item Can a probability density function (pdf) ever take values greater than 1?
  \item Let $X$ be a univariate normally distributed random variable with mean 0 and variance $1/100$. What is the pdf of $X$?
  \item What is the value of this pdf at 0?
  \item What is the probability that $X = 0$?
  \item Explain the discrepancy.
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) Yes, a pdf can take values greater than 1. Only its
integral must equal 1, so it can exceed 1 on intervals of length smaller than
1. For example, the uniform distribution on $[0, 1/2]$ has a pdf constantly
equal to $2$ on the interval $[0, 1/2]$ (and equal to $0$ elsewhere).

In fact, a pdf can take arbitrarily large values. Consider the random variable
whose pdf $f$ takes the value $6n/\pi^2$ on the interval
$[n, n+\frac{1}{n^3}]$ for every positive integer $n$, and is $0$ elsewhere.
This is a valid pdf since:
\begin{displaymath}
  \int_\R f(x)\,dx = \sum_{n\geq 1} \frac{1}{n^3} \cdot \frac{6n}{\pi^2}
  = \frac{6}{\pi^2}\sum_{n\geq 1} \frac{1}{n^2} = 1
\end{displaymath}
yet $f$ is unbounded.

(b) With variance $\sigma^2 = 1/100$, the pdf of this normal variable is:
\begin{displaymath}
  f(x) = \frac{1}{\sigma\sqrt{2\pi}}\exp\left(-\frac{x^2}{2\sigma^2}\right)
  = \sqrt{\frac{50}{\pi}} \exp\left(-50x^2\right)
\end{displaymath}

(c) The value of this pdf at 0 is $\sqrt{50/\pi} \approx 3.99$.

(d) The probability that $X=0$ is 0, since the law of $X$ is absolutely
continuous with respect to the Lebesgue measure.

(e) The discrepancy is explained by the interpretation of the pdf: its
integral over an interval equals the probability that the random variable
falls into that interval,
\begin{displaymath}
  \prob(X\in [a,b]) = \int_a^b f_X(x)\,dx
\end{displaymath}
so the probability that the variable exactly equals any single value (an
interval of length 0) is 0, no matter how large the pdf is at that point. To
get a non-zero probability we need an interval of positive length.
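These values are easy to confirm numerically; a small sketch, assuming SciPy is available:

\begin{lstlisting}[language=python]
import numpy as np
from scipy.stats import norm

X = norm(loc=0.0, scale=0.1)          # N(0, 1/100): standard deviation 1/10
print(X.pdf(0.0), np.sqrt(50/np.pi))  # both about 3.989 -- a pdf above 1
for eps in [0.1, 0.01, 0.001]:        # P(-eps < X < eps) -> 0 as eps -> 0
    print(X.cdf(eps) - X.cdf(-eps))
\end{lstlisting}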
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Conditioning and Bayes' rule]
Let $\mu \in \R^m$ and $\Sigma, \Sigma' \in \R^{m \times m}$. Let $X$ be an $m$-dimensional random vector with $X \sim \mathcal{N}(\mu, \Sigma)$, and let $Y$ be an $m$-dimensional random vector such that $Y \given X \sim \mathcal{N}(X, \Sigma')$. State how each of the following is distributed. (For example: ``normally distributed with mean $z$ and variance $s$.'')
\begin{enumerate}[label=(\alph*)]
  \item The unconditional distribution of $Y$.
  \item The joint distribution of the random variable $(X,Y)$.
  \item The conditional distribution of $X$ given $Y$.
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) I will use the characterization of multivariate
Gaussians in terms of their characteristic function. Using the law of iterated
expectations, for $u\in\R^m$:
\begin{align*}
  \phi_Y(u) &= \E(e^{iu^TY}) = \E_X\left[\E\big(e^{iu^TY} \given X\big)\right]
  = \E_X[\exp(iu^TX - \tfrac{1}{2}u^T\Sigma'u)]
  = \E_X[\exp(iu^TX)]\exp(-\tfrac{1}{2}u^T\Sigma'u)\\
  &= \exp(iu^T\mu - \tfrac{1}{2}u^T\Sigma u)\exp(-\tfrac{1}{2}u^T\Sigma'u)
  = \exp\big(iu^T\mu - \tfrac{1}{2}u^T(\Sigma + \Sigma')u\big)
\end{align*}
where we used the characteristic functions of $Y\given X$ and of $X$ (which
are both normally distributed). Hence, $Y\sim \mathcal{N}(\mu, \Sigma + \Sigma')$.

(b) Using the law of iterated expectations again, for $w\in \R^{2m}$, writing
$w = [u\; v]$ with $u, v\in \R^m$:
\begin{align*}
  \phi_{X,Y}(w) &= \E_{X,Y}[e^{iu^TX + iv^TY}]
  = \E_X\big[e^{iu^TX}\,\E\big(e^{iv^TY} \given X\big)\big]
  = \E_X[e^{iu^TX}\exp(iv^TX - \tfrac{1}{2}v^T\Sigma' v)]\\
  &= \exp\left(i(u+v)^T\mu - \tfrac{1}{2}(u+v)^T\Sigma(u+v)
  - \tfrac{1}{2}v^T\Sigma' v\right)
\end{align*}
Hence $(X, Y)\sim\mathcal{N}(M, S)$ with $M = [\mu\;\mu]$ and:
\begin{displaymath}
  S = \begin{pmatrix}
    \Sigma & \Sigma \\ \Sigma & \Sigma + \Sigma'
  \end{pmatrix}
\end{displaymath}
Indeed, expanding the quadratic form in $w = [u\;v]$ with this $S$ gives
$u^T\Sigma u + 2u^T\Sigma v + v^T(\Sigma+\Sigma')v
= (u+v)^T\Sigma(u+v) + v^T\Sigma' v$, matching the exponent above.

(c) Using Bayes' rule:
\begin{displaymath}
  f_{X\given Y}(x\given Y=y) = \frac{f_{Y\given X}(y\given X=x)f_X(x)}{f_Y(y)}
\end{displaymath}
We already know the pdfs of the terms appearing on the right-hand side. By
multiplying them, we see that the right-hand side is the exponential of
a quadratic form in $x$. By identifying the coefficients of this quadratic
form, we obtain $X\given Y\sim \mathcal{N}\big((\Sigma'^{-1}
+ \Sigma^{-1})^{-1}(\Sigma'^{-1}y + \Sigma^{-1}\mu),\; (\Sigma^{-1}
+ \Sigma'^{-1})^{-1}\big)$.
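Though not required, part (c) can be sanity-checked numerically. A crude Monte Carlo sketch in one dimension (all matrices reduce to scalars; the values below are arbitrary), conditioning on $Y$ falling in a narrow window around $y$:

\begin{lstlisting}[language=python]
import numpy as np

# 1-D check of X | Y ~ N((S'^-1 + S^-1)^-1 (S'^-1 y + S^-1 mu),
#                        (S^-1 + S'^-1)^-1).
rng = np.random.default_rng(0)
mu, S, Sp = 1.0, 2.0, 0.5             # prior mean/variance, noise variance
X = rng.normal(mu, np.sqrt(S), 10**6)
Y = rng.normal(X, np.sqrt(Sp))        # Y | X ~ N(X, Sp)
y = 2.0
sel = X[np.abs(Y - y) < 0.01]         # samples with Y close to y
post_var = 1.0 / (1.0/S + 1.0/Sp)
post_mean = post_var * (y/Sp + mu/S)
print(sel.mean(), post_mean)          # should be close (about 1.8)
print(sel.var(), post_var)            # should be close (0.4)
\end{lstlisting}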
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[I can Ei-gen]
Let $X \in \R^{n \times m}$.
\begin{enumerate}[label=(\alph*)]
\item What is the relationship between the $n$ eigenvalues of $XX^T$ and the $m$ eigenvalues of $X^TX$?
\item Suppose $X$ is square (i.e., $n=m$) and symmetric. What does this tell you about the eigenvalues of $X$? What are the eigenvalues of $X + I$, where $I$ is the identity matrix?
\item Suppose $X$ is square, symmetric, and invertible. What are the eigenvalues of $X^{-1}$?
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) Let $v\in \R^n$, $v\neq 0$, be an eigenvector of
$XX^T$ for an eigenvalue $\lambda \neq 0$. Then $XX^Tv = \lambda v$, hence
$X^TXX^Tv = \lambda X^Tv$. Thus $\lambda$ is an eigenvalue of $X^TX$ with
$X^Tv$ as an eigenvector (note $X^Tv\neq 0$; otherwise $XX^Tv = \lambda v = 0$,
a contradiction).

Furthermore, let $v\in\R^m$, $v\neq 0$, be in the kernel of $X^TX$. Then
$v^TX^TXv = \|Xv\|^2 = 0$, hence $Xv = 0$, i.e., $v$ is in the kernel of $X$.

As a consequence, one can compute the non-zero eigenvalues (and associated
eigenvectors) of $X^TX$ from the eigenvalues and eigenvectors of $XX^T$, and
the vectors in the kernel of $X^TX$ from the kernel of $X$.

By swapping the roles of $X^T$ and $X$, we can also compute the eigenvalues
and eigenvectors of $XX^T$ from those of $X^TX$ in a similar way.

In short, the non-zero eigenvalues of $X^TX$ and $XX^T$ are the same; the two
spectra differ only in the multiplicity of the eigenvalue $0$.

(b) If $X$ is square and symmetric, then the eigenvalues of $X$ are real and
there is an orthonormal basis of $\R^n$ formed by eigenvectors of $X$ (this is
the spectral theorem). Writing $X = P^T D P$ where $P$ is an orthogonal matrix
and $D$ is diagonal, we have $X + I = P^TDP + P^TP = P^T(D+I)P$. Hence the
eigenvalues of $X+I$ are of the form $\lambda + 1$ where $\lambda$ is an
eigenvalue of $X$.

(c) Similarly, if $X$ is square and symmetric one can write $X = P^T DP$ where
$P$ is orthogonal and $D$ is diagonal. The entries on the diagonal of $D$ are
the eigenvalues of $X$. Since $X$ is invertible, its eigenvalues are non-zero.
Hence $X^{-1} = P^T D^{-1} P$ where $D^{-1}$ is the diagonal matrix whose
entries are the inverses of the diagonal entries of $D$. Hence, the eigenvalues
of $X^{-1}$ are simply the inverses of the eigenvalues of $X$.
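A quick numerical illustration of all three parts (a sketch on a random test matrix, using NumPy's symmetric eigensolver):

\begin{lstlisting}[language=python]
import numpy as np

# Check that the non-zero eigenvalues of X X^T and X^T X agree,
# and the shift/inverse rules for a symmetric invertible matrix.
rng = np.random.default_rng(0)
X = rng.normal(size=(3, 5))
ev1 = np.linalg.eigvalsh(X @ X.T)     # 3 eigenvalues, ascending
ev2 = np.linalg.eigvalsh(X.T @ X)     # 5 eigenvalues, ascending
print(ev1)                            # matches the 3 largest of ev2
print(ev2)                            # two extra (numerically) zero ones

S = X @ X.T                           # symmetric, invertible here
print(np.linalg.eigvalsh(S + np.eye(3)))      # ev1 + 1
print(np.linalg.eigvalsh(np.linalg.inv(S)))   # 1/ev1, in reversed order
\end{lstlisting}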
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Calculus]
Let $x, y \in \R^m$ and $A \in \R^{m \times m}$. Please answer the following questions, writing your answers in vector notation.
\begin{enumerate}[label=(\alph*)]
\item What is the gradient with respect to $x$ of $x^T y$?
\item What is the gradient with respect to $x$ of $x^T x$?
\item What is the gradient with respect to $x$ of $x^T A x$?
\item What is the gradient with respect to $x$ of $A x$?
\end{enumerate}
\end{problem}

\paragraph{Solution} Throughout this problem, I will read the gradient
directly off the first-order Taylor expansion: if we can write
\begin{displaymath}
  f(x+h) = f(x) + h^T a + o(\|h\|)
\end{displaymath}
for some vector $a$, then by uniqueness of the Taylor expansion, $a$ is the
gradient of $f$ at $x$.

(a) We have $(x+h)^Ty = x^Ty + h^Ty$. Hence the gradient with respect to $x$ is
constant and equal to $y$.

(b) We have $(x+h)^T(x+h) = x^Tx + 2h^Tx + h^Th = x^Tx + 2h^Tx + o(\|h\|)$.
Hence the gradient with respect to $x$ is $2x$.

(c) Similarly, we have $(x+h)^TA(x+h) = x^TAx + h^TAx + x^TAh + h^TAh
= x^TAx + h^T(A + A^T)x + o(\|h\|)$, using $x^TAh = h^TA^Tx$. Hence the
gradient with respect to $x$ is $(A + A^T)x$, which reduces to $2Ax$ when $A$
is symmetric.

(d) We have $A(x+h) = Ax + Ah$. Hence the gradient of the $i$th coordinate of
$Ax$ is the transpose of the $i$th row of $A$, and the Jacobian of $Ax$ with
respect to $x$ is simply $A$ itself.
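These gradients can be sanity-checked with central finite differences; a small sketch on random inputs (note that $A$ is deliberately not symmetric, to exercise the $(A + A^T)x$ formula):

\begin{lstlisting}[language=python]
import numpy as np

# Finite-difference check of the gradients above at a random point.
rng = np.random.default_rng(0)
m, eps = 4, 1e-6
x, y = rng.normal(size=m), rng.normal(size=m)
A = rng.normal(size=(m, m))              # non-symmetric test matrix

def num_grad(f, x):
    """Central-difference approximation of the gradient of f at x."""
    g = np.zeros_like(x)
    for i in range(len(x)):
        e = np.zeros_like(x)
        e[i] = eps
        g[i] = (f(x + e) - f(x - e)) / (2 * eps)
    return g

print(num_grad(lambda x: x @ y, x), y)                  # (a): y
print(num_grad(lambda x: x @ x, x), 2 * x)              # (b): 2x
print(num_grad(lambda x: x @ A @ x, x), (A + A.T) @ x)  # (c): (A+A^T)x
\end{lstlisting}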
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Sanity check. This problem is worth 0 points.]
Bugs in machine learning software, as in any software, can be detected by running
tests to see if the code produces the required output for a particular input.
When one of these tests fails it is necessary to fix the code. How do you usually track down where
the bug occurs?
\begin{itemize}
\item[A)] I use the debugger to figure out why things are behaving the way they are.
\item[B)] I add print statements to see the value of the different variables.
Scrolling through the resulting output is easier and more efficient than using the debugger.
\item[C)] If you are good at programming your code should not have bugs.
\end{itemize}

\end{problem}

\paragraph{Solution} Answer A, I use the debugger.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Stability]
One way to solve numerical problems in the evaluation of a function is to compute
a power series approximation around the input that causes the problems.
Around what input is the following python function not numerically stable?
What is the problem?

\begin{lstlisting}[language=python]
import numpy as np
def buggy(x):
    return np.sin(x**2) / x
\end{lstlisting}
Use the power series approximation to write a numerically stable linear approximation of the function around the problematic input. (Hint: recall L'H\^opital's rule.) Your answer should be of the following form.

\begin{lstlisting}[language=python]
import numpy as np
def linear_approximation_around_problematic_input(x):
    return ?? # return some expression here
\end{lstlisting}
\end{problem}

\paragraph{Solution} The \texttt{buggy} function is not numerically stable
around $0$: at $x = 0$ the expression is the indeterminate form $0/0$ (NumPy
returns \texttt{nan}), and for very small $x$ the computation divides one tiny
quantity by another, so rounding and underflow errors dominate the result.

Using the Taylor expansion of $\sin$, we have $\sin(x^2) = x^2 + O(x^6)$. Hence
$\sin(x^2)/x = x + O(x^5) = x + o(x)$, so $\sin(x^2)/x \simeq x$ when $x$ is
close to zero. Hence we get the following code:

\begin{lstlisting}[language=python]
import numpy as np
def linear_approximation_around_problematic_input(x):
    return x
\end{lstlisting}
\end{document}