% Use the 'submit' option when submitting your solutions;
% remove it to typeset the problems by themselves.
\documentclass[submit]{harvardml}

% Put in your full name and email address.
\name{Thibaut Horel}
\email{thorel@seas.harvard.edu}

% List any people you worked with.
\collaborators{%
}

% You don't need to change these.
\course{CS281-F15}
\assignment{Assignment \#0}
\duedate{11:59pm September 11, 2015}

\usepackage{url, enumitem}
\usepackage{amsfonts, amsmath}
\usepackage{listings}

% Some useful macros.
\newcommand{\given}{\,|\,}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\prob}{\mathbb{P}}
\newcommand{\var}{\text{var}}
\newcommand{\cov}{\text{cov}}

\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\noindent Please turn in your solutions \textbf{via canvas} by the due date listed above.
\\

\noindent This assignment is intended to ensure that you have the background required for CS281. You should be able to answer the problems below without complicated calculations. All questions are worth $70/6 = 11.\bar{6}$ points unless stated otherwise.

\begin{problem}[Variance and covariance]
Let $X$ and~$Y$ be two independent random variables.

\begin{enumerate}[label=(\alph*)]
\item Show that the independence of~$X$ and~$Y$ implies that their
covariance is zero.

\item For a scalar constant~$a$, show the following two properties:
\begin{align*}
  \E(X + aY) &= \E(X) + a\E(Y)\\
  \var(X + aY) &= \var(X) + a^2\var(Y)
\end{align*}
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) Expanding the product and using linearity of
expectation, we can write:
\begin{align*}
    \cov(X, Y) &= \E\big[(X- \E X)(Y - \E Y)\big]
    = \E(XY) - 2 \E(X)\E(Y) + \E(X)\E(Y)\\
    &= \E(XY) - \E(X)\E(Y) = 0
\end{align*}
where the last equality uses that for two independent variables $\E(XY)
= \E(X)\E(Y)$. The proof of this fact is as follows. Let us denote by $f_{X,Y}$
the joint pdf of $(X, Y)$ and by $f_X$ (resp. $f_Y$) the pdf of $X$ (resp.
$Y$). By independence, we have $f_{X,Y}(x,y) = f_X(x)f_Y(y)$, hence:
\begin{displaymath}
    \E(XY) = \iint xy f_{X,Y}(x, y)dxdy = \iint xy f_X(x)f_Y(y)dxdy
    = \int x f_X(x)dx \int y f_Y(y)dy = \E(X)\E(Y)
\end{displaymath}
where the penultimate equality uses Fubini's theorem. If the variables do not
have pdfs, the same argument can be made to work using slightly more advanced
measure theory (which I take to be beyond the scope of this homework).

(b) The first property is true even for non-independent variables and follows
directly from the linearity of the integral. The proof is as follows:
\begin{align*}
    \E(X+aY) &= \iint (x+ay) f_{X,Y}(x,y)dxdy = \int xdx\int f_{X,Y}(x,y)dy
    + a\int ydy\int f_{X,Y}(x,y)dx \\
    &= \int xf_X(x)dx + a\int yf_Y(y)dy = \E(X) + a\E(Y)
\end{align*}
where we used linearity of the integral and Fubini's theorem for the second
equality and the definition of marginal distributions for the penultimate
equality.

For the second property:
\begin{align*}
    \var(X+aY) &= \E\big[(X+aY)^2\big] - \big[\E(X + aY)\big]^2\\
               &=  \E(X^2 + a^2Y^2 + 2aXY)
    - \E(X)^2 - a^2\E(Y)^2 - 2a\E(X)\E(Y)\\
               &= \E(X^2) - \E(X)^2 + a^2\big[\E(Y^2) - \E(Y)^2\big]
    + 2a\big[\E(XY) - \E(X)\E(Y)\big] = \var(X) + a^2\var(Y)
\end{align*}
where the second equality uses the first property (linearity of expectation) to
expand the second term, the third equality is a regrouping of terms, and the
last equality uses independence, via part (a), to see that the last term
vanishes.
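
As a quick numerical sanity check (my own addition, not asked for by the
problem), the following snippet estimates both sides of the variance identity
by Monte Carlo from two independent samples; the two printed values agree up
to sampling noise.

\begin{lstlisting}[language=python]
import numpy as np

rng = np.random.default_rng(0)
a = 3.0
# Two independent samples; any two distributions would do.
x = rng.normal(loc=1.0, scale=2.0, size=1_000_000)
y = rng.exponential(scale=0.5, size=1_000_000)

print(np.var(x + a * y))             # var(X + aY)
print(np.var(x) + a**2 * np.var(y))  # var(X) + a^2 var(Y)
\end{lstlisting}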
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Densities]
Answer the following questions.
\begin{enumerate}[label=(\alph*)]
  \item Can a probability density function (pdf) ever take values greater than 1?
  \item Let $X$ be a univariate normally distributed random variable with mean 0 and variance $1/100$. What is the pdf of $X$?
  \item What is the value of this pdf at 0?
  \item What is the probability that $X = 0$?
  \item Explain the discrepancy.
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) Yes, a pdf can take values larger than 1. Its integral
over $\R$ must equal 1, but it can exceed 1 on sets of small measure. For
example, the uniform distribution on $[0, 1/2]$ has a pdf constant equal to $2$
on the interval $[0, 1/2]$ (and equal to $0$ elsewhere).

In fact, a pdf can take arbitrarily large values. Consider the random variable
whose pdf $f$ is $0$ everywhere except on the intervals
$[n, n+\frac{1}{n^3}]$ for each positive integer $n$, where it takes the value
$6n/\pi^2$. Then:
\begin{displaymath}
    \int_\R f(x)dx = \sum_{n\geq 1} \frac{1}{n^3} \frac{6n}{\pi^2}
    = \frac{6}{\pi^2}\sum_{n\geq 1} \frac{1}{n^2} = 1
\end{displaymath}

(b) The pdf of this normal variable is:
\begin{displaymath}
    f(x) = \sqrt{\frac{50}{\pi}} \exp\left(-50x^2\right)
\end{displaymath}

(c) The value of this pdf at 0 is $\sqrt{\frac{50}{\pi}}$.

(d) The probability that $X=0$ is 0 since the law of $X$ is absolutely
continuous with respect to the Lebesgue measure.

(e) The discrepancy is explained by the interpretation of the pdf: its integral
over any interval equals the probability that the random variable falls into
that interval:
\begin{displaymath}
    \prob(X\in [a,b]) = \int_a^b f_X(x) dx
\end{displaymath}
hence the probability that the variable is exactly equal to any value (interval
of length 0) is 0. To get a non-zero probability we need to consider an
interval of positive length.
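
To illustrate this numerically (my own addition), here is a quick check with
\texttt{scipy.stats}: the density of $\mathcal{N}(0, 1/100)$ at $0$ exceeds
$1$, yet the probability of a small interval around $0$ shrinks with the
interval's length.

\begin{lstlisting}[language=python]
import numpy as np
from scipy.stats import norm

X = norm(loc=0.0, scale=0.1)     # N(0, 1/100), i.e. sigma = 1/10
print(X.pdf(0.0))                # ~3.9894, i.e. sqrt(50/pi) > 1
print(np.sqrt(50 / np.pi))       # same value

eps = 1e-6
print(X.cdf(eps) - X.cdf(-eps))  # P(|X| <= eps) ~ 2*eps*pdf(0), tiny
\end{lstlisting}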

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Conditioning and Bayes' rule]
Let $\mu \in \R^m$ and $\Sigma, \Sigma' \in \R^{m \times m}$. Let $X$ be an $m$-dimensional random vector with $X \sim \mathcal{N}(\mu, \Sigma)$, and let $Y$ be a $m$-dimensional random vector such that $Y \given X \sim \mathcal{N}(X, \Sigma')$. State how each of the following is distributed. (For example: ``normally distributed with mean $z$ and variance $s$.")
\begin{enumerate}[label=(\alph*)]
  \item The unconditional distribution of $Y$.
  \item The joint distribution of the random variable $(X,Y)$.
  \item The conditional distribution of $X$ given $Y$.
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) I will use the characterization of multivariate
Gaussians in terms of their characteristic functions. Using the law of iterated
expectations, for $u\in\R^m$:
\begin{align*}
    \phi_Y(u) &= \E(e^{iu^TY}) = \E_X\left[\E\big(e^{iu^TY}\given X\big)\right]
    = \E_X[\exp(iu^TX - \frac{1}{2}u^T\Sigma'u)]
    = \E_X[\exp(iu^TX)]\exp(-\frac{1}{2}u^T\Sigma'u)\\
    &= \exp(iu^T\mu - \frac{1}{2}u^T\Sigma u)\exp(-\frac{1}{2}u^T\Sigma'u)
    = \exp(iu^T\mu - \frac{1}{2}u^T(\Sigma + \Sigma')u)
\end{align*}
where we used the characteristic functions of $Y\given X$ and $X$ (which are
both normally distributed). Hence, $Y\sim \mathcal{N}(\mu, \Sigma + \Sigma')$.

(b) Using the law of iterated expectations again, for $w\in \R^{2m}$ and
writing $w = [u\; v]$ with $u\in \R^m$ and $v\in \R^m$:
\begin{align*}
    \phi_{X,Y}(w) &= \E[e^{iu^TX + iv^TY}]
    = \E_X\big[e^{iu^TX}\E\big(e^{iv^TY}\given X\big)\big]
    = \E_X[e^{iu^TX}\exp(iv^TX - \frac{1}{2}v^T\Sigma' v)]\\
    &= \E_X\big[e^{i(u+v)^TX}\big]\exp\big(-\frac{1}{2}v^T\Sigma' v\big)
    = \exp\left(i(u+v)^T\mu - \frac{1}{2}(u+v)^T\Sigma(u+v)
    - \frac{1}{2}v^T\Sigma' v\right)
\end{align*}
Hence $(X, Y)\sim\mathcal{N}(M, S)$ with $M = [\mu\;\mu]$ and:
\begin{displaymath}
    S = \begin{pmatrix}
        \Sigma & \Sigma \\ \Sigma & \Sigma + \Sigma '
\end{pmatrix}
\end{displaymath}

(c) Using Bayes' rule:
\begin{displaymath}
    f_{X|Y}(x|Y=y) = \frac{f_{Y|X}(y|X=x)f_X(x)}{f_Y(y)}
\end{displaymath}
We already know the pdfs of all three terms on the right-hand side. Multiplying
them, we see that the right-hand side is the exponential of a quadratic form in
$x$. Identifying the coefficients of this quadratic form, we obtain
$X\given Y\sim \mathcal{N}\big((\Sigma'^{-1}
+ \Sigma^{-1})^{-1}(\Sigma'^{-1}y + \Sigma^{-1}\mu), (\Sigma^{-1}
+ \Sigma'^{-1})^{-1}\big)$.
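
As a sanity check (my own addition), the claimed posterior covariance
$(\Sigma^{-1} + \Sigma'^{-1})^{-1}$ should match the standard Gaussian
conditioning (Schur complement) formula $\Sigma - \Sigma(\Sigma
+ \Sigma')^{-1}\Sigma$ applied to the joint covariance $S$ from part (b):

\begin{lstlisting}[language=python]
import numpy as np

rng = np.random.default_rng(0)
m = 4
# Random symmetric positive-definite Sigma and Sigma'.
A = rng.normal(size=(m, m)); Sigma = A @ A.T + m * np.eye(m)
B = rng.normal(size=(m, m)); Sigmap = B @ B.T + m * np.eye(m)

inv = np.linalg.inv
posterior_cov = inv(inv(Sigma) + inv(Sigmap))            # from Bayes' rule
schur_cov = Sigma - Sigma @ inv(Sigma + Sigmap) @ Sigma  # conditioning (X, Y)
print(np.allclose(posterior_cov, schur_cov))             # True
\end{lstlisting}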
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[I can Ei-gen]
Let $X \in \R^{n \times m}$.
\begin{enumerate}[label=(\alph*)]
\item What is the relationship between the $n$ eigenvalues of $XX^T$ and the $m$ eigenvalues of $X^TX$?
\item Suppose $X$ is square (i.e., $n=m$) and symmetric. What does this tell you about the eigenvalues of $X$? What are the eigenvalues of $X + I$, where $I$ is the identity matrix?
\item Suppose $X$ is square, symmetric, and invertible. What are the eigenvalues of $X^{-1}$?
\end{enumerate}
\end{problem}

\paragraph{Solution} (a) Let $v\in \R^n$, $v\neq 0$, be an eigenvector of
$XX^T$ for an eigenvalue $\lambda \neq 0$. Then $XX^Tv = \lambda v$, hence
$X^TXX^Tv = \lambda X^Tv$. Therefore $\lambda$ is an eigenvalue of $X^TX$ with
$X^Tv$ as an eigenvector ($X^Tv\neq 0$, otherwise $XX^Tv = \lambda v = 0$,
a contradiction).

Now let $v\in\R^m$, $v\neq 0$, be in the kernel of $X^TX$. Then $v^TX^TXv
= \|Xv\|^2 = 0$, hence $Xv = 0$; that is, $v$ is in the kernel of $X$.

As a consequence, one can compute the non-zero eigenvalues (and associated
eigenvectors) of $X^TX$ from the eigenvalues and eigenvectors of $XX^T$ and the
vectors in the kernel of $X^TX$ from the kernel of $X$.

By swapping the roles of $X^T$ and $X$, we can similarly compute the
eigenvalues and eigenvectors of $XX^T$ from the kernel and
eigenvalues/eigenvectors of $X^TX$.

In short, the non-zero eigenvalues of $X^TX$ and $XX^T$ are the same.

(b) If $X$ is square and symmetric, then the eigenvalues of $X$ are real and
there is an orthonormal basis of $\R^n$ formed by eigenvectors of $X$ (this is
the spectral theorem). Writing $X = P^T D P$ where $P$ is an orthogonal matrix
and $D$ is diagonal, we have $X + I = P^TDP + P^TP = P^T(D+I)P$. Hence the
eigenvalues of $X+I$ are of the form $\lambda + 1$ where $\lambda$ is an
eigenvalue of $X$.

(c) Similarly, if $X$ is square symmetric one can write $X = P^T DP$ where $P$
is orthogonal and $D$ is diagonal. The entries on the diagonal of $D$ are the
eigenvalues of $X$. Since $X$ is invertible, its eigenvalues are non-zero.
Hence $X^{-1} = P^T D^{-1} P$ where $D^{-1}$ is the diagonal matrix whose
entries are the inverse of the diagonal entries of $D$. Hence, the eigenvalues
of $X^{-1}$ are simply the inverse of the eigenvalues of $X$.
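
A quick numerical illustration of (a) (my own addition): for a random
rectangular $X$, the non-zero eigenvalues of $XX^T$ and $X^TX$ coincide, and
the extra eigenvalues of the larger matrix are zero.

\begin{lstlisting}[language=python]
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(3, 5))         # n = 3, m = 5

print(np.linalg.eigvalsh(X @ X.T))  # 3 eigenvalues, all non-zero
print(np.linalg.eigvalsh(X.T @ X))  # 5 eigenvalues: 2 are ~0, rest match
\end{lstlisting}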

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Calculus]
Let $x, y \in \R^m$ and $A \in \R^{m \times m}$. Please answer the following questions, writing your answers in vector notation.
\begin{enumerate}[label=(\alph*)]
\item What is the gradient with respect to $x$ of $x^T y$?
\item What is the gradient with respect to $x$ of $x^T x$?
\item What is the gradient with respect to $x$ of $x^T A x$?
\item What is the gradient with respect to $x$ of $A x$?
\end{enumerate}
\end{problem}

\paragraph{Solution} Throughout this problem, I will compute the gradient by
reading it directly from the Taylor expansion. By that I mean that if I can
write
\begin{displaymath}
    f(x+h) = f(x) + h^T a + o(\|h\|)
\end{displaymath}
for some vector $a$, then, by uniqueness of the Taylor expansion, $a$ is the
gradient of $f$ at $x$.

(a) We have $(x+h)^Ty = x^Ty + h^Ty$. Hence the gradient with respect to $x$ is
constant and equal to $y$.

(b) We have $(x+h)^T(x+h) = x^Tx + 2h^Tx + h^Th = x^Tx + 2h^Tx + o(\|h\|)$.
Hence the gradient with respect to $x$ is $2x$.

(c) Similarly, we have $(x+h)^TA(x+h) = x^TAx + h^TAx + x^TAh + h^TAh = x^TAx
+ h^T(A + A^T)x + o(\|h\|)$, using $x^TAh = h^TA^Tx$. Hence the gradient with
respect to $x$ is $(A + A^T)x$, which reduces to $2Ax$ when $A$ is symmetric.

(d) We have $A(x+h) = Ax + Ah$. Hence the gradient of the $i$th coordinate of
$Ax$ is the transpose of the $i$th row of $A$, and the Jacobian of $Ax$ with
respect to $x$ is simply $A$ itself.
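
To double-check (c) numerically (my own addition), a central finite-difference
gradient of $x^TAx$ for a non-symmetric $A$ matches $(A+A^T)x$ and not $2Ax$:

\begin{lstlisting}[language=python]
import numpy as np

rng = np.random.default_rng(0)
m = 4
A = rng.normal(size=(m, m))  # deliberately non-symmetric
x = rng.normal(size=m)

f = lambda v: v @ A @ v
eps = 1e-6
# Central finite differences, one coordinate at a time.
grad_fd = np.array([(f(x + eps * e) - f(x - eps * e)) / (2 * eps)
                    for e in np.eye(m)])

print(np.allclose(grad_fd, (A + A.T) @ x, atol=1e-5))  # True
print(np.allclose(grad_fd, 2 * A @ x, atol=1e-5))      # False in general
\end{lstlisting}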

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Sanity check. This problem is worth 0 points.]
Bugs in machine learning software, as in any software, can be detected by running
tests to see if the code produces the required output for a particular input.
When one of these tests fails it is necessary to fix the code.  How do you usually track down where
the bug occurs?
\begin{itemize}
\item[A)] I use the debugger to figure out why things are behaving the way they are.
\item[B)] I add print statements to see the value of the different variables. 
Scrolling through the resulting output is easier and more efficient than using the debugger.
\item[C)] If you are good at programming your code should not have bugs.
\end{itemize}

\end{problem}

\paragraph{Solution} Answer A: I use the debugger.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Stability]
One way to solve numerical problems in the evaluation of a function is to compute
a power series approximation around the input that causes the problems.
Around what input is the following python function not numerically stable?
What is the problem?

\begin{lstlisting}[language=python]
import numpy as np
def buggy(x):
    return np.sin(x**2) / x
\end{lstlisting}
Use the power series approximation to write a numerically stable linear approximation of the function around the the problematic input. (Hint: recall L'H\^opital's rule.) Your answer should be of the following form.

\begin{lstlisting}[language=python]
import numpy as np
def linear_approximation_around_problematic_input(x):
    return ?? # return some expression here
\end{lstlisting}
\end{problem}

\paragraph{Solution} The \texttt{buggy} function is not numerically stable
around $0$: at $x = 0$ it evaluates $0/0$ and returns \texttt{nan}, and for
very small $x$ the intermediate $x^2$ can underflow (and $1/x$ overflow), so
the computed value loses all precision even though the exact value is close
to $x$.

Using the Taylor expansion of $\sin$ we have $\sin(x^2) = x^2 + O(x^6)$. Hence
$\sin(x^2)/x = x + O(x^5) = x + o(x)$. So $\sin(x^2)/x \simeq x$ when $x$ is
close to zero. Hence we get the following code:

\begin{lstlisting}[language=python]
import numpy as np
def linear_approximation_around_problematic_input(x):
    return x
\end{lstlisting}
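
For example (my own check, assuming the two functions above are in scope),
\texttt{buggy(0.0)} evaluates $0/0$ and returns \texttt{nan}, while the linear
approximation returns the correct limit value $0$:

\begin{lstlisting}[language=python]
import numpy as np

with np.errstate(invalid="ignore"):  # silence the 0/0 warning
    print(buggy(0.0))                # nan
print(linear_approximation_around_problematic_input(0.0))  # 0.0
\end{lstlisting}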
\end{document}