% No 'submit' option for the problems by themselves.
%\documentclass{harvardml}
% Use the 'submit' option when you submit your solutions.
\documentclass[submit]{harvardml}
% Put in your full name and email address.
\name{Thibaut Horel}
\email{thorel@seas.harvard.edu}
% List any people you worked with.
\collaborators{%
}
% You don't need to change these.
\course{CS281-F15}
\assignment{Assignment \#1}
\duedate{11:59pm September 23, 2015}
\usepackage{url, enumitem}
\usepackage{amsfonts}
\usepackage{listings}
\usepackage{bm}
\usepackage{graphicx}
% Some useful macros.
\newcommand{\prob}{\mathbb{P}}
\newcommand{\given}{\,|\,}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\var}{\text{var}}
\newcommand{\cov}{\text{cov}}
\newcommand{\N}{\mathcal{N}}
\newcommand{\ep}{\varepsilon}
\newcommand{\Dir}{\text{Dirichlet}}
\theoremstyle{plain}
\newtheorem{lemma}{Lemma}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{problem}[The Gaussian algebra, 10pts]
Let $X$ and $Y$ be independent univariate Gaussian random variables. In the previous problem set, you likely used the fact that $X + Y$ is also Gaussian. Here you'll prove this fact.
\begin{enumerate}[label=(\alph*)]
\item Suppose $X$ and $Y$ have mean 0 and variances $\sigma_X^2$ and $\sigma_Y^2$ respectively. Write the pdf of $X + Y$ as an integral.
\item Evaluate the integral from the previous part to find a closed-form expression for the pdf of $X+Y$, then argue that this expression implies that $X+Y$ is also Gaussian with mean $0$ and variance $\sigma_X^2 + \sigma_Y^2$. Hint: what is the integral, over the entire real line, of
\[
f(x) = \frac{1}{\sqrt{2\pi}s} \exp\left( -\frac{1}{2s^2}(x - m)^2 \right) ,
\] i.e., the pdf of a univariate Gaussian random variable?
\item Extend the above result to the case in which $X$ and $Y$ may have arbitrary means.
\item Univariate Gaussians are supported on the entire real line. Sometimes this is undesirable because we're modeling a quantity with positive support. A common way to transform a Gaussian to solve this problem is to exponentiate it. Suppose $X$ is a univariate Gaussian with mean $\mu$ and variance $\sigma^2$. What is the pdf of $e^X$?
\end{enumerate}
\end{problem}
\paragraph{Solution} (a) Writing $f_X$, $f_Y$, and $g$ for the pdfs of $X$, $Y$, and
$X+Y$ respectively, we have:
\begin{displaymath}
g(z) = \int_{\mathbb{R}} f_X(u)f_Y(z-u)du,\quad z\in\mathbb{R}
\end{displaymath}
(b) Using the formula for the pdf of a Gaussian variable, we obtain:
\begin{displaymath}
g(z) = \frac{1}{2\pi\sigma_X\sigma_Y}\int_{\mathbb{R}}
\exp\left(-\frac{u^2}{2\sigma_X^2} - \frac{(z-u)^2}{2\sigma_Y^2}\right)du
\end{displaymath}
The argument of the $\exp$ function can be rewritten in ``canonical form'' with
respect to $u$:
\begin{displaymath}
-\frac{u^2}{2\sigma_X^2} - \frac{(z-u)^2}{2\sigma_Y^2}
= -\frac{\sigma_X^2+ \sigma_Y^2}{2\sigma_X^2\sigma_Y^2}\left(u
- \frac{\sigma_X^2}{\sigma_X^2+\sigma_Y^2}z\right)^2
- \frac{z^2}{2(\sigma_X^2+\sigma_Y^2)}
\end{displaymath}
Hence:
\begin{displaymath}
g(z) = \frac{1}{2\pi\sigma_X\sigma_Y}
\exp\left(- \frac{z^2}{2(\sigma_X^2+\sigma_Y^2)}\right)
\int_{\mathbb{R}}
\exp\left(-\frac{\sigma_X^2+ \sigma_Y^2}{2\sigma_X^2\sigma_Y^2}\left(u
- \frac{\sigma_X^2}{\sigma_X^2+\sigma_Y^2}z\right)^2\right)du
\end{displaymath}
We now recognize the integral of an un-normalized Gaussian pdf with variance
$\sigma_X^2\sigma_Y^2/(\sigma_X^2+\sigma_Y^2)$. Hence the integral evaluates to:
\begin{displaymath}
\sqrt{2\pi} \frac{\sigma_X\sigma_Y}{\sqrt{\sigma_X^2 + \sigma_Y^2}}
\end{displaymath}
After simplification, we obtain the following formula for $g(z)$:
\begin{displaymath}
g(z) = \frac{1}{\sqrt{2\pi}\sqrt{\sigma_X^2+\sigma_Y^2}}
\exp\left(- \frac{z^2}{2(\sigma_X^2+\sigma_Y^2)}\right)
\end{displaymath}
That is, $X+Y$ is a Gaussian variable with mean $0$ and variance
$\sigma_X^2+\sigma_Y^2$.
(c) If $X$ has mean $\mu_X$ and $Y$ has mean $\mu_Y$, then $X-\mu_X$ and $Y-\mu_Y$
are still independent, with the same variances and zero mean.
Applying the previous result, $X+Y-\mu_X-\mu_Y$ is a Gaussian variable with
mean $0$ and variance $\sigma_X^2 + \sigma_Y^2$. Hence, $X+Y$ is normally
distributed with mean $\mu_X+\mu_Y$ and variance $\sigma_X^2+\sigma_Y^2$.
(d) Since $x\mapsto e^x$ is a bijection from $\mathbb{R}$ onto $(0,\infty)$,
the pdf of $Y = e^X$ can be obtained by a change of variables. Formally,
for any interval $[a,b]$ with $0<a\leq b$:
\begin{align*}
\prob(Y\in[a,b]) = \prob(X\in [\log a, \log b])
&=\frac{1}{\sqrt{2\pi}\sigma_X}\int_{\log a}^{\log b}
\exp\left(-\frac{1}{2\sigma_X^2}(x-\mu)^2\right)dx\\
&=\frac{1}{\sqrt{2\pi}\sigma_X}\int_{a}^{b}
\frac{1}{z}\exp\left(-\frac{1}{2\sigma_X^2}(\log z-\mu)^2\right)dz
\end{align*}
where we used the change of variable $z = e^x$ (so that $dz = z\,dx$) in the last
equality. Hence the pdf of $Y$ is:
\begin{displaymath}
f_Y(z) = \begin{cases}
\frac{1}{\sqrt{2\pi}\sigma_X z}
\exp\left(-\frac{1}{2\sigma_X^2}(\log z-\mu)^2\right)&\text{if $z>0$}\\
0&\text{otherwise}
\end{cases}
\end{displaymath}
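As an informal numerical check of parts (b) and (d) (illustrative only, not part of
the derivation above), the following Python sketch, assuming NumPy is available,
compares Monte Carlo estimates with the formulas just derived:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
sigma_X, sigma_Y, mu = 0.5, 0.7, 0.3
X = rng.normal(mu, sigma_X, size=1_000_000)
Y = rng.normal(0.0, sigma_Y, size=1_000_000)

# (b)-(c): the empirical variance of X + Y should be close to
# sigma_X^2 + sigma_Y^2 (and its mean close to mu).
print(np.var(X + Y), sigma_X**2 + sigma_Y**2)

# (d): compare a histogram of exp(X) with the derived log-normal density.
edges = np.linspace(0.05, 10.0, 201)
centers = 0.5 * (edges[:-1] + edges[1:])
hist, _ = np.histogram(np.exp(X), bins=edges, density=True)
pdf = np.exp(-(np.log(centers) - mu)**2 / (2 * sigma_X**2)) \
      / (np.sqrt(2 * np.pi) * sigma_X * centers)
print(np.max(np.abs(hist - pdf)))  # small, up to binning and Monte Carlo error
\end{verbatim}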
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{problem}[Regression, 13pts]
Suppose that $X \in \R^{n \times m}$ with $n \geq m$ and $Y \in \R^n$, and that $Y \sim \N(X\beta, \sigma^2 I)$. You learned in class that the maximum likelihood estimate $\hat\beta$ of $\beta$ is given by
\[
\hat\beta = (X^TX)^{-1}X^TY
\]
\begin{enumerate}[label=(\alph*)]
\item Why do we need to assume that $n \geq m$?
\item Define $H = X(X^TX)^{-1}X^T$, so that the ``fitted'' values $\hat Y = X\hat{\beta}$ satisfy $\hat Y = HY$. Show that $H$ is an orthogonal projection matrix that projects onto the column space of $X$, so that the fitted y-values are a projection of $Y$ onto the column space of $X$.
\item What are the expectation and covariance matrix of $\hat\beta$?
\item Compute the gradient with respect to $\beta$ of the log likelihood implied by the model above, assuming we have observed $Y$ and $X$.
\item Suppose we place a normal prior on $\beta$. That is, we assume that $\beta \sim \N(0, \tau^2 I)$. Show that the MAP estimate of $\beta$ given $Y$ in this context is
\[
\hat \beta_{MAP} = (X^TX + \lambda I)^{-1}X^T Y
\]
where $\lambda = \sigma^2 / \tau^2$. (You may employ standard conjugacy results about Gaussians without proof in your solution.)
Estimating $\beta$ in this way is called {\em ridge regression} because the matrix $\lambda I$ looks like a ``ridge''. Ridge regression is a common form of {\em regularization} that is used to avoid the overfitting (resp. underdetermination) that happens when the number of parameters is close to (resp. larger than) the sample size in linear regression.
\item Do we need $n \geq m$ to do ridge regression? Why or why not?
\item Show that ridge regression is equivalent to adding $m$ additional rows to $X$ where the $j$-th additional row has its $j$-th entry equal to $\sqrt{\lambda}$ and all other entries equal to zero, adding $m$ corresponding additional entries to $Y$ that are all 0, and then computing the maximum likelihood estimate of $\beta$ using the modified $X$ and $Y$.
\end{enumerate}
\end{problem}
\paragraph{Solution} (a) If $n<m$, then $\mathrm{rank}(X^TX) \leq
\mathrm{rank}(X) \leq n < m$, so the $m\times m$ matrix $X^TX$ cannot be
invertible and the formula for the MLE is not defined. Intuitively, the
observations need to span an $m$-dimensional subspace for the $m$ coordinates
of $\beta$ to be identifiable.
(b) We have $H^T = H$, and $H^2 = X(X^TX)^{-1}X^TX(X^TX)^{-1}X^T
= X(X^TX)^{-1}X^T = H$. This shows that $H$ is an orthogonal projection matrix.
Furthermore, if $x$ is in the column space of $X$, that is, $x = Xv$ for some
$v\in\R^m$, then $Hx = X(X^TX)^{-1}X^TXv = Xv = x$: $H$ acts as the identity on
the column space of $X$. Conversely, for any $y$, $Hy = X\big((X^TX)^{-1}X^Ty\big)$
lies in the column space of $X$. Together with $H^T=H$ and $H^2=H$, this shows
that $H$ is the orthogonal projection onto this subspace.
(c) By linearity of expectation:
\begin{displaymath}
\E(\hat{\beta}) = (X^TX)^{-1}X^TX\beta = \beta
\end{displaymath}
For the covariance, write $Y = X\beta + \epsilon$ where $\epsilon\sim
\mathcal{N}(0, \sigma^2I)$, so that $\hat{\beta}
= \beta + (X^TX)^{-1}X^T\epsilon$. Hence:
\begin{displaymath}
\E\big[(\hat{\beta}-\beta)(\hat{\beta}-\beta)^T\big]
= \E\big[(X^TX)^{-1}X^T\epsilon\epsilon^TX(X^TX)^{-1}\big]
= \sigma^2(X^TX)^{-1}
\end{displaymath}
where the last equality uses $\E(\epsilon\epsilon^T) = \sigma^2 I$.
(d) The likelihood is:
\begin{displaymath}
L(\beta\given X, Y) = \frac{1}{(2\pi\sigma^2)^{n/2}}
\prod_{i=1}^n\exp\left(-\frac{1}{2\sigma^2}(Y_i-X_i\beta)^2\right)
= \frac{1}{(2\pi\sigma^2)^{n/2}}
\exp\left(-\frac{1}{2\sigma^2}\|Y-X\beta\|^2\right)
\end{displaymath}
so the log-likelihood is, up to an additive constant that does not depend on
$\beta$:
\begin{displaymath}
\ell(\beta) = -\frac{1}{2\sigma^2}\|Y-X\beta\|^2 + \mathrm{const}
\end{displaymath}
Taking the gradient with respect to $\beta$ (using the chain rule):
\begin{displaymath}
\nabla_\beta \ell(\beta) = \frac{1}{\sigma^2}X^T(Y-X\beta)
\end{displaymath}
(e) The posterior is proportional to:
\begin{displaymath}
\prob(Y\given X, \beta)\exp\left(-\frac{1}{2\tau^2}\|\beta\|^2\right)
\end{displaymath}
Using the likelihood obtained in (d), this is proportional to:
\begin{displaymath}
p(\beta) = \exp\left(-\frac{1}{2\sigma^2}\|Y-X\beta\|^2
- \frac{1}{2\tau^2}\|\beta\|^2\right)
\end{displaymath}
where the proportionality constant (normalization factor) does not depend on
$\beta$. Hence the MAP estimator maximizes $p$, or equivalently its logarithm:
\begin{displaymath}
\log p(\beta) = -\frac{1}{2\sigma^2}\|Y-X\beta\|^2
- \frac{1}{2\tau^2}\|\beta\|^2
\end{displaymath}
Since $\log p$ is concave, it is sufficient to find a critical point. Using
a computation similar to the one in (d), the gradient is:
\begin{displaymath}
\nabla_\beta \log p(\beta) = \frac{1}{\sigma^2}X^T(Y-X\beta)
- \frac{1}{\tau^2}\beta
\end{displaymath}
Setting this to zero and multiplying by $\sigma^2$ gives, with $\lambda
= \sigma^2/\tau^2$:
\begin{displaymath}
X^TY - (X^TX +\lambda I)\beta = 0
\end{displaymath}
and we obtain the desired formula for the MAP estimator.
(f) No: $n$ need not be greater than $m$ for ridge regression, because
$X^TX + \lambda I$ is always invertible when $\lambda>0$ ($X^TX$ is positive
semi-definite, so every eigenvalue of $X^TX+\lambda I$ is at least $\lambda$),
even if $X^TX$ itself is singular.
(g) Let us define $X'$:
\begin{displaymath}
X' = \begin{pmatrix}
X\\
\sqrt{\lambda}I
\end{pmatrix}
\end{displaymath}
Replacing $Y$ by the vector $Y'\in\R^{n+m}$ obtained by appending $m$ zero
entries to $Y$, and replacing $X$ by $X'$ in the least-squares estimator
$\hat{\beta}$, we see that $X'^TY' = X^TY$. Furthermore:
\begin{displaymath}
X'^TX' = \begin{pmatrix}
X^T &\sqrt{\lambda}I
\end{pmatrix}
\begin{pmatrix}
X\\
\sqrt{\lambda}I
\end{pmatrix} = X^TX +\lambda I
\end{displaymath}
Hence the least-squares estimator on the augmented data is $\hat{\beta}
= (X^TX+\lambda I)^{-1}X^TY$, which is exactly the expression of the MAP
estimator in ridge regression.
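As a quick numerical illustration of parts (e)--(g) (an illustrative sketch only,
with arbitrary synthetic data and an arbitrary value of $\lambda$), the ridge/MAP
estimator coincides with ordinary least squares on the augmented system:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, m, lam = 50, 5, 2.0
X = rng.normal(size=(n, m))
Y = rng.normal(size=n)

# Ridge / MAP estimator: (X^T X + lambda I)^{-1} X^T Y
beta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(m), X.T @ Y)

# Ordinary least squares on the augmented system from part (g)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(m)])
Y_aug = np.concatenate([Y, np.zeros(m)])
beta_aug, *_ = np.linalg.lstsq(X_aug, Y_aug, rcond=None)

print(np.allclose(beta_ridge, beta_aug))  # True
\end{verbatim}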
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{problem}[The Dirichlet and multinomial distributions, 12pts]
The Dirichlet distribution over $K$ categories is a generalization of the beta distribution. It has a shape parameter $a \in \R^K$ with non-negative entries and is supported over the set of $K$-dimensional positive vectors whose components sum to 1. Its density is given by
\[
f(x | a) = \frac{\Gamma\left( \sum_{k=1}^K a_k \right)}{\Pi_{k=1}^K \Gamma(a_k)} \Pi_{k=1}^K x_k^{a_k - 1}
\]
(Notice that when $K=2$, this reduces to the density of a beta distribution since in that case $x_2 = 1 - x_1$.) For the rest of this problem, assume a fixed $K \geq 2$.
\begin{enumerate}[label=(\alph*)]
\item Suppose $X$ is Dirichlet-distributed with shape parameter $a$. Without proof, state the value of $E(X)$. Your answer should be a vector defined in terms of either $a$ or $K$ or potentially both.
\item Suppose that $\theta \sim \text{Dirichlet}(a)$ and that $X \sim \text{Categorical}(\theta)$. That is, suppose we first sample a $K$-dimensional vector $\theta$ with entries in $(0,1)$ from a Dirichlet distribution and then roll a $K$-sided die such that the probability of rolling the number $k$ is $\theta_k$. Prove that $\theta | X$ also follows a Dirichlet distribution. What is its shape parameter?
\item Now suppose that $\theta \sim \text{Dirichlet}(a)$ and that $X_1, X_2, \ldots \stackrel{iid}{\sim} \text{Categorical}(\theta)$. Show that
\[
P(X_n = k | X_1, \ldots, X_{n-1}) = \frac{a_{k,n}}{\sum_{k=1}^K a_{k,n}}
\]
where $a_{k,n} = a_k + \sum_{i=1}^{n-1} \mathbb{1}\{X_i = k\}$. (Bonus points if your solution does not involve any integrals.)
\item Argue that the random variable $\lim_{n \rightarrow \infty} \frac{1}{n}\sum_{i=1}^n \mathbb{1}\{X_i = k\}$ is marginally beta-distributed. What is its shape parameter? If you're not sure how to rigorously talk about convergence of random variables, give an informal argument. Hint: what would you say if $\theta$ were fixed?
\item Suppose we have $K$ distinct colors and an urn with $a_k$ balls of color $k$. At each time step, we choose a ball uniformly at random from the urn and then add into the urn an additional new ball of the same color as the chosen ball. (So if at the first time step we choose a ball of color 1, we'll end up with $a_1+1$ balls of color 1 and $a_k$ balls of color $k$ for all $k > 1$ at the start of the second time step.) Let $\rho_{k,n}$ be the fraction of all the balls that are of color $k$ at time $n$. What is the distribution of $\lim_{n \rightarrow \infty} \rho_{k,n}$? Prove your answer.
\end{enumerate}
\end{problem}
\paragraph{Solution} (a) We have $\E(X) = \frac{1}{\sum_{k=1}^K a_k} a$.
(b) Using Bayes' theorem, we see that the posterior $\theta\given X$ is
proportional to:
\begin{displaymath}
f(\theta\given a)\prob(X=i\given\theta) \propto \theta_i\prod_{k=1}^K \theta_k^{a_k-1}
\end{displaymath}
where the multiplicative constant does not depend on $\theta$. In particular we
see that $\theta\given X$ follows a Dirichlet distribution with shape parameter
$a + e_X$, where $e_i$ denotes the $i$th vector of the canonical basis. That
is, the new shape parameter is $a$ with the coordinate corresponding to the
observation $X$ increased by 1.
(c) Conditioning on $\theta$, and using the conditional independence of the $X_i$
given $\theta$, we have:
\begin{equation}
\label{eq:foo}
\prob(X_n=k\given X_1,\ldots,X_{n-1}) = \int \prob(X_n=k\given \theta)
\prob(\theta \given X_1,\ldots,X_{n-1})d\theta
\end{equation}
Now to compute $\prob(\theta\given X_1,\ldots,X_{n-1})$ we use:
\begin{displaymath}
\prob(\theta\given X_1,\ldots, X_{n-1})
\propto \prob(\theta)\prod_{i=1}^{n-1}\prob(X_i\given\theta)
\end{displaymath}
where we used Bayes' rule and the conditional independence of the $X_i$ given
$\theta$. Using a computation similar to
the one in (b), we see that $\theta\given X_1,\ldots, X_{n-1}$ follows
a Dirichlet distribution with shape parameter $a_n = (a_{1,n},\dots, a_{K,n})$
(each observation increases the associated coordinate by one). Using this in
\eqref{eq:foo} we see that $\prob(X_n=k|X_1,\ldots,X_{n-1})$ is exactly the
expectation of the $k$th coordinate of the Dirichlet distribution of parameter
$a_n$. Using part (a), this is equal to:
\begin{displaymath}
\prob(X_n=k|X_1,\ldots,X_{n-1}) = \frac{a_{k,n}}{\sum_{k=1}^K a_{k,n}}
\end{displaymath}
(d) Write $\hat{X}_n = \frac{1}{n}\sum_{i=1}^n \mathbb{1}\{X_i=k\}$.
Conditionally on $\theta$, the strong law of large numbers implies that
$\hat{X}_n$ converges almost surely to $\prob(X_i=k\given\theta) = \theta_k$.
In particular, conditionally on $\theta$, the CDF of the (almost sure) limit $Y$
of $\hat{X}_n$ is the step function equal to $0$ for $z< \theta_k$ and $1$ for
$z\geq \theta_k$. To get the CDF of the marginal distribution, we integrate over
$\theta_k$:
\begin{displaymath}
\prob(Y\leq z) = \int_{0}^1 \mathbb{1}\{z\geq
\theta_k\}p(\theta_k)d\theta_k
= \int_{0}^z p(\theta_k)d\theta_k
\end{displaymath}
We recognize on the right-hand side the CDF of $\theta_k$, which is the $k$th
marginal of the Dirichlet distribution: a Beta distribution with parameters
$(a_k, \sum_i a_i - a_k)$. Hence the limit $Y$ of $\hat{X}_n$ is Beta
distributed with those parameters.
(e) If we denote by $X_n$ the color of the ball drawn at the $n$th time step,
then $X_n\given X_1,\ldots, X_{n-1}$ has exactly the conditional law obtained in
part (c), with the initial ball counts playing the role of $a$. Then we can write:
\begin{displaymath}
\rho_{k,n} = \frac{a_k +\sum_{i=1}^{n-1}\mathbb{1}\{X_i=k\}}{\sum_{k=1}^K a_k
+ n} = \frac{a_k}{\sum_k a_k + n} + \frac{1}{1 + \sum_k a_k/n}
\frac{1}{n}\sum_{i=1}^{n-1}\mathbb{1}\{X_i=k\}
\end{displaymath}
Using part (d), we obtain that $\rho_{k,n}$ converges in distribution
to a Beta-distributed variable with parameters $(a_k, \sum_i a_i - a_k)$.
\section*{Physicochemical Properties of Protein Tertiary Structure}
In the following problems we will code two different approaches for
solving linear regression problems and compare how they scale as a function of
the dimensionality of the data. We will also investigate the effects of
linear and non-linear features in the predictions made by linear models.
We will be working with the regression data set Protein
Tertiary Structure:
\url{https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv}.
This data set contains information about predicted conformations for 45730
proteins. In the data, the target variable $y$ is the root-mean-square
deviation (RMSD) of the predicted conformations with respect to the true properly
folded form of the protein. The RMSD is the measure of the average distance
between the atoms (usually the backbone atoms) of superimposed proteins.
The features $\mathbf{x}$ are
physico-chemical properties of the proteins in their true folded form. After
downloading the file CASP.csv we can load the data into python using
\begin{verbatim}
>>> import numpy as np
>>> data = np.loadtxt('CASP.csv', delimiter = ',', skiprows = 1)
\end{verbatim}
We can then obtain the vector of target variables and the feature matrix using
\begin{verbatim}
>>> y = data[ : , 0 ]
>>> X = data[ : , 1 : ]
\end{verbatim}
We can then split the original data into a training set with 90\% of the data
entries in the file CASP.csv and a test set with the remaining 10\% of the
entries. Normally, the splitting of the data is done at random, but here {\bf we ask
you to put into the training set the first 90\% of the elements from the
file CASP.csv} so that we can verify that the values that you will be reporting are correct.
(This should not cause problems, because the rows of the file are in a random order.)
The following python function splits the data as requested:
\begin{verbatim}
def split_train_test(X, y, fraction_train = 9.0 / 10.0):
end_train = round(X.shape[ 0 ] * fraction_train)
X_train = X[ 0 : end_train, ]
y_train = y[ 0 : end_train ]
X_test = X[ end_train :, ]
y_test = y[ end_train : ]
return X_train, y_train, X_test, y_test
\end{verbatim}
We can then normalize the features so that they have zero mean and unit standard deviation
in the training set. This is a standard step before the application of many machine
learning methods. We can use the following python function to perform this operation:
\begin{verbatim}
def normalize_features(X_train, X_test):
mean_X_train = np.mean(X_train, 0)
std_X_train = np.std(X_train, 0)
std_X_train[ std_X_train == 0 ] = 1
X_train_normalized = (X_train - mean_X_train) / std_X_train
X_test_normalized = (X_test - mean_X_train) / std_X_train
return X_train_normalized, X_test_normalized
\end{verbatim}
After these steps are done, we can concatenate a bias feature (one feature which
always takes value 1) to the observations in the normalized training and test sets:
\begin{verbatim}
>>> X_train, y_train, X_test, y_test = split_train_test(X, y)
>>> X_train, X_test = normalize_features(X_train, X_test)
>>> X_train = np.concatenate((X_train, np.ones((X_train.shape[ 0 ], 1))), 1)
>>> X_test = np.concatenate((X_test, np.ones((X_test.shape[ 0 ], 1))), 1)
\end{verbatim}
We are now ready to apply our machine learning methods to the normalized training set and
evaluate their performance on the normalized test set.
In the following problems, you will be asked to report some numbers and produce
some figures. Include these numbers and figures in your assignment report.
{\bf The numbers should be reported with up to 8 decimals}.
%Notation consistent with prob 2?
\begin{problem}[7pts]\label{prob:analytic_linear_model}
Assume that the targets $y$ are obtained as a function of the normalized
features $\mathbf{x}$ according to a Bayesian linear model with additive Gaussian noise with variance
$\sigma^2 = 1.0$ and a Gaussian prior on the regression coefficients $\mathbf{w}$
with precision matrix $\tau^{-2}\mathbf{I}$ where $\tau^{-2} = 10$. Code a routine that finds the Maximum a
Posteriori (MAP) value $\hat{\mathbf{w}}$ for $\mathbf{w}$ given the normalized
training data using the QR decomposition (see Section 7.5.2 in Murphy's book).
\begin{itemize}
\item Report the value of $\hat{\mathbf{w}}$ obtained.
\item Report the root mean squared error (RMSE) of $\hat{\mathbf{w}}$ in the normalized test set.
\end{itemize}
\end{problem}
\paragraph{Solution} See file \textsf{main.py} for the code. I obtained:
\begin{align*}
\hat{w} = [&5.55782079, 2.25190765, 1.07880135, -5.91177796, -1.73480336,\\
&-1.63875478, -0.26610556, 0.81781409, -0.65913397, 7.74153395]
\end{align*}
and an RMSE of 5.20880460.
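For reference, one possible QR-based computation is sketched below (the submitted
code is in \textsf{main.py}; this sketch assumes the preprocessed \texttt{X\_train},
\texttt{y\_train}, \texttt{X\_test}, \texttt{y\_test} from above and reuses the
augmented-data trick from part (g) of the regression problem, with $\lambda
= \sigma^2\tau^{-2} = 10$):
\begin{verbatim}
import numpy as np

def map_qr(X, y, lam=10.0):
    # MAP / ridge solution via a thin QR decomposition of the augmented system
    d = X.shape[1]
    X_aug = np.vstack([X, np.sqrt(lam) * np.eye(d)])
    y_aug = np.concatenate([y, np.zeros(d)])
    Q, R = np.linalg.qr(X_aug)            # X_aug = Q R, with R upper triangular
    return np.linalg.solve(R, Q.T @ y_aug)

w_hat = map_qr(X_train, y_train)
rmse = np.sqrt(np.mean((X_test @ w_hat - y_test) ** 2))
\end{verbatim}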
\begin{problem}[14pts]\label{prob:numerical_linear_model}
Write code that evaluates the log-posterior probability (up to an
additive constant) of $\mathbf{w}$ according to the previous Bayesian linear model and the normalized training data.
Write also code that evaluates the gradient of the log-posterior probability with respect to
$\mathbf{w}$. Merge the two pieces of code into a
function called \emph{obj\_and\_gradient} that, for a value of $\mathbf{w}$,
returns the log-posterior probability value and the corresponding gradient.
Verify that the gradient computation is correct using the approximation
\begin{align}
\frac{df(x)}{dx} \approx \frac{f(x + \epsilon) - f(x - \epsilon)}{2\epsilon}\,.
\end{align}
Then write a function that uses \emph{obj\_and\_gradient} to find the MAP solution $\hat{\mathbf{w}}$ for
$\mathbf{w}$ by running 100 iterations of the L-BFGS numerical optimization
method with $\mathbf{0}$ as the initial point for the optimization.
The L-BFGS method is an iterative method for solving nonlinear optimization problems.
You will learn more about numerical optimization and about this method in one of the sections of this course. However,
for this problem you can use the method as a black box that returns the MAP solution
by sequentially evaluating the objective function and its gradient for different input values.
In python, a variant of the L-BFGS method called L-BFGS-B is implemented in scipy.optimize.minimize.
Since scipy.optimize.minimize only minimizes, you can work with the negative
log-posterior probability to transform the maximization problem into a minimization one.
\begin{itemize}
\item Report the value of $\hat{\mathbf{w}}$ obtained.
\item Report the RMSE of the predictions made with $\hat{\mathbf{w}}$ in the normalized test set.
\end{itemize}
\end{problem}
\paragraph{Solution} See file \textsf{main.py} for the code. I obtained:
\begin{align*}
\hat{w} = [&5.55780742, 2.2516761, 1.078904, -5.9118024, -1.73438477,\\
&-1.63887755, -0.26611567, 0.81785313, -0.65901996, 7.7415115]
\end{align*}
and an RMSE of 5.20880242.
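For reference, a minimal sketch of this approach is shown below (the submitted
code is in \textsf{main.py}; the sketch assumes the preprocessed \texttt{X\_train}
and \texttt{y\_train} from above, $\sigma^2 = 1$ and $\tau^{-2} = 10$, and works
with the negative log-posterior so that it can be minimized):
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

sigma2, tau2_inv = 1.0, 10.0

def obj_and_gradient(w):
    # Negative log-posterior (up to an additive constant) and its gradient
    resid = y_train - X_train @ w
    value = 0.5 / sigma2 * resid @ resid + 0.5 * tau2_inv * w @ w
    grad = -X_train.T @ resid / sigma2 + tau2_inv * w
    return value, grad

# Central finite-difference check of the gradient along the first coordinate
w0, eps = np.zeros(X_train.shape[1]), 1e-6
e0 = np.eye(len(w0))[0]
fd = (obj_and_gradient(w0 + eps * e0)[0]
      - obj_and_gradient(w0 - eps * e0)[0]) / (2 * eps)
print(fd, obj_and_gradient(w0)[1][0])   # should agree closely

# 100 iterations of L-BFGS-B starting from 0
res = minimize(obj_and_gradient, w0, jac=True, method='L-BFGS-B',
               options={'maxiter': 100})
w_hat = res.x
\end{verbatim}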
\vspace{2em}
Linear regression can be extended to model non-linear relationships by
replacing the original features $\mathbf{x}$ with some non-linear functions of
the original features $\bm \phi(\mathbf{x})$. We can automatically generate one
such non-linear function by sampling a random weight vector $\mathbf{a}
\sim \N(0,\mathbf{I})$ and a corresponding random bias $b \sim
\text{U}[0, 2\pi]$ and then making $\phi(\mathbf{x}) = \cos(\mathbf{a}^\text{T}
\mathbf{x} + b)$. By repeating this process $d$ times we can generate $d$
non-linear functions that, when applied to the original features, produce a
non-linear mapping of the data into a new $d$ dimensional space.
We can encode these $d$ functions into a matrix $\mathbf{A}$ with $d$ rows, each one
with the weights for each function, and a $d$-dimensional vector $\mathbf{b}$
with the biases for each function. The new mapped features are then obtained as
$\bm \phi (\mathbf{x}) = \cos(\mathbf{A} \mathbf{x} + \mathbf{b})$, where
$\cos$ applied to a vector returns another vector whose elements are the result
of applying $\cos$ to the individual elements of the original vector.
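For concreteness, one possible implementation of this feature map is sketched
below (assuming NumPy and the normalized feature matrices from before, with the
bias column appended afterwards; the submitted version is in \textsf{main.py}):
\begin{verbatim}
import numpy as np

def random_cosine_features(X_train, X_test, d, seed=0):
    rng = np.random.default_rng(seed)
    m = X_train.shape[1]
    A = rng.normal(size=(d, m))                  # rows a ~ N(0, I)
    b = rng.uniform(0.0, 2.0 * np.pi, size=d)    # biases b ~ U[0, 2*pi]
    return np.cos(X_train @ A.T + b), np.cos(X_test @ A.T + b)

# Example: d = 100 (a bias column can then be concatenated as before)
phi_train, phi_test = random_cosine_features(X_train, X_test, 100)
\end{verbatim}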
\begin{problem}[14pts]\label{prob:non_linear_model}
Generate 4 sets of non-linear functions, each one with $d=100, 200, 400, 600$ functions, respectively, and use
them to map the features in the original normalized training and test sets into
4 new feature spaces, each one of dimensionality given by the value of $d$. After this, for each
value of $d$, find the MAP solution $\hat{\mathbf{w}}$ for $\mathbf{w}$ using the
corresponding new training set and the method from Problem~\ref{prob:analytic_linear_model}
(QR). Use the same values for $\sigma^2$ and $\tau^{-2}$ as before.
You are also asked to
record the time taken by the method QR to obtain a value for $\hat{\mathbf{w}}$.
In python you can compute the time taken by a routine using the time package:
\begin{verbatim}
>>> import time
>>> time_start = time.time()
>>> routine_to_call()
>>> running_time = time.time() - time_start
\end{verbatim}
Next, compute the RMSE of the resulting predictor in the normalized test
set. Repeat this process with the method from problem
\ref{prob:numerical_linear_model} (L-BFGS).
\begin{itemize}
\item Report the test RMSE obtained by each method for each value of $d$.
\end{itemize}
You are asked to generate a plot
with the results obtained by each method (QR and L-BFGS)
for each value of $d$. In this plot
the $x$ axis should represent the time taken by each method to
run and the $y$ axis should be the RMSE of the resulting predictor in the
normalized test set. The plot should
contain 4 points in red, representing the results obtained by the method QR for
each value of $d$, and 4 points in blue, representing the results obtained
by the method L-BFGS for each value of $d$. Answer the following questions:
\begin{itemize}
\item Do the non-linear transformations help to reduce the prediction error? Why?
\item What method (QR or L-BFGS) is faster? Why?
\end{itemize}
\end{problem}
\paragraph{Solution} See file \textsf{main.py} for the code. I obtained the following plot:
\begin{center}
\includegraphics[width=0.5\textwidth]{plot.pdf}
\end{center}
We see that the non-linear transformations help reduce the prediction error:
the data is probably not well modeled by a purely linear relationship. Allowing
more complex relationships (the larger $d$, the richer the class of functions
the model can represent) gives a better fit and improves the predictions.
QR is faster than L-BFGS when the number of features is not too large ($d\leq
400$); for $d=600$, L-BFGS is faster. The asymptotic complexity of QR is
$O(nd^2 + d^3)$, whereas the cost of L-BFGS is $O(nd)$ per iteration, hence
$O(nd)$ for a fixed number of iterations. So asymptotically L-BFGS is faster,
as we observe for $d=600$; for smaller values of $d$ the constants hidden in
the $O(\cdot)$ notation dominate and QR remains faster.
\end{document}
|