author     Thibaut Horel <thibaut.horel@gmail.com>    2015-09-20 22:30:04 -0400
committer  Thibaut Horel <thibaut.horel@gmail.com>    2015-09-20 22:30:04 -0400
commit     d8fc73b493a8a03cb09bd558d9472a33453608a6 (patch)
tree       3dc5a583cda20e1c682d342fa9f68f35418de182
parent     a7f4d5c2ce9ee09879b4183d18169d5ba45cf422 (diff)
download   cs281-d8fc73b493a8a03cb09bd558d9472a33453608a6.tar.gz
[hw1] Problem 2
-rw-r--r--  hw1/main.tex  93
1 file changed, 93 insertions, 0 deletions
diff --git a/hw1/main.tex b/hw1/main.tex
index e05594e..da77359 100644
--- a/hw1/main.tex
+++ b/hw1/main.tex
@@ -143,6 +143,95 @@ Estimating $\beta$ in this way is called {\em ridge regression} because the matr
\end{enumerate}
\end{problem}
+\paragraph{Solution} (a) If $n<m$, $X^TX$ cannot be invertible, so the MLE is
+not well defined. Intuitively, the observations need to span an
+$m$-dimensional subspace in order to learn the $m$ coordinates of $\beta$.
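+
+More precisely, since $X^TX$ is an $m\times m$ matrix,
+\begin{displaymath}
+  \mathrm{rank}(X^TX) \leq \mathrm{rank}(X) \leq \min(n, m) = n < m,
+\end{displaymath}
+so $X^TX$ is singular whenever $n < m$.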
+
+(b) We have $H^T = H$ and $H^2 = X(X^TX)^{-1}X^TX(X^TX)^{-1}X^T
+= X(X^TX)^{-1}X^T = H$, so $H$ is symmetric and idempotent, i.e.\ an
+orthogonal projection matrix. Furthermore, if $x$ lies in the column space of
+$X$, that is, $x = Xv$ for some $v\in\R^m$, then
+$Hx = X(X^TX)^{-1}X^TXv = Xv = x$. That is, $H$ acts as the identity on the
+column space of $X$, which is enough to conclude that $H$ is the orthogonal
+projection onto this subspace.
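+
+To make this last step explicit, note that for any vector $v$ the residual
+$v - Hv$ is orthogonal to the column space of $X$:
+\begin{displaymath}
+  X^T(v - Hv) = X^Tv - X^TX(X^TX)^{-1}X^Tv = 0,
+\end{displaymath}
+so $Hv$ is indeed the orthogonal projection of $v$ onto this subspace.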
+
+(c) By linearity of expectation, and using $\E(Y) = X\beta$:
+\begin{displaymath}
+  \E(\hat{\beta}) = (X^TX)^{-1}X^T\E(Y) = (X^TX)^{-1}X^TX\beta = \beta
+\end{displaymath}
+For the covariance, write $Y = X\beta + \epsilon$ where $\epsilon\sim
+\mathcal{N}(0, \sigma^2 Id)$, so that $\hat{\beta}
+= \beta + (X^TX)^{-1}X^T\epsilon$. Hence:
+\begin{displaymath}
+  \E\big[(\hat{\beta}-\beta)(\hat{\beta}-\beta)^T\big]
+  = \E\big[(X^TX)^{-1}X^T\epsilon\epsilon^TX(X^TX)^{-1}\big]
+  = \sigma^2(X^TX)^{-1}
+\end{displaymath}
+where the last equality uses $\E(\epsilon\epsilon^T) = \sigma^2 Id$.
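+
+Since $\hat{\beta} = \beta + (X^TX)^{-1}X^T\epsilon$ is an affine function of
+the Gaussian vector $\epsilon$, this also identifies the full distribution of
+the estimator: $\hat{\beta}\sim\mathcal{N}\big(\beta, \sigma^2(X^TX)^{-1}\big)$.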
+
+(d) The likelihood is:
+\begin{displaymath}
+  L(\beta\given X, Y) = \frac{1}{(2\pi\sigma^2)^{n/2}}
+  \prod_{i=1}^n\exp\left(-\frac{1}{2\sigma^2}(Y_i-X_i\beta)^2\right)
+  = \frac{1}{(2\pi\sigma^2)^{n/2}}
+  \exp\left(-\frac{1}{2\sigma^2}\|Y-X\beta\|^2\right)
+\end{displaymath}
+Taking the gradient with respect to $\beta$ (using the chain rule):
+\begin{displaymath}
+  \nabla L(\beta) = \frac{1}{(2\pi\sigma^2)^{n/2}}
+  \exp\left(-\frac{1}{2\sigma^2}\|Y-X\beta\|^2\right)
+  \frac{1}{\sigma^2}X^T(Y-X\beta)
+\end{displaymath}
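+
+Setting this gradient to zero (the exponential prefactor never vanishes)
+recovers the normal equations $X^TX\beta = X^TY$, and hence the expression
+$\hat{\beta} = (X^TX)^{-1}X^TY$ used in the previous parts.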
+
+(e) The posterior is proportional to:
+\begin{displaymath}
+  \prob(X, Y\given \beta)\exp\left(-\frac{1}{2\tau^2}\|\beta\|^2\right)
+\end{displaymath}
+Using the likelihood obtained in (d), this is proportional to:
+\begin{displaymath}
+  p(\beta) = \exp\left(-\frac{1}{2\sigma^2}\|Y-X\beta\|^2
+  - \frac{1}{2\tau^2}\|\beta\|^2\right)
+\end{displaymath}
+where the proportionality constant (normalization factors) does not depend on
+$\beta$. Hence the MAP estimator can be found by finding the maximum of the
+function $p$. Since $-\log p$ is convex, it is sufficient to find a critical
+point.
+Using a computation similar to the one in (d), we find the gradient of $p$:
+\begin{displaymath}
+ \nabla p(\beta) = \exp\left(-\frac{1}{2\sigma^2}\|Y-X\beta\|^2
+ - \frac{1}{2\tau^2}\|\beta\|^2\right)\left[\frac{1}{\sigma^2}X^T(Y-X\beta)
+ -\frac{1}{\tau^2}\beta\right]
+\end{displaymath}
+Since the exponential factor never vanishes, $\nabla p(\beta) = 0$ is
+equivalent to the bracketed term being zero; multiplying it by $\sigma^2$ and
+setting $\lambda = \sigma^2/\tau^2$ gives:
+\begin{displaymath}
+  X^TY - (X^TX +\lambda Id)\beta = 0
+\end{displaymath}
+and we obtain the desired formula for the MAP estimator,
+$\hat{\beta} = (X^TX + \lambda Id)^{-1}X^TY$.
+
+(f) $n$ need not be greater than $m$: for $\lambda > 0$ the matrix $(\lambda
+Id + X^TX)$ is always invertible, even when $X^TX$ is not, as the computation
+below shows.
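+
+Indeed, for any nonzero vector $v$,
+\begin{displaymath}
+  v^T(\lambda Id + X^TX)v = \lambda\|v\|^2 + \|Xv\|^2 \geq \lambda\|v\|^2 > 0,
+\end{displaymath}
+so $\lambda Id + X^TX$ is positive definite, hence invertible.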
+
+(g) Replace $X$ by $X'$ and $Y$ by $Y'$, defined by:
+\begin{displaymath}
+  X' = \begin{pmatrix}
+    X\\
+    \sqrt{\lambda}Id
+  \end{pmatrix}
+  \qquad
+  Y'^T = \begin{pmatrix} Y^T & 0 \end{pmatrix}
+\end{displaymath}
+Substituting these into the least squares estimator $\hat{\beta}$, we first
+see that $X'^TY' = X^TY$. Furthermore:
+\begin{displaymath}
+ X'^TX' = \begin{pmatrix}
+ X^T &\sqrt{\lambda}Id
+ \end{pmatrix}
+\begin{pmatrix}
+ X\\
+ \sqrt{\lambda}Id
+ \end{pmatrix} = X^TX +\lambda Id
+\end{displaymath}
+Hence we get $\hat{\beta} = (X^TX+\lambda Id)^{-1}X^TY$, which is exactly the
+expression of the MAP estimator in ridge regression.
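+
+As an optional numerical sanity check of this equivalence (not part of the
+derivation), here is a short sketch assuming \texttt{numpy} is available; the
+variable names and the small random instance are arbitrary:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, m, lam = 20, 5, 0.3  # n observations, m features, ridge penalty
+X = rng.normal(size=(n, m))
+Y = rng.normal(size=n)
+
+# Ridge / MAP estimator: (X^T X + lambda Id)^{-1} X^T Y
+beta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(m), X.T @ Y)
+
+# Augmented least squares: stack sqrt(lambda) Id under X, zeros under Y
+X_aug = np.vstack([X, np.sqrt(lam) * np.eye(m)])
+Y_aug = np.concatenate([Y, np.zeros(m)])
+beta_aug, *_ = np.linalg.lstsq(X_aug, Y_aug, rcond=None)
+
+print(np.allclose(beta_ridge, beta_aug))  # True: the two estimators agree
+\end{verbatim}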
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{problem}[The Dirichlet and multinomial distributions, 12pts]
The Dirichlet distribution over $K$ categories is a generalization of the beta distribution. It has a shape parameter $a \in \R^K$ with non-negative entries and is supported over the set of $K$-dimensional positive vectors whose components sum to 1. Its density is given by
@@ -163,6 +252,10 @@ where $a_{k,n} = a_k + \sum_{i=1}^{n-1} \mathbb{1}\{X_i = k\}$. (Bonus points if
\end{enumerate}
\end{problem}
+\paragraph{Solution} (a) We have $\E(X) = \frac{1}{\sum_{k=1}^K a_k} a$.
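+One way to see this: the marginal distribution of each coordinate $X_k$ of
+a Dirichlet vector is $\mathrm{Beta}\big(a_k, \sum_{j\neq k} a_j\big)$, whose
+mean is $a_k/\sum_{j=1}^K a_j$.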
+
+(b)
+
\section*{Physicochemical Properties of Protein Tertiary Structure}
In the following problems we will code two different approaches for