% ps6/main.tex
\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate} %
    \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}

\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize} %
    \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}

\newenvironment{description*}%
  {\vspace{-2ex} \begin{description} %
    \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}

\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}

\begin{document}
\maketitle

\section*{Problem 1}

\section*{Problem 2}

\section*{Problem 3}

\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We have,
for all $h\in\R^d$:
\begin{align*}
  f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2 = \|Ax-b\|_2^2
  + 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
  &= f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can read the gradient and Hessian of
$f$ directly from the above identity:
\begin{displaymath}
  \nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric positive semidefinite, hence
$f$ is convex. This implies that any local minimum is also a global minimum.
Assuming that $A$ has full column rank, so that $A^TA$ is invertible, we solve
for critical points:
\begin{displaymath}
  \nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence, $f$ has a unique local minimum, which is also the unique global
minimizer, given by $x^* = (A^TA)^{-1}A^Tb$.
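
To spell out the identification step (this adds nothing beyond the computation
above): since $f$ is quadratic, its second-order Taylor expansion
\begin{displaymath}
  f(x+h) = f(x) + \inprod{\nabla f(x), h} + \tfrac{1}{2}\,h^T\nabla^2 f(x)\,h
\end{displaymath}
is exact, and matching the terms that are linear and quadratic in $h$ against
the identity above gives $\nabla f(x) = 2A^T(Ax-b)$ and $\nabla^2 f(x)
= 2A^TA$. Positive semidefiniteness of the Hessian is immediate:
$h^T(2A^TA)h = 2\|Ah\|_2^2 \geq 0$ for every $h\in\R^d$.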

\paragraph{(b)} Starting from $\tilde{x}$, which is a $\beta$-approximation to
the optimal solution, we can obtain a $(1+\eps)$-approximation by running
gradient descent initialized at $\tilde{x}$. We will denote by $\sigma_{min}$
(resp. $\sigma_{max}$) the smallest (resp. largest) singular value of $A$.
Since the eigenvalues of $A^TA$ are the squared singular values of $A$, the
expression of $\nabla^2 f(x)$ found in \textbf{(a)} implies that:
\begin{displaymath}
  2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, the gradient descent update rule (with step size
$1/(2\sigma_{max}^2)$) takes the following form:
\begin{displaymath}
  x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
  = x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b) = \left(I-\frac{1}{\sigma_{max}^2}
  A^TA\right)x_k + \frac{1}{\sigma_{max}^2}A^Tb,
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}. The exact
description of the algorithm is given in Algorithm~\ref{alg:gradient}.

\begin{algorithm}
  \caption{Gradient Descent}
  \label{alg:gradient}
  \begin{algorithmic}[1]
    \Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that
    $\|A\tilde{x} - b\|_2^2\leq \beta \text{OPT}$
    \Ensure $x$ such that
    $\|Ax - b\|_2^2\leq (1+\eps) \text{OPT}$
    \State $x\leftarrow \tilde{x}$
    \While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
      \State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2}
      A^TA\right)x + \frac{1}{\sigma_{max}^2}A^Tb$
    \EndWhile
    \State\Return $x$
  \end{algorithmic}
\end{algorithm}

When the algorithm terminates, we have $x$ such that
$\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2\leq
(1+\eps)\text{OPT}$, which proves correctness.

We now bound the running time. When the algorithm starts, we have
$f(\tilde{x})\leq \beta f(x^*)$, that is, $f(\tilde{x}) - f(x^*)\leq
(\beta-1)f(x^*)$. When it terminates, we have $f(x) - f(x^*)\leq \eps f(x^*)$.
That is, the absolute error has to be reduced by a factor of
$\frac{\beta-1}{\eps}$. We saw in class that this can be achieved by
$O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$
iterations of the gradient descent method. Using the definition of $\kappa(A)$,
this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$ repetitions of the
while loop in Algorithm~\ref{alg:gradient}. Each iteration only involves matrix
arithmetic and can be computed in time $O(nd)$. Overall, the complexity is
$O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By
  pre-computing $A^TA$ and $A^Tb$ (which takes time $O(nd^2)$) and by being
  careful with the order in which we compute the matrix-vector products, we can
  in fact obtain a complexity of
  $O\left(nd^2 + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}
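
For completeness, here is a sketch of one standard way to obtain this
iteration bound; the exact statement seen in class may differ in constants.
Since $2\sigma_{min}^2 I \preceq \nabla^2 f \preceq 2\sigma_{max}^2 I$, the
function $f$ is $2\sigma_{min}^2$-strongly convex and $2\sigma_{max}^2$-smooth,
and gradient descent with step size $1/(2\sigma_{max}^2)$ satisfies
\begin{displaymath}
  f(x_{k+1}) - f(x^*) \leq \left(1 - \frac{\sigma_{min}^2}{\sigma_{max}^2}\right)
  \bigl(f(x_k) - f(x^*)\bigr)
  = \left(1 - \frac{1}{\kappa(A)^2}\right)\bigl(f(x_k) - f(x^*)\bigr).
\end{displaymath}
After $k$ iterations, the initial error $f(\tilde{x}) - f(x^*) \leq
(\beta-1)f(x^*)$ has therefore been reduced to at most
$e^{-k/\kappa(A)^2}(\beta-1)f(x^*)$, so taking $k \geq
\kappa(A)^2\ln\frac{\beta-1}{\eps}$ makes it at most $\eps f(x^*)$, which
matches the bound used above.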

\section*{Problem 4}

Time spent:

\end{document}