\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}
\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}
\newenvironment{description*}%
  {\vspace{-2ex} \begin{description} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}

\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}

\begin{document}
\maketitle

\section*{Problem 1}

\section*{Problem 2}

\section*{Problem 3}

\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We have, for all $h\in\R^d$:
\begin{align*}
  f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2
  = \|Ax-b\|_2^2 + 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
  &= f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can read the gradient and Hessian of $f$ directly from the above identity:
\begin{displaymath}
  \nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2 f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric and positive semidefinite, hence $f$ is convex. This implies that any local minimum is also a global minimum. We solve for critical points, assuming that $A$ has full column rank so that $A^TA$ is invertible:
\begin{displaymath}
  \nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence, $f$ has a unique local minimum, which is also its unique global minimizer: $x^* = (A^TA)^{-1}A^Tb$.

\paragraph{(b)} Starting from $\tilde{x}$, a $\beta$-approximation to the optimal solution, we can obtain a $(1+\eps)$-approximation by running gradient descent initialized at $\tilde{x}$. We denote by $\sigma_{min}$ (resp.\ $\sigma_{max}$) the smallest (resp.\ largest) singular value of $A$. Since the eigenvalues of $A^TA$ lie between $\sigma_{min}^2$ and $\sigma_{max}^2$, the expression of $\nabla^2 f(x)$ found in \textbf{(a)} implies:
\begin{displaymath}
  2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, gradient descent with step size $\frac{1}{2\sigma_{max}^2}$ uses the following update rule:
\begin{displaymath}
  x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
  = x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b)
  = \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x_k + \frac{1}{\sigma_{max}^2}A^Tb,
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}.
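As a quick numerical sanity check, the closed-form minimizer from \textbf{(a)} and the update rule above can be written in a few lines of NumPy; the function names and the random $50\times 5$ instance below are purely illustrative.
\begin{verbatim}
import numpy as np

def closed_form(A, b):
    # x* = (A^T A)^{-1} A^T b, assuming A has full column rank
    return np.linalg.solve(A.T @ A, A.T @ b)

def gradient_step(A, b, x, sigma_max):
    # one gradient step with step size 1/(2 sigma_max^2):
    # x <- x - A^T (A x - b) / sigma_max^2
    return x - A.T @ (A @ x - b) / sigma_max**2

rng = np.random.default_rng(0)
A = rng.standard_normal((50, 5))   # illustrative random instance
b = rng.standard_normal(50)

x_star = closed_form(A, b)
sigma_max = np.linalg.svd(A, compute_uv=False)[0]  # largest singular value

x = np.zeros(5)
for _ in range(500):
    x = gradient_step(A, b, x, sigma_max)

print(np.max(np.abs(x - x_star)))  # should be close to zero
\end{verbatim}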
The exact description of the algorithm is given in Algorithm~\ref{alg:gradient}.

\begin{algorithm}
  \caption{Gradient Descent}
  \label{alg:gradient}
  \begin{algorithmic}[1]
    \Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that $\|A\tilde{x} - b\|_2^2\leq \beta\,\text{OPT}$
    \Ensure $x$ such that $\|Ax - b\|_2^2\leq (1+\eps)\,\text{OPT}$
    \State $x\leftarrow \tilde{x}$
    \While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
      \State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x + \frac{1}{\sigma_{max}^2}A^Tb$
    \EndWhile
    \State\Return $x$
  \end{algorithmic}
\end{algorithm}

When the algorithm terminates, we have $x$ such that $\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2\leq (1+\eps)\text{OPT}$, which proves correctness.

We now bound the running time. When the algorithm starts, we have $f(\tilde{x})\leq \beta f(x^*)$, that is, $f(\tilde{x}) - f(x^*)\leq (\beta-1)f(x^*)$. When it terminates, we have $f(x) - f(x^*)\leq \eps f(x^*)$. In other words, the absolute error has to be multiplied by a factor $\frac{\eps}{\beta-1}$. We saw in class that this can be achieved with $O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$ iterations of gradient descent: with step size $1/L$, each iteration on an $L$-smooth and $\mu$-strongly convex function shrinks the error $f(x_k) - f(x^*)$ by a factor $1-\mu/L$, and here $L = 2\sigma_{max}^2$ and $\mu = 2\sigma_{min}^2$. Using the definition of $\kappa(A)$, this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$ repetitions of the while loop in Algorithm~\ref{alg:gradient}. Each repetition can be computed in time $O(nd)$: writing the update as $x - \frac{1}{\sigma_{max}^2}A^T(Ax - b)$, it only requires two matrix--vector products, and the stopping criterion reuses the vector $Ax - b$. Overall the complexity is $O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By pre-computing $A^TA$ and $A^Tb$ (in time $O(nd^2)$), each repetition of the loop only costs $O(d^2)$, so the complexity becomes $O\left(nd^2 + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}

\section*{Problem 4}
Time spent:

\end{document}