\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}
\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}
\newenvironment{description*}%
  {\vspace{-2ex} \begin{description} %
     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}

\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}

\begin{document}
\maketitle

\section*{Problem 1}

\section*{Problem 2}

\section*{Problem 3}

\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We have, for all $h\in\R^d$:
\begin{align*}
  f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2
  = \|Ax-b\|_2^2 + 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
  &= f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can read the gradient and Hessian of $f$ directly from the above identity:
\begin{displaymath}
  \nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2 f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric and positive semidefinite, hence $f$ is convex. This implies that any local minimum is also a global minimum. We solve for critical points, assuming that $A$ has full column rank so that $A^TA$ is invertible:
\begin{displaymath}
  \nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence, $f$ has a unique local minimum, which is also its unique global minimizer: $x^* = (A^TA)^{-1}A^Tb$.

\paragraph{(b)} Starting from $\tilde{x}$, a $\beta$-approximation to the optimal solution, we can obtain a $(1+\eps)$-approximation by running gradient descent initialized at $\tilde{x}$. We denote by $\sigma_{min}$ (resp.\ $\sigma_{max}$) the smallest (resp.\ largest) singular value of $A$. Since the eigenvalues of $A^TA$ lie between $\sigma_{min}^2$ and $\sigma_{max}^2$, the expression of $\nabla^2 f(x)$ found in \textbf{(a)} implies:
\begin{displaymath}
  2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, gradient descent with step size $\frac{1}{2\sigma_{max}^2}$ uses the following update rule:
\begin{displaymath}
  x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
  = x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b)
  = \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x_k + \frac{1}{\sigma_{max}^2}A^Tb,
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}.
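As a quick numerical sanity check, the closed-form minimizer from \textbf{(a)} and the update rule above can be written in a few lines of NumPy; the function names and the random $50\times 5$ instance below are purely illustrative.
\begin{verbatim}
import numpy as np

def closed_form(A, b):
    # x* = (A^T A)^{-1} A^T b, assuming A has full column rank
    return np.linalg.solve(A.T @ A, A.T @ b)

def gradient_step(A, b, x, sigma_max):
    # one gradient step with step size 1/(2 sigma_max^2):
    # x <- x - A^T (A x - b) / sigma_max^2
    return x - A.T @ (A @ x - b) / sigma_max**2

rng = np.random.default_rng(0)
A = rng.standard_normal((50, 5))   # illustrative random instance
b = rng.standard_normal(50)

x_star = closed_form(A, b)
sigma_max = np.linalg.svd(A, compute_uv=False)[0]  # largest singular value

x = np.zeros(5)
for _ in range(500):
    x = gradient_step(A, b, x, sigma_max)

print(np.max(np.abs(x - x_star)))  # should be close to zero
\end{verbatim}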
The exact description of the algorithm is given in Algorithm~\ref{alg:gradient}.

\begin{algorithm}
  \caption{Gradient Descent}
  \label{alg:gradient}
  \begin{algorithmic}[1]
    \Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that $\|A\tilde{x} - b\|_2^2\leq \beta\,\text{OPT}$
    \Ensure $x$ such that $\|Ax - b\|_2^2\leq (1+\eps)\,\text{OPT}$
    \State $x\leftarrow \tilde{x}$
    \While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
      \State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x + \frac{1}{\sigma_{max}^2}A^Tb$
    \EndWhile
    \State\Return $x$
  \end{algorithmic}
\end{algorithm}

When the algorithm terminates, we have $x$ such that $\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2\leq (1+\eps)\text{OPT}$, which proves correctness.

We now bound the running time. When the algorithm starts, we have $f(\tilde{x})\leq \beta f(x^*)$, that is, $f(\tilde{x}) - f(x^*)\leq (\beta-1)f(x^*)$. When it terminates, we have $f(x) - f(x^*)\leq \eps f(x^*)$. In other words, the absolute error has to be multiplied by a factor $\frac{\eps}{\beta-1}$. We saw in class that this can be achieved with $O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$ iterations of gradient descent: with step size $1/L$, each iteration on an $L$-smooth and $\mu$-strongly convex function shrinks the error $f(x_k) - f(x^*)$ by a factor $1-\mu/L$, and here $L = 2\sigma_{max}^2$ and $\mu = 2\sigma_{min}^2$. Using the definition of $\kappa(A)$, this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$ repetitions of the while loop in Algorithm~\ref{alg:gradient}. Each repetition can be computed in time $O(nd)$: writing the update as $x - \frac{1}{\sigma_{max}^2}A^T(Ax - b)$, it only requires two matrix--vector products, and the stopping criterion reuses the vector $Ax - b$. Overall the complexity is $O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By pre-computing $A^TA$ and $A^Tb$ (in time $O(nd^2)$), each repetition of the loop only costs $O(d^2)$, so the complexity becomes $O\left(nd^2 + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}

\section*{Problem 4}
Time spent:

\end{document}