% ps6/main.tex
\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate} %
    \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}

\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize} %
    \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}

\newenvironment{description*}%
  {\vspace{-2ex} \begin{description} %
    \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}

\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}

\begin{document}
\maketitle

\section*{Problem 1}

\section*{Problem 2}

\section*{Problem 3}

\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We have,
for all $h\in\R^d$:
\begin{align*}
  f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2 = \|Ax-b\|_2^2
  + 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
  &= f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can read the gradient and Hessian of
$f$ directly from the above identity:
\begin{displaymath}
  \nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric positive semidefinite, hence
$f$ is convex. This implies that any local minimum is also a global minimum.
Assuming that $A$ has full column rank, so that $A^TA$ is invertible, we solve
for critical points:
\begin{displaymath}
  \nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence, $f$ has a unique local minimum, which is also the unique global
minimizer, given by $x^* = (A^TA)^{-1}A^Tb$.
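
To spell out the identification step (this adds nothing beyond the computation
above): since $f$ is quadratic, its second-order Taylor expansion
\begin{displaymath}
  f(x+h) = f(x) + \inprod{\nabla f(x), h} + \tfrac{1}{2}\,h^T\nabla^2 f(x)\,h
\end{displaymath}
is exact, and matching the terms that are linear and quadratic in $h$ against
the identity above gives $\nabla f(x) = 2A^T(Ax-b)$ and $\nabla^2 f(x)
= 2A^TA$. Positive semidefiniteness of the Hessian is immediate:
$h^T(2A^TA)h = 2\|Ah\|_2^2 \geq 0$ for every $h\in\R^d$.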

\paragraph{(b)} Starting from $\tilde{x}$, which is a $\beta$-approximation to
the optimal solution, we can obtain a $(1+\eps)$-approximation by running
gradient descent initialized at $\tilde{x}$. We will denote by $\sigma_{min}$
(resp. $\sigma_{max}$) the smallest (resp. largest) singular value of $A$.
Since the eigenvalues of $A^TA$ are the squared singular values of $A$, the
expression of $\nabla^2 f(x)$ found in \textbf{(a)} implies that:
\begin{displaymath}
  2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, the gradient descent update rule (with step size
$1/(2\sigma_{max}^2)$) takes the following form:
\begin{displaymath}
  x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
  = x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b) = \left(I-\frac{1}{\sigma_{max}^2}
  A^TA\right)x_k + \frac{1}{\sigma_{max}^2}A^Tb,
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}. The exact
description of the algorithm is given in Algorithm~\ref{alg:gradient}.

\begin{algorithm}
  \caption{Gradient Descent}
  \label{alg:gradient}
  \begin{algorithmic}[1]
    \Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that
    $\|A\tilde{x} - b\|_2^2\leq \beta \text{OPT}$
    \Ensure $x$ such that
    $\|Ax - b\|_2^2\leq (1+\eps) \text{OPT}$
    \State $x\leftarrow \tilde{x}$
    \While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
      \State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2}
      A^TA\right)x + \frac{1}{\sigma_{max}^2}A^Tb$
    \EndWhile
    \State\Return $x$
  \end{algorithmic}
\end{algorithm}

When the algorithm terminates, we have $x$ such that
$\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2\leq
(1+\eps)\text{OPT}$, which proves correctness.

We now bound the running time. When the algorithm starts, we have
$f(\tilde{x})\leq \beta f(x^*)$, that is, $f(\tilde{x}) - f(x^*)\leq
(\beta-1)f(x^*)$. When it terminates, we have $f(x) - f(x^*)\leq \eps f(x^*)$.
That is, the absolute error has to be reduced by a factor of
$\frac{\beta-1}{\eps}$. We saw in class that this can be achieved by
$O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$
iterations of the gradient descent method. Using the definition of $\kappa(A)$,
this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$ repetitions of the
while loop in Algorithm~\ref{alg:gradient}. Each iteration only involves matrix
arithmetic and can be computed in time $O(nd)$. Overall, the complexity is
$O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By
  pre-computing $A^TA$ and $A^Tb$ (which takes time $O(nd^2)$) and by being
  careful with the order in which we compute the matrix-vector products, we can
  in fact obtain a complexity of
  $O\left(nd^2 + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}
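
For completeness, here is a sketch of one standard way to obtain this
iteration bound; the exact statement seen in class may differ in constants.
Since $2\sigma_{min}^2 I \preceq \nabla^2 f \preceq 2\sigma_{max}^2 I$, the
function $f$ is $2\sigma_{min}^2$-strongly convex and $2\sigma_{max}^2$-smooth,
and gradient descent with step size $1/(2\sigma_{max}^2)$ satisfies
\begin{displaymath}
  f(x_{k+1}) - f(x^*) \leq \left(1 - \frac{\sigma_{min}^2}{\sigma_{max}^2}\right)
  \bigl(f(x_k) - f(x^*)\bigr)
  = \left(1 - \frac{1}{\kappa(A)^2}\right)\bigl(f(x_k) - f(x^*)\bigr).
\end{displaymath}
After $k$ iterations, the initial error $f(\tilde{x}) - f(x^*) \leq
(\beta-1)f(x^*)$ has therefore been reduced to at most
$e^{-k/\kappa(A)^2}(\beta-1)f(x^*)$, so taking $k \geq
\kappa(A)^2\ln\frac{\beta-1}{\eps}$ makes it at most $\eps f(x^*)$, which
matches the bound used above.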

\section*{Problem 4}

Time spent:

\end{document}