\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}
\newenvironment{enumerate*}%
{\vspace{-2ex} \begin{enumerate} %
\setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
{\end{enumerate}}
\newenvironment{itemize*}%
{\vspace{-2ex} \begin{itemize} %
\setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
{\end{itemize}}
\newenvironment{description*}%
{\vspace{-2ex} \begin{description} %
\setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
{\end{description}}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}
\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}
\begin{document}
\maketitle
\section*{Problem 1}
\section*{Problem 2}
\section*{Problem 3}
\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We have,
for all $h\in\R^d$:
\begin{align*}
f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2 = \|Ax-b\|_2^2
+ 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
&=f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can directly read off the gradient and
Hessian of $f$ from the above identity:
\begin{displaymath}
\nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric positive semidefinite, hence
$f$ is convex. This implies that any local minimum is also a global minimum.
Assuming that $A$ has full column rank, $A^TA$ is invertible and we can solve
for critical points:
\begin{displaymath}
\nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence, $f$ has a unique local minimum which is also the unique global
minimizer. Its expression is given by $x^* = (A^TA)^{-1}A^Tb$.
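As a quick sanity check (not part of the required argument), the following
NumPy sketch compares the closed-form minimizer above with a library
least-squares solver on random data; the matrix sizes, random seed and
tolerance are arbitrary choices made for illustration.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d = 50, 5                       # arbitrary illustrative sizes, n >= d
A = rng.standard_normal((n, d))    # random A has full column rank (a.s.)
b = rng.standard_normal(n)

x_closed = np.linalg.solve(A.T @ A, A.T @ b)    # x* = (A^T A)^{-1} A^T b from (a)
x_lstsq = np.linalg.lstsq(A, b, rcond=None)[0]  # library least-squares solution

assert np.allclose(x_closed, x_lstsq)           # both minimize ||Ax - b||_2^2
assert np.allclose(A.T @ (A @ x_closed - b), 0, atol=1e-8)  # gradient vanishes at x*
\end{verbatim}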
\paragraph{(b)} Starting from $\tilde{x}$, which is a $\beta$-approximation to
the optimal solution, we can obtain a $(1+\eps)$-approximation by running
gradient descent initialized at $\tilde{x}$. Let us denote by $\sigma_{min}$
(resp. $\sigma_{max}$) the smallest (resp. largest) singular value of $A$.
Using the expression of $\nabla^2 f(x)$ found in \textbf{(a)}, this implies
that:
\begin{displaymath}
2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, the update rule of gradient descent with step size
$\frac{1}{2\sigma_{max}^2}$ takes the following form:
\begin{displaymath}
x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
= x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b) = \left(I-\frac{1}{\sigma_{max}^2}
A^TA\right)x_k + \frac{1}{\sigma_{max}^2}A^Tb
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}. The exact
description of the algorithm is given in Algorithm~\ref{alg:gradient}.
\begin{algorithm}
\caption{Gradient Descent}
\label{alg:gradient}
\begin{algorithmic}[1]
\Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that
$\|A\tilde{x} - b\|_2^2\leq \beta \text{OPT}$
\Ensure $x$ such that
$\|Ax - b\|_2^2\leq (1+\eps) \text{OPT}$
\State $x\leftarrow \tilde{x}$
\While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
\State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2}
A^TA\right)x + \frac{1}{\sigma_{max}^2}A^Tb$
\EndWhile
\State\Return $x$
\end{algorithmic}
\end{algorithm}
When the algorithm terminates, we have $x$ such that
$\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2\leq
(1+\eps)\text{OPT}$, which proves correctness.
We now want to find a bound on the running time. When we start the algorithm,
we have $f(\tilde{x})\leq \beta f(x^*)$, that is $f(\tilde{x}) - f(x^*)\leq
(\beta-1)f(x^*)$. When the algorithm terminates, we have $f(x) - f(x^*)\leq \eps f(x^*)$.
That is, the absolute error must decrease by a factor of $\frac{\eps}{\beta-1}$. We saw in
class that this can be achieved by
$O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$
iterations of the gradient descent method. Using the definition of $\kappa(A)$,
this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$ repetitions of the
while loop in Algorithm~\ref{alg:gradient}. Each iteration only involves matrix
arithmetic and can be computed in time $O(nd)$. Overall, the complexity is
$O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By
pre-computing $A^TA$, $A^Tb$ and $\|b\|_2^2$ in time $O(nd^2)$, and by
evaluating both the update and the stopping criterion in terms of these
precomputed quantities, each iteration only costs $O(d^2)$, so the complexity
becomes $O\left(nd^2 + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}
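For concreteness, here is a minimal NumPy sketch of
Algorithm~\ref{alg:gradient} that uses the pre-computation described in the
footnote; it is only an illustration, and the function name \texttt{refine} as
well as the variable names are ours.
\begin{verbatim}
import numpy as np

def refine(A, b, x_tilde, beta, eps):
    """Sketch of Algorithm 1: gradient descent from x_tilde, step 1/(2*sigma_max^2)."""
    sigma_max = np.linalg.norm(A, 2)             # largest singular value of A
    AtA, Atb, btb = A.T @ A, A.T @ b, b @ b      # precomputed once, O(nd^2)

    def f(x):                                    # ||Ax - b||_2^2, evaluated in O(d^2)
        return x @ AtA @ x - 2 * Atb @ x + btb

    x = x_tilde.copy()
    threshold = (1 + eps) / beta * f(x_tilde)    # while-loop threshold of Algorithm 1
    while f(x) > threshold:
        x = x - (AtA @ x - Atb) / sigma_max**2   # one gradient step, O(d^2)
    return x
\end{verbatim}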
\section*{Problem 4}
Time spent:
\end{document}