\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate}%
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}
\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize}%
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}
\newenvironment{description*}%
  {\vspace{-2ex} \begin{description}%
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}

\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}

\begin{document}
\maketitle

\section*{Problem 1}

\paragraph{(a)} Let us consider a vertex $x$ of $P$. We saw in class that
there exists a subset $M$ of $m$ indices in $\{1,\ldots,n\}$ such that the
corresponding columns of $A$ are linearly independent and such that the
coordinates of $x$ whose indices lie outside of $M$ are zero. We denote by
$A_M$ the submatrix extracted from $A$ by keeping only the columns in $M$.
$A_M$ is an $m\times m$ matrix whose columns are independent: it is a square
invertible matrix. Furthermore we have $Ax = A_M x_M = b$, where we denote by
$x_M$ the vector obtained from $x$ by keeping only the coordinates in $M$.
Cramer's rule then implies that for $i\in M$:
\begin{displaymath}
  x_i = \frac{\det A_i}{\det A_M}
\end{displaymath}
where $A_i$ is the matrix obtained from $A_M$ by replacing the $i$-th column
by $b$. Since $A$ is TUM we have:
\begin{itemize*}
\item all coefficients of $A$ are in $\{-1, 0, 1\}$. Since $b$ only has
  integer entries, this implies that $\det A_i$ is an integer (the
  determinant of a matrix with integer entries is a polynomial in the
  entries and is thus an integer).
\item $A_M$ is a square submatrix of $A$, hence its determinant is in
  $\{-1, 0, 1\}$; since $A_M$ is invertible, in fact $\det A_M = \pm 1$.
\end{itemize*}
The two points above and the formula for $x_i$ given by Cramer's rule imply
that $x_i$ is an integer for $i\in M$. Since $x$ has zero coordinates for
indices outside of $M$, we conclude that $x$ is integral.

\paragraph{(b)} We first note that $A$ is the $n\times m$ matrix defined by
$A_{i,e}=1$ if vertex $i\in V$ is incident to edge $e\in E$ and $A_{i,e}=0$
otherwise. Each edge is incident to exactly two vertices, so each column of
$A$ has exactly two entries equal to one (the other entries being zero).
Furthermore, since $G$ is bipartite, we can partition the vertices into two
subsets $V_1$ and $V_2$ such that edges only exist between pairs of vertices
in $V_1\times V_2$. In matrix notation, this means that there exists a
partition $(I, J)$ of $\{1,\ldots, n\}$ such that for each column of $A$,
the indices $i$ and $j$ of the two rows containing a one in that column
satisfy $i\in I$ and $j\in J$.
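Before the proof, here is a small numerical sanity check in Python (purely
illustrative, and of course not a substitute for the induction below): we
build the incidence matrix of the bipartite graph $K_{2,3}$ and verify by
brute force that every square submatrix has determinant in $\{-1,0,1\}$. The
choice of graph and the exhaustive enumeration are our own, made only for
illustration.
\begin{verbatim}
import numpy as np
from itertools import combinations

# Incidence matrix of K_{2,3}: rows are vertices, columns are edges.
V1, V2 = [0, 1], [2, 3, 4]
edges = [(u, v) for u in V1 for v in V2]
A = np.zeros((5, len(edges)), dtype=int)
for j, (u, v) in enumerate(edges):
    A[u, j] = 1  # each column has exactly two ones:
    A[v, j] = 1  # one in a row of V1, one in a row of V2

# Check that every square submatrix has determinant in {-1, 0, 1}.
n, m = A.shape
for k in range(1, min(n, m) + 1):
    for rows in combinations(range(n), k):
        for cols in combinations(range(m), k):
            d = round(np.linalg.det(A[np.ix_(rows, cols)]))
            assert d in (-1, 0, 1), (rows, cols, d)
print("A is totally unimodular on this example")
\end{verbatim}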
We now prove that $A$ is TUM. For this, we prove by induction on $k$ that
any square submatrix of $A$ of size $k$ has determinant in $\{-1, 0, 1\}$.
Since $A$ is a $0/1$ matrix, the case $k=1$ is trivial. Suppose that the
result holds for submatrices of size $k$ and consider a submatrix $A'$ of
$A$ of size $k+1$. We distinguish three cases:
\begin{enumerate*}
\item If some column of $A'$ has only zero entries, then $\det A'=0$ and we
  are done.
\item If some column of $A'$ has exactly one entry equal to one, at position
  $(i,j)$ (the other entries being zero), then the cofactor expansion along
  this column gives $\det A' = C_{i,j}$, where $C_{i,j}$ is the $(i,j)$
  cofactor of $A'$. $C_{i,j}$ is $\pm 1$ times the determinant of a
  size-$k$ submatrix of $A$, so by the induction hypothesis
  $C_{i,j}\in\{-1, 0, 1\}$ and we are done again in this case.
\item Otherwise, all columns of $A'$ contain exactly two entries equal to
  one (the other entries being zero). Since $G$ is bipartite, as explained
  at the beginning of this question, there exists a partition $(I, J)$ of
  the rows of $A'$ such that the two ``one'' entries of each column lie
  respectively in $I$ and $J$. But then $x^T A' = 0$ where $x_i = 1$ for
  $i\in I$ and $x_j = -1$ for $j\in J$ (in each column we add $1 - 1 = 0$).
  We have found a vanishing linear combination of the rows of $A'$, hence
  $\det A' = 0$ and we are done.
\end{enumerate*}
In all three cases $\det A'\in\{-1,0,1\}$, and we conclude by the induction
principle.

\paragraph{(c)} We prove this by contraposition. Assume that $G$ is not
bipartite; then it contains a cycle of odd length. Indeed, if $G$ had no odd
cycle, we could find a two-coloring of the graph by breadth-first search:
every time we visit a new node in the BFS traversal, we give it the color
opposite to that of its parent in the BFS tree. An edge from a newly visited
node to a previously visited node of the same color would imply a cycle of
odd length, so in the absence of odd cycles this is a valid two-coloring,
contradicting that $G$ is not bipartite. Hence, let $v_1,\ldots, v_k$ be a
cycle of odd length ($k$ is odd) and consider the submatrix $A'$ of $A$
where we only keep the rows associated with vertices $v_1$ to $v_k$ and the
columns associated with edges $(v_1, v_2)$, $(v_2, v_3)$, \ldots,
$(v_{k-1}, v_k)$ and $(v_k, v_1)$. Up to a reordering of the rows and the
columns, $A'$ has the following form:
\begin{displaymath}
  A' =
  \begin{pmatrix}
    1&0&\cdots&0&1\\
    1&1&\ddots&\vdots&0\\
    0&1&\ddots&0&\vdots\\
    \vdots&\ddots&\ddots&1&0\\
    0&\cdots&0&1&1\\
  \end{pmatrix}
\end{displaymath}
The cofactor expansion of $A'$ along the last column gives
$\det A' = (-1)^{k+1} + 1 = 2$ since $k$ is odd. As the reordering of the
rows and columns only multiplies the determinant by $\pm 1$, the original
submatrix has determinant $\pm 2$. This shows that $A$ is not TUM and
concludes the proof of this question.
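This determinant computation is easy to confirm numerically. The short
Python sketch below (illustrative only; the helper name
\texttt{cycle\_submatrix} is our own) builds the $k\times k$ incidence
submatrix of a length-$k$ cycle and evaluates its determinant: it equals
$2$ for odd $k$, and $0$ for even $k$, where the cycle is bipartite.
\begin{verbatim}
import numpy as np

def cycle_submatrix(k):
    # k x k submatrix of the incidence matrix of a length-k cycle:
    # column j corresponds to edge (v_j, v_{j+1}) (indices mod k)
    # and has ones in rows j and (j + 1) mod k.
    A = np.zeros((k, k), dtype=int)
    for j in range(k):
        A[j, j] = 1
        A[(j + 1) % k, j] = 1
    return A

for k in range(3, 9):
    print(k, round(np.linalg.det(cycle_submatrix(k))))
# prints 2 for odd k (A is not TUM), 0 for even k (bipartite cycle)
\end{verbatim}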
\section*{Problem 2}

\paragraph{(a)} I claim that an optimal solution to the integer program is
in fact an optimal solution to the cheapest rooted tree problem. First we
note that any feasible solution to the cheapest rooted tree problem is also
feasible for the integer program: indeed, for any set of vertices
$S\subseteq V\setminus\{r\}$, consider some vertex $v$ in $S$. There exists
a directed path from $r$ to $v$ (since we have a directed spanning tree
rooted at $r$). Since $r\notin S$ and $v\in S$, one of the edges of that
path has to cross from $V\setminus S$ to $S$ at some point, so we have at
least one edge entering the set $S$.

Now, let $x$ be an optimal solution to the integer program. We make two
observations:
\begin{enumerate}
\item There exists a directed path (using only edges selected by $x$) from
  the root to any vertex $v\in V\setminus\{r\}$. To see this, consider the
  set $S_v$ of all vertices $u$ such that there exists a directed path from
  $u$ to $v$. Either $S_v$ contains $r$ and we are done, or it does not, in
  which case by feasibility of the solution there exists a selected edge
  $(w, u)$ entering $S_v$. But $u\in S_v$, so there exists a directed path
  from $u$ to $v$, hence also a directed path from $w$ to $v$, which is a
  contradiction since we assumed $w\notin S_v$.
\item There exists a \emph{unique} directed path from the root to any vertex
  $v\in V\setminus\{r\}$. Indeed, assume that for some vertex $v$ there are
  at least two paths from the root to $v$. This implies that some vertex has
  at least two entering edges. By removing all but one of those edges, we
  obtain a solution of strictly smaller cost. This solution is still
  feasible since we maintain the property that there exists a directed path
  from the root to any vertex (this property is enough to ensure
  feasibility, as we saw in the proof above that a rooted tree is feasible
  for the integer program). This is a contradiction.
\end{enumerate}
Observations 1 and 2 together imply that an optimal solution to the integer
program is a rooted directed tree. We have shown that any solution to the
cheapest rooted tree problem is feasible for the integer program and that
the optimal solution to the integer program is a feasible rooted tree.
Hence, given an optimal solution to the integer program, we can obtain an
optimal solution to the cheapest rooted tree problem in constant time:
simply output the given solution.

\paragraph{(b)} We saw in class that we can solve a linear program with the
ellipsoid method by making polynomially many queries to a separation oracle:
given a polytope and a point outside the polytope, the separation oracle
should return a hyperplane separating the point from the polytope. We will
show how to implement such an oracle that answers queries in polynomial
time.

Let us denote by $P$ the polytope defined by the linear constraints of the
IP. Finding a separating hyperplane between $x\notin P$ and $P$ is
equivalent to finding a violated constraint (a constraint defines a
hyperplane; saying that it is violated means that $x$ lies on one side of
the hyperplane while by definition the polytope lies on the other side). Let
us consider $x\notin P$. We know that there exists a violated constraint and
we want to find it. A violated constraint is a set
$S\subseteq V\setminus\{r\}$ such that the sum of $x_e$ over the edges $e$
entering $S$ is strictly less than one. This means that for any vertex
$v\in S$, the cost of the minimum cut between $r$ and $v$ is strictly less
than one (since $S$ and its complement already provide such a cut of cost
strictly less than one). Here the cost of a cut is defined with respect to
the values $x_e$ for each edge $e$. This suggests the following algorithm to
find a violated constraint: for each vertex $v\in V\setminus\{r\}$, compute
the minimum cut between $r$ and $v$. Because $x\notin P$, at least one of
these cuts has cost strictly less than one, and for such a cut, the side
which does not contain $r$ defines a violated constraint for $x$. The
running time of this separation oracle is $O(|V|\, T)$ where $T$ is the time
needed to compute a minimum cut (which is polynomial in $|V|$ and $|E|$).
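To make this concrete, here is a minimal Python sketch of this separation
oracle, assuming the fractional point is given as a dictionary \texttt{x} of
edge values and using \texttt{networkx}'s \texttt{minimum\_cut} as the
min-cut subroutine (any polynomial-time min-cut routine would do; the
function name and the tolerance are our own illustrative choices).
\begin{verbatim}
import networkx as nx

def find_violated_constraint(V, E, x, r, tol=1e-9):
    """Return a set S not containing r whose entering x-weight is
    strictly below 1 (a violated constraint), or None if none is
    found among the |V| - 1 minimum cuts."""
    G = nx.DiGraph()
    G.add_nodes_from(V)
    for (u, v) in E:
        G.add_edge(u, v, capacity=x[(u, v)])
    for v in V:
        if v == r:
            continue
        # Minimum r-v cut: the least total x-weight entering any
        # set S with v inside S and r outside S.
        cut_value, (r_side, v_side) = nx.minimum_cut(G, r, v)
        if cut_value < 1 - tol:
            return v_side
    return None
\end{verbatim}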
This shows that the IP can be solved in polynomial time, since we only make
polynomially many queries to the separation oracle.

\section*{Problem 3}

\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We
have, for all $h\in\R^d$:
\begin{align*}
  f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2
  = \|Ax-b\|_2^2 + 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
  &= f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can directly read the gradient and
Hessian of $f$ from the above identity:
\begin{displaymath}
  \nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric positive semidefinite, hence
$f$ is convex. This implies that any local minimum is also a global minimum.
Assuming that $A$ has full column rank (so that $A^TA$ is invertible), we
solve for critical points:
\begin{displaymath}
  \nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence $f$ has a unique local minimum, which is also the unique global
minimizer: $x^* = (A^TA)^{-1}A^Tb$.

\paragraph{(b)} Starting from $\tilde{x}$, which is a $\beta$-approximation
to the optimal solution, we can obtain a $(1+\eps)$-approximation by running
gradient descent starting from $\tilde{x}$. We denote by $\sigma_{min}$
(resp.\ $\sigma_{max}$) the smallest (resp.\ largest) singular value of $A$.
Using the expression of $\nabla^2 f(x)$ found in \textbf{(a)}, this implies:
\begin{displaymath}
  2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, the update rule of the gradient descent takes the
following form:
\begin{displaymath}
  x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
  = x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b)
  = \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x_k
  + \frac{1}{\sigma_{max}^2}A^Tb,
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}. The
exact description of the algorithm is given in Algorithm~\ref{alg:gradient}.
\begin{algorithm}
  \caption{Gradient Descent}
  \label{alg:gradient}
  \begin{algorithmic}[1]
    \Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that
    $\|A\tilde{x} - b\|_2^2\leq \beta\,\text{OPT}$
    \Ensure $x$ such that $\|Ax - b\|_2^2\leq (1+\eps)\,\text{OPT}$
    \State $x\leftarrow \tilde{x}$
    \While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
    \State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x
    + \frac{1}{\sigma_{max}^2}A^Tb$
    \EndWhile
    \State\Return $x$
  \end{algorithmic}
\end{algorithm}
When the algorithm terminates, we have $x$ such that
$\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2
\leq (1+\eps)\,\text{OPT}$, which proves correctness. We now bound the
running time. When the algorithm starts, we have
$f(\tilde{x})\leq \beta f(x^*)$, that is,
$f(\tilde{x}) - f(x^*)\leq (\beta-1)f(x^*)$. When it ends, we have
$f(x) - f(x^*)\leq \eps f(x^*)$. In other words, the absolute error must be
multiplied by a factor $\frac{\eps}{\beta-1}$. We saw in class that this can
be achieved with
$O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$
iterations of the gradient descent method. Using the definition of
$\kappa(A)$, this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$
repetitions of the while loop of Algorithm~\ref{alg:gradient}. Each
iteration only involves matrix arithmetic and can be computed in time
$O(nd)$. Overall the complexity is
$O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By
pre-computing $A^TA$ and $A^Tb$, and by being careful about the order in
which we compute the matrix--vector products, we can in fact achieve a
complexity of $O\left(nd + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}
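As a concrete illustration, here is a minimal Python sketch of this
procedure, assuming dense NumPy arrays and $\beta > 1$. Instead of the
while-loop test of Algorithm~\ref{alg:gradient}, it runs a fixed iteration
budget of order $\kappa(A)^2\log\frac{\beta-1}{\eps}$, matching the bound
above (the constant \texttt{c} and the clamping of the logarithm are our own
illustrative choices, not derived here); it also pre-computes $A^TA$ and
$A^Tb$ as in the footnote, so each iteration costs $O(d^2)$.
\begin{verbatim}
import numpy as np

def gd_least_squares(A, b, x0, beta, eps, c=2.0):
    # Gradient descent on f(x) = ||Ax - b||^2 with step size
    # 1/(2 sigma_max^2), run for a fixed budget of order
    # kappa(A)^2 * log((beta - 1)/eps).  Assumes beta > 1.
    s = np.linalg.svd(A, compute_uv=False)   # singular values, descending
    kappa2 = (s[0] / s[-1]) ** 2             # kappa(A)^2
    K = int(np.ceil(c * kappa2 * max(np.log((beta - 1) / eps), 1.0)))
    AtA = A.T @ A                            # pre-computed once
    Atb = A.T @ b                            # pre-computed once
    x = np.array(x0, dtype=float)
    for _ in range(K):
        x -= (AtA @ x - Atb) / s[0] ** 2     # x - grad f(x)/(2 sigma_max^2)
    return x
\end{verbatim}
On a well-conditioned instance, the output converges toward the closed-form
minimizer $(A^TA)^{-1}A^Tb$ of part \textbf{(a)}, as the analysis above
predicts.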
\section*{Problem 4}

Time spent: Problem 1 (1h30), Problem 2 (1h), Problem 3 (1h). Total time:
3h30.

\end{document}