\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\newenvironment{enumerate*}%
  {\vspace{-2ex} \begin{enumerate}%
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{enumerate}}
\newenvironment{itemize*}%
  {\vspace{-2ex} \begin{itemize}%
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{itemize}}
\newenvironment{description*}%
  {\vspace{-2ex} \begin{description}%
   \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbf{R}}
\newcommand{\N}{\mathbf{N}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\algrenewcommand\algorithmicensure{\textbf{Output:}}
\algrenewcommand\algorithmicrequire{\textbf{Input:}}

\author{Thibaut Horel}
\title{CS 224 Problem Set 6 -- Solutions}
\date{November 5, 2014}

\begin{document}
\maketitle

\section*{Problem 1}

\paragraph{(a)} Let us consider a vertex $x$ of $P$. We saw in class that
there exists a subset $M$ of $m$ indices in $\{1,\ldots,n\}$ such that the
corresponding columns of $A$ are linearly independent and such that the
coordinates of $x$ whose indices lie outside of $M$ are zero. We denote by
$A_M$ the submatrix extracted from $A$ by keeping only the columns in $M$.
$A_M$ is an $m\times m$ matrix whose columns are independent: it is a square
invertible matrix. Furthermore we have $Ax = A_M x_M = b$, where we denote by
$x_M$ the vector obtained from $x$ by keeping only the coordinates in $M$.
Cramer's rule then implies that for $i\in M$:
\begin{displaymath}
  x_i = \frac{\det A_i}{\det A_M}
\end{displaymath}
where $A_i$ is the matrix obtained from $A_M$ by replacing the $i$-th column
by $b$. Since $A$ is TUM we have:
\begin{itemize*}
\item all coefficients of $A$ are in $\{-1, 0, 1\}$. Since $b$ only has
  integer entries, this implies that $\det A_i$ is an integer (the
  determinant of a matrix with integer entries is a polynomial in the
  entries and is thus an integer).
\item $A_M$ is a square submatrix of $A$, hence its determinant is in
  $\{-1, 0, 1\}$; since $A_M$ is invertible, in fact $\det A_M = \pm 1$.
\end{itemize*}
The two points above and the formula for $x_i$ given by Cramer's rule imply
that $x_i$ is an integer for $i\in M$. Since $x$ has zero coordinates for
indices outside of $M$, we conclude that $x$ is integral.

\paragraph{(b)} We first note that $A$ is the $n\times m$ matrix defined by
$A_{i,e}=1$ if vertex $i\in V$ is incident to edge $e\in E$ and $A_{i,e}=0$
otherwise. Each edge is incident to exactly two vertices, so each column of
$A$ has exactly two entries equal to one (the other entries being zero).
Furthermore, since $G$ is bipartite, we can partition the vertices into two
subsets $V_1$ and $V_2$ such that edges only exist between pairs of vertices
in $V_1\times V_2$. In matrix notation, this means that there exists a
partition $(I, J)$ of $\{1,\ldots, n\}$ such that for each column of $A$,
the indices $i$ and $j$ of the two rows containing a one in that column
satisfy $i\in I$ and $j\in J$.
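Before the proof, here is a small numerical sanity check in Python (purely
illustrative, and of course not a substitute for the induction below): we
build the incidence matrix of the bipartite graph $K_{2,3}$ and verify by
brute force that every square submatrix has determinant in $\{-1,0,1\}$. The
choice of graph and the exhaustive enumeration are our own, made only for
illustration.
\begin{verbatim}
import numpy as np
from itertools import combinations

# Incidence matrix of K_{2,3}: rows are vertices, columns are edges.
V1, V2 = [0, 1], [2, 3, 4]
edges = [(u, v) for u in V1 for v in V2]
A = np.zeros((5, len(edges)), dtype=int)
for j, (u, v) in enumerate(edges):
    A[u, j] = 1  # each column has exactly two ones:
    A[v, j] = 1  # one in a row of V1, one in a row of V2

# Check that every square submatrix has determinant in {-1, 0, 1}.
n, m = A.shape
for k in range(1, min(n, m) + 1):
    for rows in combinations(range(n), k):
        for cols in combinations(range(m), k):
            d = round(np.linalg.det(A[np.ix_(rows, cols)]))
            assert d in (-1, 0, 1), (rows, cols, d)
print("A is totally unimodular on this example")
\end{verbatim}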
We now prove that $A$ is TUM. For this, we prove by induction on $k$ that
any square submatrix of $A$ of size $k$ has determinant in $\{-1, 0, 1\}$.
Since $A$ is a $0/1$ matrix, the case $k=1$ is trivial. Suppose that the
result holds for submatrices of size $k$ and consider a submatrix $A'$ of
$A$ of size $k+1$. We distinguish three cases:
\begin{enumerate*}
\item If some column of $A'$ has only zero entries, then $\det A'=0$ and we
  are done.
\item If some column of $A'$ has exactly one entry equal to one, at position
  $(i,j)$ (the other entries being zero), then the cofactor expansion along
  this column gives $\det A' = C_{i,j}$, where $C_{i,j}$ is the $(i,j)$
  cofactor of $A'$. $C_{i,j}$ is $\pm 1$ times the determinant of a
  size-$k$ submatrix of $A$, so by the induction hypothesis
  $C_{i,j}\in\{-1, 0, 1\}$ and we are done again in this case.
\item Otherwise, all columns of $A'$ contain exactly two entries equal to
  one (the other entries being zero). Since $G$ is bipartite, as explained
  at the beginning of this question, there exists a partition $(I, J)$ of
  the rows of $A'$ such that the two ``one'' entries of each column lie
  respectively in $I$ and $J$. But then $x^T A' = 0$ where $x_i = 1$ for
  $i\in I$ and $x_j = -1$ for $j\in J$ (in each column we add $1 - 1 = 0$).
  We have found a vanishing linear combination of the rows of $A'$, hence
  $\det A' = 0$ and we are done.
\end{enumerate*}
In all three cases $\det A'\in\{-1,0,1\}$, and we conclude by the induction
principle.

\paragraph{(c)} We prove this by contraposition. Assume that $G$ is not
bipartite; then it contains a cycle of odd length. Indeed, if $G$ had no odd
cycle, we could find a two-coloring of the graph by breadth-first search:
every time we visit a new node in the BFS traversal, we give it the color
opposite to that of its parent in the BFS tree. An edge from a newly visited
node to a previously visited node of the same color would imply a cycle of
odd length, so in the absence of odd cycles this is a valid two-coloring,
contradicting that $G$ is not bipartite. Hence, let $v_1,\ldots, v_k$ be a
cycle of odd length ($k$ is odd) and consider the submatrix $A'$ of $A$
where we only keep the rows associated with vertices $v_1$ to $v_k$ and the
columns associated with edges $(v_1, v_2)$, $(v_2, v_3)$, \ldots,
$(v_{k-1}, v_k)$ and $(v_k, v_1)$. Up to a reordering of the rows and the
columns, $A'$ has the following form:
\begin{displaymath}
  A' =
  \begin{pmatrix}
    1&0&\cdots&0&1\\
    1&1&\ddots&\vdots&0\\
    0&1&\ddots&0&\vdots\\
    \vdots&\ddots&\ddots&1&0\\
    0&\cdots&0&1&1\\
  \end{pmatrix}
\end{displaymath}
The cofactor expansion of $A'$ along the last column gives
$\det A' = (-1)^{k+1} + 1 = 2$ since $k$ is odd. As the reordering of the
rows and columns only multiplies the determinant by $\pm 1$, the original
submatrix has determinant $\pm 2$. This shows that $A$ is not TUM and
concludes the proof of this question.
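This determinant computation is easy to confirm numerically. The short
Python sketch below (illustrative only; the helper name
\texttt{cycle\_submatrix} is our own) builds the $k\times k$ incidence
submatrix of a length-$k$ cycle and evaluates its determinant: it equals
$2$ for odd $k$, and $0$ for even $k$, where the cycle is bipartite.
\begin{verbatim}
import numpy as np

def cycle_submatrix(k):
    # k x k submatrix of the incidence matrix of a length-k cycle:
    # column j corresponds to edge (v_j, v_{j+1}) (indices mod k)
    # and has ones in rows j and (j + 1) mod k.
    A = np.zeros((k, k), dtype=int)
    for j in range(k):
        A[j, j] = 1
        A[(j + 1) % k, j] = 1
    return A

for k in range(3, 9):
    print(k, round(np.linalg.det(cycle_submatrix(k))))
# prints 2 for odd k (A is not TUM), 0 for even k (bipartite cycle)
\end{verbatim}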
\section*{Problem 2}

\paragraph{(a)} I claim that an optimal solution to the integer program is
in fact an optimal solution to the cheapest rooted tree problem. First we
note that any feasible solution to the cheapest rooted tree problem is also
feasible for the integer program: indeed, for any set of vertices
$S\subseteq V\setminus\{r\}$, consider some vertex $v$ in $S$. There exists
a directed path from $r$ to $v$ (since we have a directed spanning tree
rooted at $r$). Since $r\notin S$ and $v\in S$, one of the edges of that
path has to cross from $V\setminus S$ to $S$ at some point, so we have at
least one edge entering the set $S$.

Now, let $x$ be an optimal solution to the integer program. We make two
observations:
\begin{enumerate}
\item There exists a directed path (using only edges selected by $x$) from
  the root to any vertex $v\in V\setminus\{r\}$. To see this, consider the
  set $S_v$ of all vertices $u$ such that there exists a directed path from
  $u$ to $v$. Either $S_v$ contains $r$ and we are done, or it does not, in
  which case by feasibility of the solution there exists a selected edge
  $(w, u)$ entering $S_v$. But $u\in S_v$, so there exists a directed path
  from $u$ to $v$, hence also a directed path from $w$ to $v$, which is a
  contradiction since we assumed $w\notin S_v$.
\item There exists a \emph{unique} directed path from the root to any vertex
  $v\in V\setminus\{r\}$. Indeed, assume that for some vertex $v$ there are
  at least two paths from the root to $v$. This implies that some vertex has
  at least two entering edges. By removing all but one of those edges, we
  obtain a solution of strictly smaller cost. This solution is still
  feasible since we maintain the property that there exists a directed path
  from the root to any vertex (this property is enough to ensure
  feasibility, as we saw in the proof above that a rooted tree is feasible
  for the integer program). This is a contradiction.
\end{enumerate}
Observations 1 and 2 together imply that an optimal solution to the integer
program is a rooted directed tree. We have shown that any solution to the
cheapest rooted tree problem is feasible for the integer program and that
the optimal solution to the integer program is a feasible rooted tree.
Hence, given an optimal solution to the integer program, we can obtain an
optimal solution to the cheapest rooted tree problem in constant time:
simply output the given solution.

\paragraph{(b)} We saw in class that we can solve a linear program with the
ellipsoid method by making polynomially many queries to a separation oracle:
given a polytope and a point outside the polytope, the separation oracle
should return a hyperplane separating the point from the polytope. We will
show how to implement such an oracle that answers queries in polynomial
time.

Let us denote by $P$ the polytope defined by the linear constraints of the
IP. Finding a separating hyperplane between $x\notin P$ and $P$ is
equivalent to finding a violated constraint (a constraint defines a
hyperplane; saying that it is violated means that $x$ lies on one side of
the hyperplane while by definition the polytope lies on the other side). Let
us consider $x\notin P$. We know that there exists a violated constraint and
we want to find it. A violated constraint is a set
$S\subseteq V\setminus\{r\}$ such that the sum of $x_e$ over the edges $e$
entering $S$ is strictly less than one. This means that for any vertex
$v\in S$, the cost of the minimum cut between $r$ and $v$ is strictly less
than one (since $S$ and its complement already provide such a cut of cost
strictly less than one). Here the cost of a cut is defined with respect to
the values $x_e$ for each edge $e$. This suggests the following algorithm to
find a violated constraint: for each vertex $v\in V\setminus\{r\}$, compute
the minimum cut between $r$ and $v$. Because $x\notin P$, at least one of
these cuts has cost strictly less than one, and for such a cut, the side
which does not contain $r$ defines a violated constraint for $x$. The
running time of this separation oracle is $O(|V|\, T)$ where $T$ is the time
needed to compute a minimum cut (which is polynomial in $|V|$ and $|E|$).
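To make this concrete, here is a minimal Python sketch of this separation
oracle, assuming the fractional point is given as a dictionary \texttt{x} of
edge values and using \texttt{networkx}'s \texttt{minimum\_cut} as the
min-cut subroutine (any polynomial-time min-cut routine would do; the
function name and the tolerance are our own illustrative choices).
\begin{verbatim}
import networkx as nx

def find_violated_constraint(V, E, x, r, tol=1e-9):
    """Return a set S not containing r whose entering x-weight is
    strictly below 1 (a violated constraint), or None if none is
    found among the |V| - 1 minimum cuts."""
    G = nx.DiGraph()
    G.add_nodes_from(V)
    for (u, v) in E:
        G.add_edge(u, v, capacity=x[(u, v)])
    for v in V:
        if v == r:
            continue
        # Minimum r-v cut: the least total x-weight entering any
        # set S with v inside S and r outside S.
        cut_value, (r_side, v_side) = nx.minimum_cut(G, r, v)
        if cut_value < 1 - tol:
            return v_side
    return None
\end{verbatim}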
This shows that the IP can be solved in polynomial time, since we only make
polynomially many queries to the separation oracle.

\section*{Problem 3}

\paragraph{(a)} Let us write $f(x) = \|Ax - b\|^2_2$ for $x\in\R^d$. We
have, for all $h\in\R^d$:
\begin{align*}
  f(x+h) &= \|A(x+h) - b\|_2^2 = \|Ax - b + Ah\|_2^2
  = \|Ax-b\|_2^2 + 2\inprod{Ax-b, Ah} + \|Ah\|_2^2\\
  &= f(x) + 2\inprod{A^T(Ax-b), h} + h^TA^TAh.
\end{align*}
By uniqueness of the Taylor expansion, we can directly read the gradient and
Hessian of $f$ from the above identity:
\begin{displaymath}
  \nabla f(x) = 2A^T(Ax-b)\quad\text{and}\quad \nabla^2f(x) = 2A^TA.
\end{displaymath}
We observe that the Hessian matrix is symmetric positive semidefinite, hence
$f$ is convex. This implies that any local minimum is also a global minimum.
Assuming that $A$ has full column rank (so that $A^TA$ is invertible), we
solve for critical points:
\begin{displaymath}
  \nabla f(x) = 0\quad \Longleftrightarrow \quad x = (A^TA)^{-1}A^Tb.
\end{displaymath}
Hence $f$ has a unique local minimum, which is also the unique global
minimizer: $x^* = (A^TA)^{-1}A^Tb$.

\paragraph{(b)} Starting from $\tilde{x}$, which is a $\beta$-approximation
to the optimal solution, we can obtain a $(1+\eps)$-approximation by running
gradient descent starting from $\tilde{x}$. We denote by $\sigma_{min}$
(resp.\ $\sigma_{max}$) the smallest (resp.\ largest) singular value of $A$.
Using the expression of $\nabla^2 f(x)$ found in \textbf{(a)}, this implies:
\begin{displaymath}
  2\sigma_{min}^2 I \preceq \nabla^2 f(x) \preceq 2\sigma_{max}^2 I.
\end{displaymath}
As a consequence, the update rule of the gradient descent takes the
following form:
\begin{displaymath}
  x_{k+1} \leftarrow x_k - \frac{1}{2\sigma_{max}^2}\nabla f(x_k)
  = x_k - \frac{1}{\sigma_{max}^2}A^T(Ax_k - b)
  = \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x_k
  + \frac{1}{\sigma_{max}^2}A^Tb,
\end{displaymath}
where we used the expression of the gradient found in \textbf{(a)}. The
exact description of the algorithm is given in Algorithm~\ref{alg:gradient}.
\begin{algorithm}
  \caption{Gradient Descent}
  \label{alg:gradient}
  \begin{algorithmic}[1]
    \Require $(A, b, \sigma_{max}, \beta, \tilde{x}, \eps)$ such that
    $\|A\tilde{x} - b\|_2^2\leq \beta\,\text{OPT}$
    \Ensure $x$ such that $\|Ax - b\|_2^2\leq (1+\eps)\,\text{OPT}$
    \State $x\leftarrow \tilde{x}$
    \While{$\|Ax-b\|_2^2 > \frac{1+\eps}{\beta}\|A\tilde{x} - b\|_2^2$}
    \State $x\leftarrow \left(I-\frac{1}{\sigma_{max}^2} A^TA\right)x
    + \frac{1}{\sigma_{max}^2}A^Tb$
    \EndWhile
    \State\Return $x$
  \end{algorithmic}
\end{algorithm}
When the algorithm terminates, we have $x$ such that
$\|Ax-b\|_2^2\leq\frac{1+\eps}{\beta}\|A\tilde{x}-b\|_2^2
\leq (1+\eps)\,\text{OPT}$, which proves correctness. We now bound the
running time. When the algorithm starts, we have
$f(\tilde{x})\leq \beta f(x^*)$, that is,
$f(\tilde{x}) - f(x^*)\leq (\beta-1)f(x^*)$. When it ends, we have
$f(x) - f(x^*)\leq \eps f(x^*)$. In other words, the absolute error must be
multiplied by a factor $\frac{\eps}{\beta-1}$. We saw in class that this can
be achieved with
$O\left(\frac{\sigma_{max}^2}{\sigma_{min}^2}\log\frac{\beta-1}{\eps}\right)$
iterations of the gradient descent method. Using the definition of
$\kappa(A)$, this is $O\left(\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$
repetitions of the while loop of Algorithm~\ref{alg:gradient}. Each
iteration only involves matrix arithmetic and can be computed in time
$O(nd)$. Overall the complexity is
$O\left(nd\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.\footnote{By
pre-computing $A^TA$ and $A^Tb$, and by being careful about the order in
which we compute the matrix--vector products, we can in fact achieve a
complexity of $O\left(nd + d^2\,\kappa(A)^2\log\frac{\beta-1}{\eps}\right)$.}
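As a concrete illustration, here is a minimal Python sketch of this
procedure, assuming dense NumPy arrays and $\beta > 1$. Instead of the
while-loop test of Algorithm~\ref{alg:gradient}, it runs a fixed iteration
budget of order $\kappa(A)^2\log\frac{\beta-1}{\eps}$, matching the bound
above (the constant \texttt{c} and the clamping of the logarithm are our own
illustrative choices, not derived here); it also pre-computes $A^TA$ and
$A^Tb$ as in the footnote, so each iteration costs $O(d^2)$.
\begin{verbatim}
import numpy as np

def gd_least_squares(A, b, x0, beta, eps, c=2.0):
    # Gradient descent on f(x) = ||Ax - b||^2 with step size
    # 1/(2 sigma_max^2), run for a fixed budget of order
    # kappa(A)^2 * log((beta - 1)/eps).  Assumes beta > 1.
    s = np.linalg.svd(A, compute_uv=False)   # singular values, descending
    kappa2 = (s[0] / s[-1]) ** 2             # kappa(A)^2
    K = int(np.ceil(c * kappa2 * max(np.log((beta - 1) / eps), 1.0)))
    AtA = A.T @ A                            # pre-computed once
    Atb = A.T @ b                            # pre-computed once
    x = np.array(x0, dtype=float)
    for _ in range(K):
        x -= (AtA @ x - Atb) / s[0] ** 2     # x - grad f(x)/(2 sigma_max^2)
    return x
\end{verbatim}
On a well-conditioned instance, the output converges toward the closed-form
minimizer $(A^TA)^{-1}A^Tb$ of part \textbf{(a)}, as the analysis above
predicts.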
\section*{Problem 4}

Time spent: Problem 1 (1h30), Problem 2 (1h), Problem 3 (1h). Total time:
3h30.

\end{document}