| field | value | date |
|---|---|---|
| author | jeanpouget-abadie <jean.pougetabadie@gmail.com> | 2015-03-19 11:34:03 -0400 |
| committer | jeanpouget-abadie <jean.pougetabadie@gmail.com> | 2015-03-19 11:34:03 -0400 |
| commit | 4a68b488b10e6f32e154f3316eb0026b7fa33b10 (patch) | |
| tree | e28b9297e4a3dbc4c716748a42d744b6bd1b400d /results.tex | |
| parent | da6f6f069e0272d237c3d29e50c737d803fb9a3b (diff) | |
| download | learn-optimize-4a68b488b10e6f32e154f3316eb0026b7fa33b10.tar.gz | |
added subsection learning is harder than optimizing
Diffstat (limited to 'results.tex')
| -rw-r--r-- | results.tex | 221 |
1 files changed, 154 insertions, 67 deletions
diff --git a/results.tex b/results.tex
index 2c0873d..f9ca671 100644
--- a/results.tex
+++ b/results.tex
@@ -4,6 +4,7 @@
 \newtheorem{proposition}{Proposition}
 \newtheorem{corollary}{Corollary}
 \newtheorem{problem}{Problem}
+\newtheorem{theorem}{Theorem}
 
 \title{Learn and/or Optimize}
 \author{}
@@ -14,109 +15,165 @@
 \section{Preliminary Results}
 
-We cite the following result from \cite{vershynin2010introduction} (Remark 5.40):
+We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
 
-\begin{proposition}
-\label{prop:non_isotropic_isometry}
-Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix $\Sigma$. Then for every $t \geq 0$, the following inequality holds with probability at least: $1- 2 \exp(-ct^2)$:
-\begin{equation}
-\|\frac{1}{N} A^T A - \Sigma \| \leq \max(\delta, \delta^2) \text{ where } \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
-\end{equation}
-where $C$, $c$ depend only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
+\begin{proposition}\label{prop:non_isotropic_isometry}
+Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
+sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
+$\Sigma$. Then for every $t \geq 0$, the following inequality holds with
+probability at least $1 - 2 \exp(-ct^2)$: \begin{equation} \|\frac{1}{N} A^T A
+- \Sigma \| \leq \max(\delta, \delta^2) \ \text{where}\ \delta =
+C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}} \end{equation} where $C$, $c$ depend
+only on the sub-gaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
 \end{proposition}
 
-The following result is a simple corollary of Proposition~\ref{prop:non_isotropic_isometry}:
+The following result is a simple corollary of
+Proposition~\ref{prop:non_isotropic_isometry}:
 
-\begin{corollary}
-\label{cor:number_of_rows_needed}
-Let $n \in \mathbb{N}$ and $k \in ]0,n[$. Assume that $A$ is an $N \times n$
-matrix whose rows $A_i$ are independent Bernoulli variable vectors in
-$\mathbb{R}^n$ such that $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 -
-\mathbb{P}(A_{i,j} = 0)$ and let $\sigma \equiv \frac{k}{n}(1- \frac{k}{n})
-\neq 0$, then if $N > (C+1)^2 n/\sigma^2$, the matrix A has full row rank
-with probability at least $1 - e^{-cn}$, where $C, c$ are constant depending only on the subgaussian norm of the rows $K \equiv \sup_{p \geq 1} \frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)^\frac{1}{p} = k$\footnote{Is this true? And how do the constants behave?}
-\end{corollary}
+\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
+  $k \in ]0,n[$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
+  independent Bernoulli random vectors in $\mathbb{R}^n$ such that
+  $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let
+  $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. If $N > {(C+1)}^2
+  n/\sigma^2$, then the matrix $A$ has full column rank with probability at
+  least $1 - e^{-cn}$, where $C, c$ are constants depending only on the
+  sub-gaussian norm of the rows $K \equiv \sup_{p \geq 1}
+  \frac{1}{\sqrt{p}} {(\mathbb{E}|A_1|^p)}^{\frac{1}{p}} = k$\footnote{Is this
+  true? And how do the constants behave?} \end{corollary}
 
-\begin{proof}
-It is easy to compare the kernels of $A$ and $A^TA$ and notice that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it follows that if $A^TA$ is invertible, then $A$ has full row rank.
-In other words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ has full row rank.
-Consider Prop.~\ref{prop:non_isotropic_isometry} with $t \equiv \sqrt{n}$ and with $N > (C+1)^2 n$, then with probability $1 - 2\exp(-cn)$:
-\begin{equation}
-\|\frac{1}{N} A^T A - \sigma I \| \leq (C+1)\sqrt{\frac{n}{N}}
-\end{equation}
+\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice
+  that $\mathrm{rank}(A) = \mathrm{rank}(A^T A)$. Since $A^TA$ is an $n \times
+  n$ matrix, it follows that if $A^TA$ is invertible, then $A$ has full column
+  rank. In other words, if $A$ has smallest singular value $\sigma_{\min}(A) >
+  0$, then $A$ has full column rank. Consider
+  Prop.~\ref{prop:non_isotropic_isometry} with $t \equiv \sqrt{n}$ and $N >
+  {(C+1)}^{2} n$; then with probability $1 - 2\exp(-cn)$:
+  \begin{equation} \|\frac{1}{N} A^T A - \sigma I \| \leq
+  (C+1)\sqrt{\frac{n}{N}} \end{equation}
 
-It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2 -\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma - (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > (C+1)^2n/\sigma^2$, then $\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$
-\end{proof}
+It follows that for any unit vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2
+-\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma
+- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
+$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$. \end{proof}
 
-Note that this is clearly not optimal. For example, note that for $k=1$, the solution to the coupon collector problem states that only ${\cal O}(n\log n)$ rows are sufficient before the matrix $A$ has full row rank with high probability, and not ${\cal O}(n^2)$ as stated by Corollary~\ref{cor:number_of_rows_needed}.
+Note that this is clearly not optimal. For example, for $k=1$, the solution to
+the coupon collector problem states that only ${\cal O}(n\log n)$ rows are
+sufficient before the matrix $A$ has full column rank with high probability,
+and not ${\cal O}(n^2)$ as stated by Corollary~\ref{cor:number_of_rows_needed}.
 
 \section{Passive Optimization of Additive Functions}
 
-Let $\Omega$ be a universe of elements of size $n$. For $k \in [n]$, let $D_k$ be the uniform probability distribution over all sets of size $k$, and let $D_p$ be the product probability distribution over sets such that $\forall j \in \Omega, \mathbb{P}_{S \sim D_p}(j \in S) = p$. Note that $D_{k}$ and $D_{p}$ for $p \equiv \frac{k}{n}$ are very similar, and will be considered almost interchangeably throughout the following notes. Consider the following problem:
+Let $\Omega$ be a universe of elements of size $n$. For $k \in [n]$, let $D_k$
+be the uniform probability distribution over all sets of size $k$, and let
+$D_p$ be the product probability distribution over sets such that $\forall j
+\in \Omega, \mathbb{P}_{S \sim D_p}(j \in S) = p$. Note that $D_{k}$ and
+$D_{p}$ for $p \equiv \frac{k}{n}$ are very similar, and will be considered
+almost interchangeably throughout the following notes. Consider the following
+problem:
 
-\begin{problem}
-Let $f = (w_i)_{i \in \Omega}$ be an unknown additive function: $f(S) \equiv \sum_{u \in S} w_u$. How many (set, value) pairs $(S, f(S))$ where $S \sim D_k$ must be observed until we can optimize $f$ exactly or to a constant approximation factor under cardinality constraint $k$?
+\begin{problem} Let $f = {(w_i)}_{i \in \Omega}$ be an unknown additive function:
+  $f(S) \equiv \sum_{u \in S} w_u$. How many (set, value) pairs $(S, f(S))$,
+  where $S \sim D_k$, must be observed until we can optimize $f$ exactly or to a
+  constant approximation factor under cardinality constraint $k$?
 \end{problem}
 
-\subsection{Inverting the system}
-\label{subsec:invert_the_system}
+\subsection{Inverting the system}\label{subsec:invert_the_system}
 
-Note that for observations $(S_j, f(S_j))_{j \in N}$ we can write the system $A w = b$, where each row of $A_j$ corresponds to the indicator vector of the set $S_j$ and $b_j \equiv f(S_j)$. From Corollary~\ref{cor:number_of_rows_needed}, it is easy to see that $(C_K+1)^2 \frac{n^3}{k(n - k)}$ rows are sufficient for $A$ to have full row rank with probability $1-e^{-c_k n}$. If $A$ has full row rank, then it is easy to invert the system in polynomial time, and both learn and optimize $f$ for any cardinality constraint.
+Note that for observations ${(S_j, f(S_j))}_{j \in [N]}$ we can write the system
+$A w = b$, where row $A_j$ of $A$ is the indicator vector of the set $S_j$ and
+$b_j \equiv f(S_j)$. From Corollary~\ref{cor:number_of_rows_needed}, it is easy
+to see that ${(C_k+1)}^2 \frac{n^3}{k(n - k)}$ rows are sufficient for $A$ to
+have full column rank with probability $1-e^{-c_k n}$. If $A$ has full column
+rank, then it is easy to invert the system in polynomial time, and to both
+learn and optimize $f$ for any cardinality constraint.
 
-\begin{proposition}
-Assume that $N$ pairs of set-value $(S_j, f(S_j))_{j \in N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > (C_k +1)^2 \frac{n}{\sigma^2}$, then we can learn $f$ exactly, and therefore optimize it under any cardinality constraint, with probability $1-e^{-cn}$.
+\begin{proposition} Assume that $N$ (set, value) pairs ${(S_j, f(S_j))}_{j \in
+  [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
+  {(C_k +1)}^2 \frac{n}{\sigma^2}$, then we can learn $f$ exactly, and therefore
+  optimize it under any cardinality constraint, with probability $1-e^{-cn}$.
 \end{proposition}
 
+TODO: A much easier result to show is that an $n \times n$ matrix is singular
+with probability ${(1/\sqrt{2})}^n$; look at the determinant section of the
+random matrix theory pdf, and see whether, by taking the determinant mod 2, we
+can bound the probability that the determinant is even.
+
 \subsection{Average weight method}
 
-Another method is to compute for every node $i$ the \emph{average weight of every set containing element $i$}. Let $w_i$ be the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and $p \equiv \frac{k}{n}$, then
-$$\forall i \in \Omega, \mathbb{E}_{S \sim D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$
+Another method is to compute for every element $i$ the \emph{average weight of
+every set containing element $i$}. Let $w_i$ be the weight of element $i$, $w
+\equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and $p \equiv \frac{k}{n}$; then
+$$\forall i \in \Omega, \mathbb{E}_{S \sim D_p}\left[f(S) \mid i \in S\right] = pw
++ (1 -p)w_i,$$
+since, conditioned on $i \in S$, every other element of $\Omega$ belongs to $S$
+independently with probability $p$, so the conditional expectation equals
+$w_i + p(w - w_i)$.
 
-Note that the average weight of every set containing element $i$ preserves the ranking of the weights of the elements. For observations $(S_j, f(S_j))_{j \in N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator:
+Note that the average weight of every set containing element $i$ preserves the
+ranking of the weights of the elements.
+For observations ${(S_j, f(S_j))}_{j \in [N]}$ and for $N_i \equiv |\{S : i \in
+S\}|$, we define the following estimator:
 
-\begin{equation}
-\forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S | i \in S} \frac{f(S) - pw}{1-p}
-\end{equation}
+\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S |
+i \in S} \frac{f(S) - pw}{1-p} \end{equation}
 
-As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's lemma:
+As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
+can obtain a concentration bound of $w_i^{N_i}$ around $w_i$ using Hoeffding's
+inequality:
 
-\begin{equation}
-\mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq \epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
+\begin{equation} \mathbb{P}\left(\left|w_i^{N_i} - w_i \right| \geq
+\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
 \end{equation}
 
-\emph{TODO: multiplicative boudns are very bad for zero weights...Need to look at additive bounds for these zeros.}
+\emph{TODO: multiplicative bounds are very bad for zero weights\dots Need to
+look at additive bounds for these zeros.}
 
-For $N_i$ sufficiently large for each element $i$, we have $\forall i \in \Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this assumption, if we choose the $k$ elements with largest estimated weight $W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, where OPT is the value of the maximum weight set of $k$ elements for the function $f$. To ensure that $N_i$ is sufficiently large for each element, we note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a classic union bound:
+For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
+\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
+assumption, if we choose the $k$ elements with largest estimated weight
+$w_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
+where OPT is the value of the maximum weight set of $k$ elements for the
+function $f$. To ensure that $N_i$ is sufficiently large for each element, we
+note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
+classic union bound:
 
-\begin{equation}
-\mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq \frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq \frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}}
-\end{equation}
+\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
+\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
+\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}
 
-As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i \in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$
+As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
+\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$.
 
-\begin{proposition}
-Assume that $N$ pairs of set-value $(S_j, f(S_j))_{j \in N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > XXX$, then we can $\epsilon$-learn $f$ and optimize it to a $(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with probability $XXX$.
-\end{proposition}
+\begin{proposition} Assume that $N$ (set, value) pairs ${(S_j, f(S_j))}_{j \in
+  [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
+  XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
+  $(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
+  probability $XXX$. \end{proposition}
 
 \section{Passive Optimization of Submodular functions}
 
 \subsection{Inverting the system}
 
-A result by \cite{balcan2011learning} states that:
+A result by~\cite{balcan2011learning} states that:
 
-\begin{proposition}
-\label{prop:balcan_learned}
-Let ${\cal F}$ be the class of non-negative, monotone, submodular functions over $\Omega$. There is an algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$, by outputting a function $\tilde f: S\mapsto \sqrt{\frac{1}{(n+1)z}w^T \chi(S)}$.
-\end{proposition}
+\begin{proposition}\label{prop:balcan_learned} Let ${\cal F}$ be the class of
+  non-negative, monotone, submodular functions over $\Omega$. There is an
+  algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$,
+  by outputting a function $\tilde f: S \mapsto \sqrt{\frac{1}{(n+1)z}w^T
+  \chi(S)}$. \end{proposition}
 
-By adapting Section~\ref{subsec:invert_the_system}, we get the following result:
+By adapting Section~\ref{subsec:invert_the_system}, we get the following
+result:
 
-\begin{proposition}
-Any non-negative monotone submodular function can be passively optimized under cardinality constraint $k \in \mathbb{N}$ from $D_p$ for \emph{any} $p \in ]0, 1[$ with polynomially many samples up to a factor $1/\sqrt{n+1}$.
-\end{proposition}
+TODO: actually this is not entirely accurate! Though we might not care about
+not learning the zeros of the function, not being able to estimate correctly
+a fraction of the non-zero sets is a big problem.
 
-\begin{proof}
-We optimize $\tilde f$ from Proposition~\ref{prop:balcan_learned}, which can easily be done by solving for the system $Ax = b^2$ where $(b^2)_i = (b_i)^2$. TODO: complete proof
+\begin{proposition} Any non-negative monotone submodular function can be
+passively optimized under cardinality constraint $k \in \mathbb{N}$ from $D_p$
+for \emph{any} $p \in ]0, 1[$ with polynomially many samples, up to a factor
+$1/\sqrt{n+1}$. TODO: what about $\epsilon$? \end{proposition}
+
+\begin{proof} We optimize $\tilde f$ from
+Proposition~\ref{prop:balcan_learned}, which can easily be done by solving
+the system $Ax = b^2$, where ${(b^2)}_i = (b_i)^2$. TODO: complete proof
 \end{proof}
@@ -125,8 +182,38 @@ We optimize $\tilde f$ from Proposition~\ref{prop:balcan_learned}, which can eas
 
 Same thing can probably be said for the average weight method.
 
-\bibliography{optimize}
-\bibliographystyle{apalike}
+\subsection{Passive Optimization can be easier than Learning}
+
+We consider the matroid construction from~\cite{balcan2011learning}: it is easy
+to see that from this example we can construct a set of matroid rank functions
+which can never be learned (under any distribution, even with active queries!)
+but which are very easy to optimize exactly.
+We recall the matroid construction (Theorem 1 of~\cite{balcan2011learning}):
+
+\begin{theorem}
+  For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets
+  ${\cal A} \subseteq 2^{[n]}$ and a family of matroids ${\cal M} =
+  \{M_{\cal B} : {\cal B} \subseteq {\cal A}\}$ such that:
+  \begin{itemize}
+  \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$
+  \item For every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we
+  have:
+  \begin{align*}
+  \text{rank}_{M_{\cal B}}(A) & = 8 \log k \ \text{if } A \in {\cal B} \\
+  & = |A| \ \text{if } A \in {\cal A} \setminus {\cal B}
+  \end{align*}
+  \end{itemize}
+\end{theorem}
+
+By considering the above family of matroids with
+$|{\cal A} \setminus {\cal B}| \geq \frac{1}{n} |{\cal A}|$, we will observe at
+least one set from ${\cal A} \setminus {\cal B}$ with constant probability
+after polynomially many observations, as long as the distribution is uniform
+(this would also hold for a more general class of distributions ${\cal D}$),
+which means we can maximize exactly. Yet we cannot learn this family of
+matroid rank functions under any distribution.
+
+\bibliography{optimize} \bibliographystyle{apalike}
 
-\end{document}
\ No newline at end of file
+\end{document}
