 results.tex | 150
 1 file changed, 84 insertions(+), 66 deletions(-)
diff --git a/results.tex b/results.tex
index bae1694..f90c72f 100644
--- a/results.tex
+++ b/results.tex
@@ -14,6 +14,7 @@
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\newtheorem{claim}{Claim}
+\newtheorem{remark}{Remark}
\title{Learn and/or Optimize}
\author{}
@@ -23,6 +24,7 @@
\maketitle
\section{Preliminary Results}
+\label{sec:matrix_theory}
\subsection{Generic Random Matrix Theory}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
@@ -140,9 +142,9 @@ an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
In the case of \emph{passive} observations of set-value pairs under a
distribution $D$ for a function $f$, recent research has focused on whether we
can efficiently and approximately \emph{learn} $f$. This was formalized in the
-{\cal PMAC} model from \cite{balcan2011learning}. When thinking about passive
+PMAC model from \cite{balcan2011learning}. When thinking about passive
optimization, it is necessary to understand the link between being able to
-learn $D-{\cal PMAC}$ $f$ and being able to $D-$optimize $f$.
+$D-$PMAC learn $f$ and being able to $D-$optimize $f$.
\subsection{Additive function}
@@ -153,88 +155,104 @@ adopt the notation $w \equiv f(\Omega)$.
\subsubsection{Inverting the system}\label{subsec:invert_the_system}
-Note that for observations ${(S_j, f(S_j))}_{j \in N}$ we can write the system
-$A w = b$, where each row of $A_j$ corresponds to the indicator vector of the
-set $S_j$ and $b_j \equiv f(S_j)$. From
-Corollary~\ref{cor:number_of_rows_needed}, it is easy to see that ${(C_K+1)}^2
-\frac{n^3}{k(n - k)}$ rows are sufficient for $A$ to have full row rank with
-probability $1-e^{-c_k n}$. If $A$ has full row rank, then it is easy to invert
-the system in polynomial time, and both learn and optimize $f$ for any
-cardinality constraint.
+Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{1 \leq j \leq n^c}$ with
+$S_j \sim D$, where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$ whose $j$-th
+row is $A_j = \chi_{S_j}$, the indicator vector of the set $S_j$, and let $B$ be the
+$n^c \times 1$ vector with $B_j \equiv f(S_j)$ for all $j$. It is easy to see
+that if $w$ is the corresponding $n \times 1$ weight vector of $f$, then:
+\begin{displaymath}
+ A w = B
+\end{displaymath}
-\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in
- N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
- {(C_k +1)}^2 \frac{n}{\sigma^2}$, then we can learn $f$ exactly, and therefore
- optimize it under any cardinality constraint, with probability $1-e^{-cn}$.
-\end{proposition}
+Note that if $A$ has full column rank, then we can solve for $w$ exactly and
+also optimize $f$ under any cardinality constraint. We can therefore cast the
+question of $D-$learning and $D-$optimizing $f$ as a random matrix theory
+problem: what is the probability that, after $n^c$ independent samples from
+$D$ for some $c > 0$, the matrix $A$ has full column rank? See
+Section~\ref{sec:matrix_theory}.
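+As a purely illustrative toy instance (the specific sets below are chosen only
+for this example), take $\Omega = \{1, 2, 3\}$ and suppose the observed sets are
+$S_1 = \{1, 2\}$, $S_2 = \{2, 3\}$ and $S_3 = \{1, 3\}$. Then
+\begin{displaymath}
+A = \left(\begin{array}{ccc} 1 & 1 & 0 \\ 0 & 1 & 1 \\ 1 & 0 & 1 \end{array}\right),
+\qquad
+B = \left(\begin{array}{c} f(S_1) \\ f(S_2) \\ f(S_3) \end{array}\right),
+\end{displaymath}
+and $A$ has full column rank, so $w = A^{-1} B$; for instance
+$w_1 = \frac{1}{2}\left(f(S_1) - f(S_2) + f(S_3)\right)$. If, on the contrary, no
+observed set contained element $3$, the third column of $A$ would be identically
+zero and $w_3$ could not be recovered.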
-\subsubsection{Average weight method}
+\paragraph{Extension}
+Note that the previous reasoning easily extends to any \emph{almost}
+additive function. Consider a function $g$ for which there exist $\alpha > 0$,
+$\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S)
+\leq g(S) \leq \beta f(S)$. Then, for $S^* \in \arg \max_{S \in K} f(S)$, we have an
+$\alpha/\beta$-approximation to the optimum of $g$ since:
-Another method is to compute for every node $i$ the \emph{average weight of
-every set containing element $i$}. Let $w_i$ be the weight of element $i$, $w
-\equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and $p \equiv \frac{k}{n}$, then
-$$\forall i \in \Omega, \mathbb{E}_{S \sim D_p}\left[f(S)| i \in S\right] = pw
-+ (1 -p)w_i$$
-
-Note that the average weight of every set containing element $i$ preserves the
-ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in
-N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator:
+\begin{displaymath}
+g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
+\frac{\alpha}{\beta} g(OPT)
+\end{displaymath}
-\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S |
-i \in S} \frac{f(S) - pw}{1-p} \end{equation}
+where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
+considering a function $g$ for which there exist $\alpha, \beta > 0$, an
+additive function $f$ and a bijective univariate function $\phi$ such that
+$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we solve
+the system $A w = \phi^{-1}(B)$ (applying $\phi^{-1}$ entrywise) and once again obtain
+an $\alpha/\beta$-approximation to the optimum of $g$.
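+To spell out this last step, under the additional assumption (not made explicit
+above) that $\phi$ is increasing, so that maximizing $f$ also maximizes
+$\phi \circ f$, the same chain of inequalities gives:
+\begin{displaymath}
+g(S^*) \geq \alpha \phi(f(S^*)) \geq \alpha \phi(f(OPT)) \geq
+\frac{\alpha}{\beta} g(OPT),
+\end{displaymath}
+where again $S^* \in \arg \max_{S \in K} f(S)$ and $OPT \in \arg \max_{S \in K} g(S)$.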
-As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
-can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's
-lemma:
+\begin{remark}
+Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We
+would like to understand whether being able to $D-$learn $f$ is really
+necessary for $D-$optimizing it. In fact, many results for PMAC-learning more
+complex functions, such as general submodular functions, are negative. Can we
+hope to find positive results in cases where PMAC-learning is impossible?
+\end{remark}
-\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq
-\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
-\end{equation}
+\subsubsection{Average weight method}
-\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look
-at additive bounds for these zeros.}
+We consider here another method for $D-$optimizing $f$, which only requires
+$D-$learning $f$ approximately. Note that for every element $i \in \Omega$, and
+for a \emph{product} distribution $D$:
+\begin{displaymath}
+ \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin
+ S) = w_i
+\end{displaymath}
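+Indeed, writing $p_j$ for the marginal probability that element $j$ is included
+under the product distribution $D$ (notation introduced here only for this
+computation), additivity gives
+\begin{displaymath}
+\mathbb{E}_{S \sim D}(f(S) | i \in S) = w_i + \sum_{j \neq i} p_j w_j
+\qquad \mbox{and} \qquad
+\mathbb{E}_{S \sim D}(f(S) | i \notin S) = \sum_{j \neq i} p_j w_j,
+\end{displaymath}
+since under a product distribution conditioning on the membership of $i$ does not
+affect the other elements; the difference is exactly $w_i$.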
-For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
-\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
-assumption, if we choose the $k$ elements with largest estimated weight
-$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
-where OPT is the value of the maximum weight set of $k$ elements for the
-function $f$. To ensure that $N_i$ is sufficiently large for each element, we
-note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
-classic union bound:
+%For every node
+%$i$, we compute the \emph{average weight of every set containing element $i$}. Let $w_i$ be
+%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and
+%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim
+%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$
-\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
-\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
-\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}
+%Note that the average weight of every set containing element $i$ preserves the
+%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in
+%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator:
-As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
-\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$
+%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S |
+%i \in S} \frac{f(S) - pw}{1-p} \end{equation}
-\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in
- N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
- XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
- $(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
- probability $XXX$. \end{proposition}
+%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
+%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's
+%lemma:
-\subsection{General submodular functions}
+%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq
+%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
+%\end{equation}
-\subsubsection{Inverting the system}
+%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look
+%at additive bounds for these zeros.}
-A result by~\cite{balcan2011learning} states that:
+%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
+%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
+%assumption, if we choose the $k$ elements with largest estimated weight
+%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
+%where OPT is the value of the maximum weight set of $k$ elements for the
+%function $f$. To ensure that $N_i$ is sufficiently large for each element, we
+%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
+%classic union bound:
-\begin{proposition}\label{prop:balcan_learned} Let ${\cal F}$ be the class of
- non-negative, monotone, submodular functions over $\Omega$. There is an
- algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$,
- by outputting a function $\tilde f: S\mapsto \sqrt{\frac{1}{(n+1)z}w^T
- \chi(S)}$. \end{proposition}
+%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
+%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
+%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}
-{\bf TODO}: Does this imply a good guarantee if we solve the system
-$Ax=f^{-1}(b)$? Probably not, though we might not care about not learning the
-zeros of the function perfectly, not being able to estimate correctly a
-fraction of the non-zero sets is a big problem
+%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
+%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$
-\subsubsection{Average weight method}
+%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in
+ %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
+ %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
+ %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
+ %probability $XXX$. \end{proposition}
+\subsection{General submodular functions}
\subsection{Parametric submodular functions}