\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\title{Learn and/or Optimize}
\author{}
\date{}
\begin{document}
\maketitle

\section{Preliminary Results}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
\begin{proposition}\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix $\Sigma$. Then for every $t \geq 0$, the following inequality holds with probability at least $1 - 2 \exp(-ct^2)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2) \quad \text{where} \quad \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}},
\end{equation}
and $C$, $c$ depend only on the sub-gaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}

The following result is a simple corollary of Proposition~\ref{prop:non_isotropic_isometry}:
\begin{corollary}\label{cor:number_of_rows_needed}
Let $n \in \mathbb{N}$ and $k \in ]0,n[$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent random vectors in $\mathbb{R}^n$ with i.i.d.\ Bernoulli entries such that $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. If $N > {(C+1)}^2 n/\sigma^2$, then $A$ has full column rank (rank $n$) with probability at least $1 - e^{-cn}$, where $C, c$ are constants depending only on the sub-gaussian norm of the rows $K \equiv \sup_{p \geq 1} \frac{1}{\sqrt{p}}{(\mathbb{E}|A_1|^p)}^{\frac{1}{p}} = k$.\footnote{Is this true? And how do the constants behave?}
\end{corollary}
\begin{proof}
Comparing the kernels of $A$ and $A^TA$ shows that $\operatorname{rank}(A) = \operatorname{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it follows that if $A^TA$ is invertible, then $A$ has full column rank. In other words, if the smallest singular value satisfies $\sigma_{\min}(A) > 0$, then $A$ has rank $n$.

The second moment matrix of the rows is $\Sigma = \sigma I + p^2 \mathbf{1}\mathbf{1}^T \succeq \sigma I$, where $p \equiv \frac{k}{n}$. Apply Proposition~\ref{prop:non_isotropic_isometry} with $t \equiv \sqrt{n}$; since $N > {(C+1)}^{2} n/\sigma^2 \geq {(C+1)}^{2} n$, we have $\delta = (C+1)\sqrt{n/N} < 1$, hence $\max(\delta, \delta^2) = \delta$, and with probability at least $1 - 2\exp(-cn)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \Sigma \right\| \leq (C+1)\sqrt{\frac{n}{N}}
\end{equation}
It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2 - x^T \Sigma x \right| \leq (C+1)\sqrt{\frac{n}{N}}$, and since $x^T \Sigma x \geq \sigma$, we get $\|A x\|^2_2 \geq N\left(\sigma - (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2 n/\sigma^2$, the right-hand side is positive, so $\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ (absorbing the factor $2$ into the constant $c$).
\end{proof}

Note that this is clearly not optimal. For example, for $k=1$, the solution to the coupon collector problem shows that only ${\cal O}(n\log n)$ rows are needed before the matrix $A$ has full column rank with high probability, and not ${\cal O}(n^3)$ as given by Corollary~\ref{cor:number_of_rows_needed}.

\section{Passive Optimization of Additive Functions}
Let $\Omega$ be a universe of elements of size $n$. For $k \in [n]$, let $D_k$ be the uniform probability distribution over all sets of size $k$, and let $D_p$ be the product probability distribution over sets such that $\forall j \in \Omega, \mathbb{P}_{S \sim D_p}(j \in S) = p$. Note that $D_{k}$ and $D_{p}$ for $p \equiv \frac{k}{n}$ are very similar, and will be treated almost interchangeably throughout these notes.
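For concreteness, here is a minimal sketch (in Python, with names of our choosing) of how a sample is drawn from each of the two distributions; for $p = k/n$, a draw from $D_p$ has expected size $k$, which is one way to see why the two behave similarly.
\begin{verbatim}
import random

def sample_D_k(n, k):
    """One draw from D_k: a uniformly random subset of {0, ..., n-1} of size exactly k."""
    return frozenset(random.sample(range(n), k))

def sample_D_p(n, p):
    """One draw from D_p: each element is included independently with probability p."""
    return frozenset(j for j in range(n) if random.random() < p)
\end{verbatim}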
Consider the following problem:
\begin{problem}
Let $f$ be an unknown additive function with weights ${(w_i)}_{i \in \Omega}$, i.e.\ $f(S) \equiv \sum_{u \in S} w_u$. How many (set, value) pairs $(S, f(S))$ with $S \sim D_k$ must be observed before we can optimize $f$ exactly, or to a constant approximation factor, under cardinality constraint $k$?
\end{problem}

\subsection{Inverting the system}\label{subsec:invert_the_system}
Note that for observations ${(S_j, f(S_j))}_{j \in [N]}$ we can write the system $A w = b$, where row $A_j$ of $A$ is the indicator vector of the set $S_j$ and $b_j \equiv f(S_j)$. From Corollary~\ref{cor:number_of_rows_needed}, ${(C_K+1)}^2 \frac{n^5}{k^2{(n - k)}^2}$ rows are sufficient for $A$ to have full column rank with probability $1-e^{-c_K n}$. If $A$ has full column rank, then the system can be solved in polynomial time, and we can both learn and optimize $f$ for any cardinality constraint.
\begin{proposition}
Assume that $N$ set--value pairs ${(S_j, f(S_j))}_{j \in [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > {(C_K +1)}^2 \frac{n}{\sigma^2}$, with $\sigma$ as in Corollary~\ref{cor:number_of_rows_needed}, then we can learn $f$ exactly, and therefore optimize it under any cardinality constraint, with probability $1-e^{-cn}$.
\end{proposition}
\emph{TODO: A much easier result to show is that a random $n \times n$ matrix is singular with probability ${(1/\sqrt{2})}^n$; see the determinant discussion in the random matrix theory notes, and check whether taking the determinant mod $2$ lets us bound the probability that the determinant is even.}

\subsection{Average weight method}
Another method is to compute, for every element $i$, the \emph{average value of the observed sets containing $i$}. Let $w_i$ be the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and $p \equiv \frac{k}{n}$; then
$$\forall i \in \Omega, \quad \mathbb{E}_{S \sim D_p}\left[f(S) \mid i \in S\right] = pw + (1 -p)w_i.$$
This conditional expectation is an increasing affine function of $w_i$, so it preserves the ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in [N]}$ and $N_i \equiv |\{j : i \in S_j\}|$, we define the following estimator:
\begin{equation}
\forall i \in \Omega, \quad w_i^{N_i} \equiv \frac{1}{N_i}\sum_{j : i \in S_j} \frac{f(S_j) - pw}{1-p}
\end{equation}
As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We can obtain a concentration bound of $w_i^{N_i}$ around $w_i$ using Hoeffding's inequality:
\begin{equation}
\mathbb{P}\left(\left|w_i^{N_i} - w_i \right| \geq \epsilon w_i \right) \leq 2e^{-2{(1-p)}^2 N_i\frac{\epsilon^2 w_i^2}{w^2}}
\end{equation}
\emph{TODO: multiplicative bounds are very bad for zero weights\dots Need to look at additive bounds for these zeros.}

If $N_i$ is sufficiently large for each element $i$, then with high probability $\forall i \in \Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this assumption, if we choose the $k$ elements with largest estimated weight $w_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, where OPT is the value of the maximum weight set of $k$ elements for the function $f$.
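To illustrate the estimator above, here is a minimal Python sketch (names are ours). It assumes the total weight $w = f(\Omega)$ is known, as in the display above; alternatively, $w$ could itself be estimated from the samples, since $\mathbb{E}_{S \sim D_p}[f(S)] = pw$.
\begin{verbatim}
from collections import defaultdict
import heapq

def estimate_weights(observations, n, p, total_weight):
    """Average-weight estimator: for each element i, average
    (f(S) - p*w) / (1 - p) over the observed sets S containing i."""
    sums = defaultdict(float)
    counts = defaultdict(int)
    for S, value in observations:
        for i in S:
            sums[i] += (value - p * total_weight) / (1.0 - p)
            counts[i] += 1
    return [sums[i] / counts[i] if counts[i] > 0 else 0.0 for i in range(n)]

def top_k_elements(estimates, k):
    """Pick the k elements with largest estimated weight."""
    return heapq.nlargest(k, range(len(estimates)), key=lambda i: estimates[i])
\end{verbatim}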
To ensure that $N_i$ is sufficiently large for each element, we note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a classic union bound:
\begin{equation}
\mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq \frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq \frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}}
\end{equation}
As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i \in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$.
\begin{proposition}
Assume that $N$ set--value pairs ${(S_j, f(S_j))}_{j \in [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > XXX$, then we can $\epsilon$-learn $f$ and optimize it to a $\frac{1-\epsilon}{1+\epsilon}$ approximation factor for any cardinality constraint, with probability $XXX$.
\end{proposition}

\section{Passive Optimization of Submodular Functions}
\subsection{Inverting the system}
A result by~\cite{balcan2011learning} states that:
\begin{proposition}\label{prop:balcan_learned}
Let ${\cal F}$ be the class of non-negative, monotone, submodular functions over $\Omega$. There is an algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$, by outputting a function $\tilde f: S\mapsto \sqrt{\frac{1}{(n+1)z}w^T \chi(S)}$, where $\chi(S)$ denotes the indicator vector of $S$.
\end{proposition}
By adapting Section~\ref{subsec:invert_the_system}, we get the following result. \emph{TODO: actually this is not entirely accurate! Though we might not care about not learning the zeros of the function, not being able to estimate correctly a fraction of the non-zero sets is a big problem.}
\begin{proposition}
Any non-negative monotone submodular function can be passively optimized under cardinality constraint $k \in \mathbb{N}$ from $D_p$ for \emph{any} $p \in ]0, 1[$ with polynomially many samples, up to a factor $1/\sqrt{n+1}$. \emph{TODO: what about $\epsilon$?}
\end{proposition}
\begin{proof}
We optimize $\tilde f$ from Proposition~\ref{prop:balcan_learned}, which can easily be done by solving the system $Ax = b^2$, where ${(b^2)}_j \equiv {(b_j)}^2$. \emph{TODO: complete proof.}
\end{proof}

\subsection{Average weight method}
The same can probably be said for the average weight method.

\subsection{Passive optimization can be easier than learning}
We consider the matroid construction from~\cite{balcan2011learning}: from this example we can construct a set of matroid rank functions which can never be learned (under any distribution, even with active queries!) but which are easy to optimize exactly. We recall the matroid construction (Theorem 1 in~\cite{balcan2011learning}):
\begin{theorem}
For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A} \subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{{\cal B}} : {\cal B} \subseteq {\cal A} \}$ such that:
\begin{itemize}
\item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$;
\item for every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we have:
\begin{equation*}
\text{rank}_{M_{{\cal B}}}(A) = \begin{cases} 8 \log k & \text{if } A\in{\cal B}, \\ |A| & \text{if } A\in {\cal A}\setminus{\cal B}. \end{cases}
\end{equation*}
\end{itemize}
\end{theorem}
Consider the above family of matroids with $|{\cal A}\setminus{\cal B}| \geq \frac{1}{n}|{\cal A}|$. Then we will observe at least one set from ${\cal A}\setminus{\cal B}$ with constant probability after polynomially many observations, as long as the distribution is uniform (this would also hold for a more general class of distributions ${\cal D}$), which means we can maximize exactly.
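To make the maximization step concrete, here is a minimal Python sketch (with names of our choosing, not part of the construction). Since the rank of a set never exceeds its cardinality, any observed $A \in {\cal A}$ whose value equals $|A| = n^{1/3}$ is independent in $M_{{\cal B}}$ and is therefore an exact maximizer under cardinality constraint $n^{1/3}$.
\begin{verbatim}
def exact_maximizer_from_samples(observations):
    """Return an observed set whose value equals its cardinality; such a set
    is independent in the matroid, so it maximizes the rank under cardinality
    constraint |A|.

    observations: iterable of (A, value) pairs, with A a frozenset drawn from
    the family cal(A) and value = rank_{M_B}(A)."""
    for A, value in observations:
        if value == len(A):   # rank equals cardinality: A is independent
            return A          # exact maximizer under cardinality |A|
    return None               # no witness observed (small-probability event)
\end{verbatim}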
Yet we cannot learn the family of matroid rank functions under any distribution. \bibliography{optimize} \bibliographystyle{apalike} \end{document}