\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm}
\usepackage[utf8x]{inputenc}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\title{Learn and/or Optimize}
\author{}
\date{}
\begin{document}
\maketitle

\section{Preliminary Results}
\subsection{Generic Random Matrix Theory}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
\begin{proposition}\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix $\Sigma$. Then for every $t \geq 0$, the following inequality holds with probability at least $1- 2 \exp(-ct^2)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2) \ \text{where}\ \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
\end{equation}
where $C$, $c$ depend only on the sub-gaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}
The following result is a simple corollary of Proposition~\ref{prop:non_isotropic_isometry}:
\begin{corollary}\label{cor:number_of_rows_needed}
Let $n \in \mathbb{N}$ and $k \in \{1, \dots, n-1\}$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent random vectors in $\{0,1\}^n$ with i.i.d.\ Bernoulli entries such that $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let $\sigma \equiv \frac{k}{n}\left(1- \frac{k}{n}\right) \neq 0$. If $N > {(C+1)}^2 n/\sigma^2$, then the matrix $A$ has full column rank with probability at least $1 - e^{-cn}$, where $C, c$ are constants depending only on the sub-gaussian norm of the rows $K \equiv \sup_{p \geq 1} {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$.\footnote{Is this true? And how do the constants behave?}
\end{corollary}
\begin{proof}
Comparing the kernels of $A$ and $A^TA$ shows that $\text{rank}(A) = \text{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it follows that if $A^TA$ is invertible, then $A$ has full column rank. In other words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ has full column rank.
Consider Prop.~\ref{prop:non_isotropic_isometry} with $t \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$ (so that $\delta = (C+1)\sqrt{n/N} \leq 1$ and $\max(\delta, \delta^2) = \delta$). Then, with probability at least $1 - 2\exp(-cn)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \Sigma \right\| \leq (C+1)\sqrt{\frac{n}{N}}
\end{equation}
where $\Sigma = \sigma I + \frac{k^2}{n^2}\mathbf{1}\mathbf{1}^T$ is the second moment matrix of the rows, whose smallest eigenvalue is $\sigma$. It follows that for every unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2 - x^T \Sigma x\right| \leq (C+1)\sqrt{\frac{n}{N}}$, and since $x^T \Sigma x \geq \sigma$, we get $\|A x\|^2_2 \geq N\left(\sigma - (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, the right-hand side is positive, hence $\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ (adjusting the constant $c$).
\end{proof}

Note that this is clearly not optimal. For example, for $k=1$, the coupon collector problem shows that only ${\cal O}(n\log n)$ rows are sufficient before the matrix $A$ has full column rank with high probability, and not the ${\cal O}(n^3)$ rows given by Corollary~\ref{cor:number_of_rows_needed}.

\paragraph{Note:} $\Theta(n\log n)$ holds in expectation for the coupon collector problem. If one wants an exponentially small probability of failure, then I think results on the coupon collector problem also imply that a quadratic number of rows is required.
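\paragraph{Numerical check (sketch):} As a quick sanity check of Corollary~\ref{cor:number_of_rows_needed} and of the coupon collector comparison above, the following Python/NumPy snippet estimates, for given $n$, $k$ and $N$, how often an $N \times n$ matrix with i.i.d.\ Bernoulli$(k/n)$ entries has full column rank over $\mathbb{R}$. This is only an illustrative sketch (the function name, the sample sizes, and the use of NumPy's \texttt{matrix\_rank} are our choices); it is not part of the argument above.
\begin{verbatim}
# Sketch: estimate how often an N x n matrix with i.i.d. Bernoulli(k/n)
# entries has full column rank, to compare against the n/sigma^2 bound
# of Corollary 1 and the coupon-collector heuristic for k = 1.
import numpy as np

def full_column_rank_frequency(n, k, N, trials=200, seed=0):
    rng = np.random.default_rng(seed)
    p = k / n
    hits = 0
    for _ in range(trials):
        A = (rng.random((N, n)) < p).astype(float)  # i.i.d. Bernoulli(k/n)
        if np.linalg.matrix_rank(A) == n:           # full column rank over R
            hits += 1
    return hits / trials

if __name__ == "__main__":
    n, k = 50, 5
    for N in (n, 2 * n, int(4 * n * np.log(n))):
        print(N, full_column_rank_frequency(n, k, N))
\end{verbatim}
Comparing the empirical threshold in $N$ with the ${(C+1)}^2 n/\sigma^2$ bound gives a sense of how loose Corollary~\ref{cor:number_of_rows_needed} is.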
\subsection{More Direct Approach}
Here we work over $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen uniformly at random among all binary rows of length $n$.
A standard counting argument shows that the number of non-singular $n \times n$ matrices over $\mathbb{F}_2$ is:
\begin{displaymath}
N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1}) = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
Hence the probability that our random $n \times n$ matrix is invertible over $\mathbb{F}_2$ is:
\begin{displaymath}
P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
It is easy to see that $P_n$ is a decreasing sequence. Its limit is $\phi(\frac{1}{2})$, where $\phi$ is the Euler function. We have $\phi(\frac{1}{2})\approx 0.289$, and $P_n$ converges exponentially fast to this constant (one way to see this is to use the pentagonal number theorem). Hence, if we observe $\ell\cdot n$ uniformly random binary rows, we can group them into $\ell$ disjoint blocks of $n$ rows; each block is an $n \times n$ uniformly random binary matrix, and is therefore invertible over $\mathbb{F}_2$ with probability $P_n \geq \phi(\frac{1}{2})$. The $\ell n \times n$ matrix of observed rows has full column rank as soon as one block is invertible, hence with probability at least:
\begin{displaymath}
1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
\end{displaymath}
Note that this is the probability of having full column rank over $\mathbb{F}_2$. A standard linear algebra argument shows that this implies full column rank over $\mathbb{R}$: some $n \times n$ submatrix has non-zero determinant over $\mathbb{F}_2$, i.e., an odd integer determinant, which is in particular non-zero over $\mathbb{R}$.
\paragraph{TODO:} Study the case where we only observe sets of size exactly $k$, or at most $k$. This amounts to replacing $2^n$ in the computation above by:
\begin{displaymath}
{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j}
\end{displaymath}
Thinking about it, why do we assume that the sample sets are of size exactly $k$? I think it would make more sense from the learning perspective to consider uniformly random sets. In this case, the above approach allows us to conclude directly. More generally, I think the ``right'' way to think about this is to look at $A$ as the adjacency matrix of a random $k$-regular graph. There are tons of results about the probability that the adjacency matrix is non-singular.

\section{Passive Optimization of Additive Functions}
Let $\Omega$ be a universe of $n$ elements. For $k \in [n]$, let $D_k$ be the uniform probability distribution over all sets of size $k$, and let $D_p$ be the product probability distribution over sets such that $\forall j \in \Omega,\ \mathbb{P}_{S \sim D_p}(j \in S) = p$. Note that $D_{k}$ and $D_{p}$ for $p \equiv \frac{k}{n}$ are very similar, and will be used almost interchangeably throughout these notes.

\textbf{TODO:} Make the argument about independently choosing elements with probability $k/n$ versus choosing a set of size exactly $k$ rigorous. The ``more direct'' approach above should allow us to consider the case of sets of size $k$ directly.

Consider the following problem:
\begin{problem}
Let $f = (w_i)_{i \in \Omega}$ be an unknown additive function: $f(S) \equiv \sum_{u \in S} w_u$. How many (set, value) pairs $(S, f(S))$, where $S \sim D_k$, must be observed until we can optimize $f$ exactly, or to a constant approximation factor, under the cardinality constraint $k$?
\end{problem}

\subsection{Inverting the system}\label{subsec:invert_the_system}
Note that for observations ${(S_j, f(S_j))}_{j \in [N]}$ we can write the system $A w = b$, where row $A_j$ of $A$ is the indicator vector of the set $S_j$ and $b_j \equiv f(S_j)$. From Corollary~\ref{cor:number_of_rows_needed}, ${(C+1)}^2 \frac{n}{\sigma^2}$ rows, with $\sigma \equiv \frac{k}{n}(1 - \frac{k}{n})$, are sufficient for $A$ to have full column rank with probability $1-e^{-cn}$. If $A$ has full column rank, then it is easy to solve the system in polynomial time, and thus to both learn and optimize $f$ for any cardinality constraint.
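\paragraph{Illustrative sketch:} The following Python/NumPy snippet is a minimal sketch of this approach (the helper \texttt{recover\_weights}, the sampling from $D_p$ in the usage example, and the use of least squares are our own illustration choices, not prescribed above): it builds $A$ and $b$ from observed pairs, recovers $w$ when $A$ has full column rank, and then optimizes under the cardinality constraint $k$ by keeping the $k$ largest recovered weights.
\begin{verbatim}
# Sketch: learn an additive f from (set, value) observations by solving
# A w = b, then optimize under cardinality constraint k via the top-k weights.
import numpy as np

def recover_weights(sets, values, n):
    """sets: list of index arrays S_j; values: the f(S_j); returns w or None."""
    A = np.zeros((len(sets), n))
    for j, S in enumerate(sets):
        A[j, list(S)] = 1.0               # row A_j = indicator vector of S_j
    if np.linalg.matrix_rank(A) < n:
        return None                       # the system does not determine w
    w, *_ = np.linalg.lstsq(A, np.asarray(values, dtype=float), rcond=None)
    return w                              # consistent system: exact solution

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n, k, N = 30, 5, 400
    w_true = rng.random(n)
    sets = [np.flatnonzero(rng.random(n) < k / n) for _ in range(N)]  # S_j ~ D_p
    values = [w_true[S].sum() for S in sets]
    w_hat = recover_weights(sets, values, n)
    if w_hat is not None:
        top_k = np.argsort(w_hat)[-k:]    # optimal size-k set for additive f
        print(sorted(top_k.tolist()), np.allclose(w_hat, w_true))
\end{verbatim}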
\begin{proposition}
Assume that $N$ (set, value) pairs ${(S_j, f(S_j))}_{j \in [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > {(C+1)}^2 \frac{n}{\sigma^2}$, with $\sigma$ as in Corollary~\ref{cor:number_of_rows_needed}, then we can learn $f$ exactly, and therefore optimize it under any cardinality constraint, with probability $1-e^{-cn}$.
\end{proposition}

\subsection{Average weight method}
Another method is to compute, for every element $i$, the \emph{average value of the observed sets containing element $i$}. Let $w_i$ be the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and $p \equiv \frac{k}{n}$; then
$$\forall i \in \Omega,\quad \mathbb{E}_{S \sim D_p}\left[f(S)\,\middle|\, i \in S\right] = pw + (1 -p)w_i$$
Since $1 - p > 0$, this conditional expectation is increasing in $w_i$ and therefore preserves the ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in [N]}$ and for $N_i \equiv |\{j : i \in S_j\}|$, we define the following estimator:
\begin{equation}
\forall i \in \Omega,\quad w_i^{N_i} \equiv \frac{1}{N_i}\sum_{j \,:\, i \in S_j} \frac{f(S_j) - pw}{1-p}
\end{equation}
Each term has conditional expectation $w_i$, so $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We can obtain a concentration bound of $w_i^{N_i}$ around $w_i$ using Hoeffding's inequality:
\begin{equation}
\mathbb{P}\left(\left|w_i^{N_i} - w_i \right| \geq \epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
\end{equation}
\emph{TODO: multiplicative bounds are very bad for zero weights\dots\ Need to look at additive bounds for these zeros.}

For $N_i$ sufficiently large for each element $i$, we have $\forall i \in \Omega,\ (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this assumption, if we choose the $k$ elements with largest estimated weight $w_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, where OPT is the value of the maximum weight set of $k$ elements for the function $f$. To ensure that $N_i$ is sufficiently large for each element, we note that $\mathbb{E}[N_i] = pN$ and use a Chernoff bound coupled with a classic union bound:
\begin{equation}
\mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq \frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq \frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}}
\end{equation}
As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i \in \Omega,\ N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$.

\begin{proposition}
Assume that $N$ (set, value) pairs ${(S_j, f(S_j))}_{j \in [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > XXX$, then we can $\epsilon$-learn $f$ and optimize it to a $(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with probability $XXX$.
\end{proposition}

\section{Passive Optimization of Submodular Functions}
\subsection{Inverting the system}
A result by~\cite{balcan2011learning} states that:
\begin{proposition}\label{prop:balcan_learned}
Let ${\cal F}$ be the class of non-negative, monotone, submodular functions over $\Omega$. There is an algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$, by outputting a function $\tilde f: S\mapsto \sqrt{\frac{1}{(n+1)z}w^T \chi(S)}$.
\end{proposition}
{\bf TODO}: Does this imply a good guarantee if we solve the system $Ax=f^{-1}(b)$? Probably not: while we might not care about not learning the zeros of the function perfectly, not being able to correctly estimate a fraction of the non-zero sets is a big problem.

\subsection{Average weight method}
\section{Passive Optimization vs. Passive Learning}
\subsection{Returning the max of observations}
This doesn't work. Give examples as to why! Remember that there are strong concentration results for submodular functions $\Rightarrow$ look at the expected value of observed sets.

\subsection{Passive Optimization can be easier than Passive Learning}
Recall the matroid construction from~\cite{balcan2011learning}:
\begin{theorem}
For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A} \subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{\cal B} : {\cal B} \subseteq {\cal A} \}$ such that:
\begin{itemize}
\item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$;
\item for every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we have:
\begin{align*}
\text{rank}_{M_{\cal B}}(A) =
\begin{cases}
8 \log k & \text{if } A\in{\cal B} \\
|A| & \text{if } A\in {\cal A}\backslash{\cal B}
\end{cases}
\end{align*}
\end{itemize}
\end{theorem}
Consider the following subset of the above family of matroids: ${\cal M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq {\cal A} \wedge |{\cal A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$, and consider an \emph{unknown} function $f$ corresponding to the rank function of one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can optimize $f$ exactly! Indeed, the largest possible value of $f$ under the cardinality constraint $n^{1/3}$ is at most $n^{1/3}$, since $f(S) = \text{rank}_{M_{\cal B}}(S) \leq |S|$, and any observed set $A \in {\cal A}\backslash{\cal B}$ attains this value: $f(A) = |A| = n^{1/3}$. One example of a distribution under which this occurs with at least constant probability is $D_u$, the uniform distribution over the sets of ${\cal A}$. For $c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ with $S_i \sim D_u$, we observe at least one set from ${\cal A}\backslash{\cal B}$ with probability:
\begin{equation} \nonumber
\mathbb{P}\left(\exists i : S_i \in {\cal A}\backslash{\cal B}\right) \geq 1 - (1 - \epsilon)^{n^c} \geq 1 - e^{-\epsilon n^c}
\end{equation}
However, it follows from the analysis of~\cite{balcan2011learning} that we cannot learn $f$ under any distribution, even with active value queries! Indeed, for any polynomially-sized set of observations, there exists a super-polynomial number of functions in ${\cal M}^{\epsilon}$ which coincide on this set of observations, but which take very different values outside of it: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal A}\backslash {\cal B}$.

{\bf TODO:} A cleaner, simpler example would be nice.

\bibliography{optimize}
\bibliographystyle{apalike}
\end{document}