\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm, bbm}
\usepackage[utf8x]{inputenc}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\ints}{\mathbb{N}}
\renewcommand{\O}{\mathcal{O}}


\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\newtheorem{claim}{Claim}
\newtheorem{remark}{Remark}

\title{Learn and/or Optimize}
\author{}
\date{}

\begin{document}
\maketitle

\section{Preliminary Results}
\label{sec:matrix_theory}

\subsection{Generic Random Matrix Theory}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):

\begin{proposition}\label{prop:non_isotropic_isometry} 
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
$\Sigma$. Then for every $t \geq 0$, the following inequality holds with
probability at least: $1- 2 \exp(-ct^2)$: \begin{equation} \|\frac{1}{N} A^T A
- \Sigma \| \leq \max(\delta, \delta^2) \ \text{where}\ \delta =
C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}} \end{equation} where $C$, $c$ depend
only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}

The following result is a simple corollary of
Proposition~\ref{prop:non_isotropic_isometry}:

\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
  $k \in ]0,n[$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
  independent Bernoulli variable vectors in $\mathbb{R}^n$ such that
  $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$ and let
  $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$, then if $N > {(C+1)}^2
  n/\sigma^2$, the matrix A has full row rank with probability at least $1 -
  e^{-cn}$, where $C, c$ are constant depending only on  the subgaussian norm
  of the rows $K \equiv \sup_{p \geq 1}
  {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$\footnote{Is this true?
  And how do the constants behave?} \end{corollary}

\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice
  that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it
  follows that if $A^TA$ is invertible, then $A$ has full row rank. In other
  words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$
  has full row rank.  Consider Prop.~\ref{prop:non_isotropic_isometry} with $t
  \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$, then with probability $1 -
  2\exp(-cn)$: \begin{equation} \|\frac{1}{N} A^T A - \sigma I \| \leq
    (C+1)\sqrt{\frac{n}{N}} \end{equation}

It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2
-\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma
- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ \end{proof}

\subsection{More Direct Approach}

Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
uniformly at random among all binary rows of length $n$. A standard counting
arguments shows that the number of non-singular matrices over $\mathbb{F}_2$
is:
\begin{displaymath}
    N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
    = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}

Hence the probability that our random matrix $A$ is invertible is:
\begin{displaymath}
    P_n  = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
It is easy to see that $P_n$ is a decreasing sequence. Its limit is
$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have
$\phi(\frac{1}{2})\approx
0.289$ and $P_n$ converges
exponentially fast to this constant (one way to see this is to use the
Pentagonal number theorem).

Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the
probability that they will have full column rank is at least:
\begin{displaymath}
    P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
\end{displaymath}

Note that this is the probability of having full column rank over
$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
column rank over $\mathbb{R}$.

\paragraph{TODO:} Study the case where we only observe sets of size exactly
$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above
by:
\begin{displaymath}
{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j}
\end{displaymath}

Thinking about it, why do we assume that the sample sets are of size exactly
$k$. I think it would make more sense from the learning perspective to consider
uniformly random sets. In this case, the above approach allows us to conclude
directly.

More generally, I think the “right” way to think about this is to look at $A$
as the adjacency matrix of a random $k$-regular graph. There are tons of
results about the probability of the adjacency matrix to be non singular.

\section{Examples}

\subsection{Generic Functions}

\paragraph{TODO:} Add citations!

These are generic examples and serve as building blocks for the applications
below:
\begin{itemize}
    \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave
        $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note
        that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not
        matter for the purpose of optimization: the sets are in the same order
        with or without $g$, the only things which matter are the weights which
        serve as natural parameters for this class of functions. This class of
        functions contains:
        \begin{itemize}
            \item additive functions (when $g$ is the identity function).
            \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the
                case where the weights are $0/1$ and $g$ is the identity
                function.
            \item symmetric submodular functions: when the weights are all one.
            \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for
                some $B$.
        \end{itemize}
    \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in
        \reals_+^{|N|}$. This class of functions is also naturally parametrized
        by the weights.
    \item (weighted) matroid rank functions. Given a matroid $M$ over a ground
        set $N$, we define its rank function to be:
        \begin{displaymath}
            \forall S\subseteq N,\;
            r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I|
        \end{displaymath}
        more generally, given a weight function $w:N\to\reals_+$, we define the
        weighted matroid rank function:
        \begin{displaymath}
            \forall S\subseteq N,\;
            r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i
        \end{displaymath}
\end{itemize}

\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid
rank function for the $1$-uniform matroid (the matroid where the independent
sets are the sets of size at most one).

\subsection{Applications}

\begin{itemize}
    \item \emph{Coverage functions:} they can be written as a positive linear
        combination of matroid rank functions:
        \begin{displaymath}
            f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S)
        \end{displaymath}
        where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset,
        \{u\}\big\}$.
    \item \emph{Facility location:} (cite Bilmes) there is a universe
        $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each
        pair of locations.  We pick a subset of locations $S$ and each point in
        the universe is allocated to its closest location (the one with highest
        proximity):
        \begin{displaymath}
            f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v}
        \end{displaymath}
        This can be seen as a sum of weighted matroid rank functions: one for
        each location in the universe associated with a $1$-uniform matroid
        (other applications: job scheduling).

    \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases)
        written as a graph cut function. For image segmentation the goal is to
        minimize the cut.
        \begin{displaymath}
            f(S) = \sum_{e\in E} w_e c_e(S)
        \end{displaymath}
        where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think
        this can be written as a matroid rank function.
    \item \emph{Learning} (cite Krause) there is
        a hypothesis $A$ (a random variable) which is “refined” when more
        observations are made.  Imagine that there is a finite set $X_1,\dots,
        X_n$ of possible observations (random variables). Then, assuming that
        the observations are independent conditioned on $A$, the information
        gain:
        \begin{displaymath}
            f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big)
        \end{displaymath}
        is submodular. The $\log\det$ is the specific case of a linear
        hypothesis observed with additional independent Gaussian noise.
    \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots,
        X_n)$ are random variables, then:
        $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots,
        X_n)$ are jointly multivariate gaussian, then:
        \begin{displaymath}
            f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T
        \end{displaymath}
        for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function.
    \item \emph{data subset selection/summarization:} in statistical machine
        translation, Bilmes used sum of concave over modular:
        \begin{displaymath}
            f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right)
        \end{displaymath}
        where each $f$ represents a feature, $w_f(e)$ represents how much of
        $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when
        we have a lot of a given feature.
        Facility location functions are also commonly used for subset selection.
    \item \emph{concave spectral functions} One would be tempted to say that
        any multivariate concave function of a modular function is submodular.
        This would be the natural generalization of concave over modular.
        However \textbf{this is not true in general}. However, a possible nice
        generalization is the following. Let $M$ be a symmetric $n\times
        n$ matrix, and $g$ is a ``matrix concave'' function. Then:
        \begin{displaymath}
            f(S) = \mathrm{Tr}\big(g(M_S)\big)
        \end{displaymath}
        is submodular. This contains the $\log\det$ (when $g$ is the matrix
        $\log$) but tons of other functions (like quantum entropy).
\end{itemize}
In summary, the two most general classes of submodular functions (which capture
all the examples known to man) are: sums of matrix concave functions and sums
of matroid rank functions. Sums of concave over modular are also nice if we
want to start with a simpler example.

\section{Passive Optimization}

Let $\Omega$ be the universe of elements and $f$ a function defined on subsets
of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a
collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let
$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by
$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often
(but not always) equal to $|\Omega|$.

In general, we say we can efficiently optimize a function $f$ under constraint
$K$ when we have a polynomial-time algorithm making adaptive value queries to
$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
with high probability and $\alpha$ an absolute constant.

Here, we consider the scenario where we cannot make adaptive value queries, and
in fact, where we cannot make queries at all! Instead, we suppose that we
observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken
from a known distribution $D$. We say we can efficiently \emph{passively
optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$
when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is
an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant.

In the case of \emph{passive} observations of set-value pairs under a
distribution $D$ for a function $f$, recent research has focused on whether we
can efficiently and approximately \emph{learn} $f$. This was formalized in the
PMAC model from \cite{balcan2011learning}. When thinking about passive
optimization, it is necessary to understand the link between being able to
 $D-PMAC$ learn $f$ and being able to $D-$optimize $f$.

\subsection{Additive function}

We consider here the simple case of additive functions. A function $f$ is
additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that
$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes
adopt the notation $w \equiv f(\Omega)$.

\subsubsection{Inverting the system}\label{subsec:invert_the_system}

Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{j \in N}$ with
$S_j \sim D$ where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$
where for all $i$, the row $A_i = \chi_{S_i}$, the indicator vector of set
$S_i$. Let $B$ be the $n^c \times 1$ vector such that $\forall i, b_j \equiv
f(S_j)$. It is easy to see that if $w$ is the $n \times 1$ corresponding weight
vector for $f$ then:
\begin{displaymath}
  A w = B
\end{displaymath}

Note that if $A$ has full column rank, then we can solve for $w$ exactly and
also optimize $f$ under any cardinality constraint. We can therefore cast the
question of $D-$learning and $D-$optimizing $f$ as a random matrix theory
problem: what is the probability that after $n^c$ for $c > 0$ independent
samples from $D$, the matrix $A$ will have full rank? See
section~\ref{sec:matrix_theory}

\paragraph{Extension}
Note that the previous reasoning can easily be extended to any \emph{almost}
additive function. Consider a function $g$ such that there exists $\alpha > 0$
and $\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S)
\leq g(S) \leq \beta f(S)$, then by solving for $\max_{S\in K} f(S)$ we have a
$\alpha/\beta$-approximation to the optimum of $g$ since:

\begin{displaymath}
g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
\frac{\alpha}{\beta} g(OPT)
\end{displaymath}

where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
considering a function $g$ such that there exists $\alpha, \beta >0$, an
additive function $f$ and a bijective univariate function $\phi$, such that
$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we
solve the system $A w = \phi^{-1}(B)$ and obtain once again an
$\alpha/\beta$-approximation to the optimum of $g$.

\begin{remark}
Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We
would like to understand whether being able to $D-$learn $f$ is really
necessary to $D-$optimizing it. In fact, many results for PMAC-learning more
complex functions, such as general submodular functions, are negative. Can we
hope to find positive results in cases where PMAC-learning is impossible?
\end{remark}

\subsubsection{Average weight method}

We consider here another method to $D-$optimizing $f$, which only requires
$D-$learning $f$ approximately. Note that for every element $i \in \Omega$, and
for a \emph{product} distribution $D$:
\begin{displaymath}
  \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin
  S) = \sum_{j \neq i \in \Omega} w_s \left(\mathbb{P}(j\in S|i \in S) -
  \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i
\end{displaymath}

Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and
let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S
\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator:
\begin{displaymath}
  \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|}
  \sum_{S \in O_i^c} f(S)
\end{displaymath}

If $D$ is a product distribution such that $ \exists c > 0, \forall i,
\mathbb{P}(i) \geq c$, it is
easy to show that $\forall i \in
\Omega,\ \hat W_i \rightarrow_{|O| \rightarrow +\infty} w_i$
By using standard concentration bounds, we can hope to obtain within $poly(n,
\frac{1}{\epsilon})$ observations:


%For every node $i$, we compute the \emph{average weight of every set containing
%element $i$}.  Let $w_i$ be
%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and
%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim
%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$

%Note that the average weight of every set containing element $i$ preserves the
%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in
%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator:

%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S |
%i \in S} \frac{f(S) - pw}{1-p} \end{equation}

%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's
%lemma:

%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle|  \geq
%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
%\end{equation}

%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look
%at additive bounds for these zeros.}

%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
%assumption, if we choose the $k$ elements with largest estimated weight
%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
%where OPT is the value of the maximum weight set of $k$ elements for the
%function $f$. To ensure that $N_i$ is sufficiently large for each element, we
%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
%classic union bound:

%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}

%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$

%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in
  %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
  %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
  %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
  %probability $XXX$.  \end{proposition}

\subsection{General submodular functions}

\subsection{Parametric submodular functions}

\textbf{Note:} all known examples of submodular functions are parametric. The
question is not parametric vs non-parametric but whether or not the number of
parameters is polynomial or exponential in $n$ (the size of the ground set).
For all examples I know except martroid rank functions, it seems to be
polynomial.

\subsubsection{Influence Functions}

Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we
can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$.
Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the
influence of the set $S \subseteq V$ in $G$ under the IC model with parameters
$p$.

Recall from \cite{Kempe:03} that:
\begin{equation}
    \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big]
\end{equation}
where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with
probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of
\emph{there exists a path from $S$ to $v$ in $G_p$}.

\begin{claim}
\label{cla:oracle}
If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v}
- \frac{1}{\alpha m}$, then:
\begin{displaymath}
    \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1
    - 1/\alpha) \sigma_{G}(S,p)
\end{displaymath}
\end{claim}

\begin{proof}
    We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each
    edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in
    [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff
    $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$).
    It is clear from the construction that each edge $(u,v)$ will be present in
    $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence:
    \begin{displaymath}
        \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big]
        \text{  and  }
        \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big]
    \end{displaymath}

    By construction $G_{p'}$ is always a subgraph of $G_p$, i.e
    $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand
    side of the Claim's inequality.

    The probability that an edge is present in $G_p$ but not in $G_p'$ is at
    most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha
    m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that:
    \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq
    \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big]
  \end{displaymath}
    which proves the right-hand side of the claim after observing that
    $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality
    iff $m=1$.
\end{proof}

 We can use Claim~\ref{cla:oracle} to find a constant factor approximation
 algorithm to maximising influence on $G$ by maximising influence on $G'$. For
 $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and
 $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous.


\begin{proposition}
\label{prop:approx_optim}
Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$
and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq  \frac{1}{\alpha m}$.
Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that
$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$
\end{proposition}

\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} -
  \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle}
  with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by
  greedy maximisation on $\hat G$. It is a classic exercise to show then that
  $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r).
\end{proof}

A small note on the approximation factor: it is only $>0$ for $\alpha > 2$.
Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq
\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible
polynomial-time approximation ratio to influence maximisation unless $P = NP$.
Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m
\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 -
\exp(-\exp(-2/\alpha))$.

\subsubsection{Active set selection of stationary Gaussian Processes}

\section{Passive Optimization vs. Passive Learning}

\subsection{Failed Attempt: returning max of observations}

This doesn't work. Give examples as to why! Remember that there are strong
concentration results for submodular functions -> look at expected value of
observed sets

\subsection{Example where optimization possible, learning impossible}

Recall the matroid construction from~\cite{balcan2011learning}:
\begin{theorem}
  For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
  \subseteq2^{[n]}$ and a family of matroids $\cal{M} = \{M_{\cal{B}} :
  \cal{B} \subseteq\cal{A} \}$ such that:
  \begin{itemize}
    \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in \cal{A}$
    \item For every $\cal{B} \subseteq\cal{A}$ and every $A \in \cal{A}$, we
      have:
      \begin{align*}
        \text{rank}_{M_{\cal{B}}}(A) & = 8 \log k \ if A\in{\cal B} \\
                                     & = |A| \ if A\in {\cal A}\backslash{\cal B}
      \end{align*}
  \end{itemize}
\end{theorem}

Consider the following subset of the above family of matroids: ${\cal
M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal
A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$.
Consider an \emph{unknown} function $f$, corresponding to the rank function of
one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$.  Note that as long
as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
optimize $f$ exactly! Indeed, the largest possible value for $f$ under
cardinality constraint $n^{1/3}$ is $\max_{A\in 2^{[n]}} |A| = n^{1/3}$.

One example of a distribution under which this occurs with probability at least
a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$.  For
$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$,
we will observe at least one element from $\cal{A}\backslash\cal{B}$ with
constant probability:

\begin{equation} \nonumber \mathbb{P}(\bigcup_{S_i} S_i \in {\cal
A}\backslash{\cal B}) \geq 1 - (1 - \epsilon)^{n^c} \approx \epsilon n^c
\end{equation}

However, it follows from the analysis of~\cite{balcan2011learning} that we
cannot learn $f$ under any distribution, even with active value queries!
Indeed, for any polynomially-sized set of observations, there exists a
super-polynomially number of functions in ${\cal M^1}$ which coincide on this
set of observations, but which take very different values outside of this set
of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal
A}\backslash {\cal B}$.

{\bf TODO:} A cleaner simpler example would be nice.

\section{Meeting notes: 04.03.2015}

Consider the following function:
\begin{displaymath}
    g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\}
\end{displaymath}
where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where
each element is included with probability $1/2$. Then with high probability all
the samples will have size roughly $n/2$, so you will observe $g(S)=|S|$ with
high probability.

\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you
will be correct with high probability is $S$ is drawn from the same
distribution as above.

\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you
never learn anything about $X$. The set you output will necessarily be random
with respect to $X$: $\sqrt{n}|X \cap S| \approx
n^{\frac{1}{3}}(\frac{\sqrt{n}}{n})\sqrt{n}$ and therefore $g(S) \approx
\sqrt{n}$ with high probability. However if we had been able to output$X$, we
would have $g(X) = n^{\frac{5}{6}}$.

\paragraph{Open Question:} Cook a similar example where $g$ is submodular and
where you are observing sets of the same size as your budget.

\paragraph{Positive results:} Try to obtain guarantees about learning
parameters of parametric submodular functions and show whether or not these
guarantees are sufficient for optimization. First look at learning weights in
a cover function. Maybe facility location? Sums of concave over modular are
probably too hard because of the connection to neural networks.

\paragraph{Notes}

Are you sure the example shouldn't hav $n^{1/3}$ rather than $\sqrt{n}$?
Otherwise a random set $S$ will be such that $\sqrt{n}|X \cap S|
\sqrt{n}\frac{\sqrt{n}}{2} = \frac{n}{2}$, which is roughly equal to $|S|$.

\section{Which learning guarantees imply optimization?}

Here, we consider the following question: which learning guarantees imply
optimization? For example, \cite{TODO} provides the following guarantee for
cover functions:

\begin{theorem}
There exists an algorithm such that for any $\epsilon>0$ given random and
uniform examples of a coverage function $c$ outputs a hypothesis, which is also
a coverage function $h$, such that with probability $2/3$: $\mathbb{E}_{\cal
U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n)
\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/epsilon)$
and examples, where $s = \min\{size(c), (1/\epsilon)^{\log(1/\epsilon)}\}$.
\end{theorem}

We would like to understand to what extent this $\ell1$-bound allows us to
optimize the coverage function $c$ under cardinality constraints using the
hypothesis $h$. Let us consider the simpler case of an additive function, and
suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) -
c(x)|]$ where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv \sum_i
\hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$?

As it turns out, yes. Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let
$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection
of good sets ${\cal G} \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}|
\geq \frac{3}{4}\cdot2^n$. Suppose the contrary, there is at least a quarter of
the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal
U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| >
\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element
$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection
of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$.  Notice that
$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal
S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal
S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that
$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$
and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$.

\section{Tricky coverage function example}

Let $U = \{x_1, x_2\}$ be a universe of size $2$ and $w : x \in U \mapsto
\mathbb{R}$ a weight function on $U$. Consider a family ${\cal A}$ of $n$ sets
$A_1, A_2, \dots A_n$ such that $A_1 \equiv \{x_1\}$, $A_2 \equiv \{x_2\}$ and
$\forall i \neq 1,2, A_i = \{x_1, x_2\}$. Let $k \in \mathbb{R}^+$ be any
positive constant and consider the family of coverage functions defined on $U$
and the family of sets ${\cal A}$:
$${\cal C} \equiv \{c : {\cal A} \mapsto \mathbb{R}^+ : \forall S \subset {\cal
A}, c(S) = w(\cup_{S_i \in S} S_i) \text{ and } w(U) = k \}$$

Note that any coverage function from this family of coverage functions is
constant almost everywhere, equal to $w(x_1)+w(x_2) = k$:

\begin{itemize}
\item $\forall S \neq A_1,
A_2, c(S) = k$
\item $c(\{A_1\}) = w(x_1)$, and $c(\{A_2\}) = w(x_2)$
\end{itemize}

\subsection{Easy to learn under any distribution}

TODO

\subsection{Hard to optimize under cardinality constraint with $K=1$}

Given any distribution ${\cal D}$ on $2^U$ such that
$\mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\}) = o(n^{-c})$ for any constant $c$,
i.e. $\exists f : n \mapsto \mathbb{R}^+ \text{ s.t. } \lim_{n\rightarrow
+\infty} f(n) = +\infty \text{ and } \mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\}) =
{\cal O}(n^{-f(n)})$, and for any constant $C \in \mathbb{R}$ and $m \equiv
n^C$, then for $n$ sufficiently large, the set $\hat S$ returned by any one-shot
optimization algorithm after observing random sets $S_1, S_2, \dots S_m$ from
${\cal D}$ is such that:

$$\mathbb{P}_{S_1, \dots S_m \sim {\cal D}}\left(\mathbb{P}_{\hat S}
\left(f(\hat S) > 0\right) \geq \frac{1}{n}\right) \leq \frac{n^C}{2f(n)}
\rightarrow 0 \text{ when } n \rightarrow +\infty$$

The previous proposition can be reformulated in the slightly weaker formulation
which is more intuitive:

$$\mathbb{P}_{S_1, \dots S_m \sim {\cal D}}\left(\mathbb{E}(f(\hat S)) \geq
\frac{1}{n}OPT \right) \geq 1 - \frac{n^C}{2f(n)} \rightarrow 1 \text{ when } n
\rightarrow +\infty$$

Note that the restriction on ${\cal D}$ is verified by many distributions. In
the case of the uniform distribution $ = \frac{1}{2^{n-1}}$ and in the case of
observing all sets of size $k>1$ uniformly at random: $=0$ This example is not
fully satisfying for two reasons:

\begin{itemize}
  \item the coverage function is not monotone.
  \item the sets we observe most do not satisfy the constraint.
\end{itemize}

Note that even if we observe sets which \emph{almost} satisfy the cardinality
constraint, such as all sets of size 2 and only such sets, we would still have
no hope to output a set with expected value within $\frac{1}{n}$ of OPT\@ under
cardinality constraint $K=1$. This example can be modified to address the second
issue.

\subsection{Fixing second issue}

We extend the previous example by slightly modifying ${\cal A}$

\bibliography{optimize} \bibliographystyle{apalike}


\end{document}