From d8a68c6917f5b6053117e0145f6d4d80a8bec26b Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Tue, 14 Apr 2015 15:20:19 -0400 Subject: Starting paper draft --- brainstorm/optimize.bib | 15 + brainstorm/results.tex | 616 ++++++++++++++++++++++++++++++++++++++++ optimize.bib | 15 - paper/main.tex | 42 +++ paper/optimize.bib | 15 + paper/sections/introduction.tex | 27 ++ paper/sections/negative.tex | 113 ++++++++ paper/sections/positive.tex | 365 ++++++++++++++++++++++++ results.tex | 616 ---------------------------------------- 9 files changed, 1193 insertions(+), 631 deletions(-) create mode 100644 brainstorm/optimize.bib create mode 100644 brainstorm/results.tex delete mode 100644 optimize.bib create mode 100644 paper/main.tex create mode 100644 paper/optimize.bib create mode 100644 paper/sections/introduction.tex create mode 100644 paper/sections/negative.tex create mode 100644 paper/sections/positive.tex delete mode 100644 results.tex diff --git a/brainstorm/optimize.bib b/brainstorm/optimize.bib new file mode 100644 index 0000000..3fb5309 --- /dev/null +++ b/brainstorm/optimize.bib @@ -0,0 +1,15 @@ +@article{vershynin2010introduction, + title={Introduction to the non-asymptotic analysis of random matrices}, + author={Vershynin, Roman}, + journal={arXiv preprint arXiv:1011.3027}, + year={2010} +} + +@inproceedings{balcan2011learning, + title={Learning submodular functions}, + author={Balcan, Maria-Florina and Harvey, Nicholas JA}, + booktitle={Proceedings of the forty-third annual ACM symposium on Theory of computing}, + pages={793--802}, + year={2011}, + organization={ACM} +} \ No newline at end of file diff --git a/brainstorm/results.tex b/brainstorm/results.tex new file mode 100644 index 0000000..4f2a70d --- /dev/null +++ b/brainstorm/results.tex @@ -0,0 +1,616 @@ +\documentclass[10pt]{article} +\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} +\usepackage[utf8x]{inputenc} + +\DeclareMathOperator{\E}{\mathbb{E}} +\let\P\relax +\DeclareMathOperator{\P}{\mathbb{P}} +\newcommand{\ex}[1]{\E\left[#1\right]} +\newcommand{\prob}[1]{\P\left[#1\right]} +\newcommand{\reals}{\mathbb{R}} +\newcommand{\ints}{\mathbb{N}} +\renewcommand{\O}{\mathcal{O}} + + +\newtheorem{proposition}{Proposition} +\newtheorem{corollary}{Corollary} +\newtheorem{problem}{Problem} +\newtheorem{theorem}{Theorem} +\newtheorem{claim}{Claim} +\newtheorem{remark}{Remark} + +\title{Learn and/or Optimize} +\author{} +\date{} + +\begin{document} +\maketitle + +\section{Preliminary Results} +\label{sec:matrix_theory} + +\subsection{Generic Random Matrix Theory} +We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40): + +\begin{proposition}\label{prop:non_isotropic_isometry} +Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent +sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix +$\Sigma$. Then for every $t \geq 0$, the following inequality holds with +probability at least: $1- 2 \exp(-ct^2)$: \begin{equation} \|\frac{1}{N} A^T A +- \Sigma \| \leq \max(\delta, \delta^2) \ \text{where}\ \delta = +C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}} \end{equation} where $C$, $c$ depend +only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$. +\end{proposition} + +The following result is a simple corollary of +Proposition~\ref{prop:non_isotropic_isometry}: + +\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and + $k \in ]0,n[$. 
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are + independent Bernoulli variable vectors in $\mathbb{R}^n$ such that + $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$ and let + $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$, then if $N > {(C+1)}^2 + n/\sigma^2$, the matrix A has full row rank with probability at least $1 - + e^{-cn}$, where $C, c$ are constant depending only on the subgaussian norm + of the rows $K \equiv \sup_{p \geq 1} + {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$\footnote{Is this true? + And how do the constants behave?} \end{corollary} + +\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice + that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it + follows that if $A^TA$ is invertible, then $A$ has full row rank. In other + words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ + has full row rank. Consider Prop.~\ref{prop:non_isotropic_isometry} with $t + \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$, then with probability $1 - + 2\exp(-cn)$: \begin{equation} \|\frac{1}{N} A^T A - \sigma I \| \leq + (C+1)\sqrt{\frac{n}{N}} \end{equation} + +It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2 +-\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma +- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then +$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ \end{proof} + +\subsection{More Direct Approach} + +Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as +a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen +uniformly at random among all binary rows of length $n$. A standard counting +arguments shows that the number of non-singular matrices over $\mathbb{F}_2$ +is: +\begin{displaymath} + N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1}) + = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right) +\end{displaymath} + +Hence the probability that our random matrix $A$ is invertible is: +\begin{displaymath} + P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right) +\end{displaymath} +It is easy to see that $P_n$ is a decreasing sequence. Its limit is +$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have +$\phi(\frac{1}{2})\approx +0.289$ and $P_n$ converges +exponentially fast to this constant (one way to see this is to use the +Pentagonal number theorem). + +Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the +probability that they will have full column rank is at least: +\begin{displaymath} + P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell} +\end{displaymath} + +Note that this is the probability of having full column rank over +$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full +column rank over $\mathbb{R}$. + +\paragraph{TODO:} Study the case where we only observe sets of size exactly +$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above +by: +\begin{displaymath} +{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j} +\end{displaymath} + +Thinking about it, why do we assume that the sample sets are of size exactly +$k$. I think it would make more sense from the learning perspective to consider +uniformly random sets. In this case, the above approach allows us to conclude +directly. + +More generally, I think the “right” way to think about this is to look at $A$ +as the adjacency matrix of a random $k$-regular graph. 
There are tons of +results about the probability of the adjacency matrix to be non singular. + +\section{Examples} + +\subsection{Generic Functions} + +\paragraph{TODO:} Add citations! + +These are generic examples and serve as building blocks for the applications +below: +\begin{itemize} + \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave + $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note + that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not + matter for the purpose of optimization: the sets are in the same order + with or without $g$, the only things which matter are the weights which + serve as natural parameters for this class of functions. This class of + functions contains: + \begin{itemize} + \item additive functions (when $g$ is the identity function). + \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the + case where the weights are $0/1$ and $g$ is the identity + function. + \item symmetric submodular functions: when the weights are all one. + \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for + some $B$. + \end{itemize} + \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in + \reals_+^{|N|}$. This class of functions is also naturally parametrized + by the weights. + \item (weighted) matroid rank functions. Given a matroid $M$ over a ground + set $N$, we define its rank function to be: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I| + \end{displaymath} + more generally, given a weight function $w:N\to\reals_+$, we define the + weighted matroid rank function: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i + \end{displaymath} +\end{itemize} + +\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid +rank function for the $1$-uniform matroid (the matroid where the independent +sets are the sets of size at most one). + +\subsection{Applications} + +\begin{itemize} + \item \emph{Coverage functions:} they can be written as a positive linear + combination of matroid rank functions: + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S) + \end{displaymath} + where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset, + \{u\}\big\}$. + \item \emph{Facility location:} (cite Bilmes) there is a universe + $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each + pair of locations. We pick a subset of locations $S$ and each point in + the universe is allocated to its closest location (the one with highest + proximity): + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v} + \end{displaymath} + This can be seen as a sum of weighted matroid rank functions: one for + each location in the universe associated with a $1$-uniform matroid + (other applications: job scheduling). + + \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases) + written as a graph cut function. For image segmentation the goal is to + minimize the cut. + \begin{displaymath} + f(S) = \sum_{e\in E} w_e c_e(S) + \end{displaymath} + where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think + this can be written as a matroid rank function. + \item \emph{Learning} (cite Krause) there is + a hypothesis $A$ (a random variable) which is “refined” when more + observations are made. Imagine that there is a finite set $X_1,\dots, + X_n$ of possible observations (random variables). 
Then, assuming that + the observations are independent conditioned on $A$, the information + gain: + \begin{displaymath} + f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big) + \end{displaymath} + is submodular. The $\log\det$ is the specific case of a linear + hypothesis observed with additional independent Gaussian noise. + \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots, + X_n)$ are random variables, then: + $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots, + X_n)$ are jointly multivariate gaussian, then: + \begin{displaymath} + f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T + \end{displaymath} + for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function. + \item \emph{data subset selection/summarization:} in statistical machine + translation, Bilmes used sum of concave over modular: + \begin{displaymath} + f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right) + \end{displaymath} + where each $f$ represents a feature, $w_f(e)$ represents how much of + $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when + we have a lot of a given feature. + Facility location functions are also commonly used for subset selection. + \item \emph{concave spectral functions} One would be tempted to say that + any multivariate concave function of a modular function is submodular. + This would be the natural generalization of concave over modular. + However \textbf{this is not true in general}. However, a possible nice + generalization is the following. Let $M$ be a symmetric $n\times + n$ matrix, and $g$ is a ``matrix concave'' function. Then: + \begin{displaymath} + f(S) = \mathrm{Tr}\big(g(M_S)\big) + \end{displaymath} + is submodular. This contains the $\log\det$ (when $g$ is the matrix + $\log$) but tons of other functions (like quantum entropy). +\end{itemize} +In summary, the two most general classes of submodular functions (which capture +all the examples known to man) are: sums of matrix concave functions and sums +of matroid rank functions. Sums of concave over modular are also nice if we +want to start with a simpler example. + +\section{Passive Optimization} + +Let $\Omega$ be the universe of elements and $f$ a function defined on subsets +of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a +collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let +$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by +$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often +(but not always) equal to $|\Omega|$. + +In general, we say we can efficiently optimize a function $f$ under constraint +$K$ when we have a polynomial-time algorithm making adaptive value queries to +$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$ +with high probability and $\alpha$ an absolute constant. + +Here, we consider the scenario where we cannot make adaptive value queries, and +in fact, where we cannot make queries at all! Instead, we suppose that we +observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken +from a known distribution $D$. We say we can efficiently \emph{passively +optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$ +when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is +an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S) +\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant. 
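+
+For concreteness, a canonical instantiation of this definition is the
+cardinality constraint $K = \{S \subseteq \Omega : |S| \leq k\}$ together with
+a product distribution $D$, e.g.\ one including each element independently
+with probability $k/n$; this is the setting underlying several of the examples
+below.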
+ +In the case of \emph{passive} observations of set-value pairs under a +distribution $D$ for a function $f$, recent research has focused on whether we +can efficiently and approximately \emph{learn} $f$. This was formalized in the +PMAC model from \cite{balcan2011learning}. When thinking about passive +optimization, it is necessary to understand the link between being able to + $D-PMAC$ learn $f$ and being able to $D-$optimize $f$. + +\subsection{Additive function} + +We consider here the simple case of additive functions. A function $f$ is +additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that +$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes +adopt the notation $w \equiv f(\Omega)$. + +\subsubsection{Inverting the system}\label{subsec:invert_the_system} + +Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{j \in N}$ with +$S_j \sim D$ where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$ +where for all $i$, the row $A_i = \chi_{S_i}$, the indicator vector of set +$S_i$. Let $B$ be the $n^c \times 1$ vector such that $\forall i, b_j \equiv +f(S_j)$. It is easy to see that if $w$ is the $n \times 1$ corresponding weight +vector for $f$ then: +\begin{displaymath} + A w = B +\end{displaymath} + +Note that if $A$ has full column rank, then we can solve for $w$ exactly and +also optimize $f$ under any cardinality constraint. We can therefore cast the +question of $D-$learning and $D-$optimizing $f$ as a random matrix theory +problem: what is the probability that after $n^c$ for $c > 0$ independent +samples from $D$, the matrix $A$ will have full rank? See +section~\ref{sec:matrix_theory} + +\paragraph{Extension} +Note that the previous reasoning can easily be extended to any \emph{almost} +additive function. Consider a function $g$ such that there exists $\alpha > 0$ +and $\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S) +\leq g(S) \leq \beta f(S)$, then by solving for $\max_{S\in K} f(S)$ we have a +$\alpha/\beta$-approximation to the optimum of $g$ since: + +\begin{displaymath} +g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq +\frac{\alpha}{\beta} g(OPT) +\end{displaymath} + +where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by +considering a function $g$ such that there exists $\alpha, \beta >0$, an +additive function $f$ and a bijective univariate function $\phi$, such that +$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we +solve the system $A w = \phi^{-1}(B)$ and obtain once again an +$\alpha/\beta$-approximation to the optimum of $g$. + +\begin{remark} +Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We +would like to understand whether being able to $D-$learn $f$ is really +necessary to $D-$optimizing it. In fact, many results for PMAC-learning more +complex functions, such as general submodular functions, are negative. Can we +hope to find positive results in cases where PMAC-learning is impossible? +\end{remark} + +\subsubsection{Average weight method} + +We consider here another method to $D-$optimizing $f$, which only requires +$D-$learning $f$ approximately. 
Note that for every element $i \in \Omega$, and +for a \emph{product} distribution $D$: +\begin{displaymath} + \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin + S) = \sum_{j \neq i \in \Omega} w_s \left(\mathbb{P}(j\in S|i \in S) - + \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i +\end{displaymath} + +Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and +let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S +\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator: +\begin{displaymath} + \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|} + \sum_{S \in O_i^c} f(S) +\end{displaymath} + +If $D$ is a product distribution such that $ \exists c > 0, \forall i, +\mathbb{P}(i) \geq c$, it is +easy to show that $\forall i \in +\Omega,\ \hat W_i \rightarrow_{|O| \rightarrow +\infty} w_i$ +By using standard concentration bounds, we can hope to obtain within $poly(n, +\frac{1}{\epsilon})$ observations: + + +%For every node $i$, we compute the \emph{average weight of every set containing +%element $i$}. Let $w_i$ be +%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and +%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim +%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$ + +%Note that the average weight of every set containing element $i$ preserves the +%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in +%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator: + +%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S | +%i \in S} \frac{f(S) - pw}{1-p} \end{equation} + +%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We +%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's +%lemma: + +%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq +%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}} +%\end{equation} + +%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look +%at additive bounds for these zeros.} + +%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in +%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this +%assumption, if we choose the $k$ elements with largest estimated weight +%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, +%where OPT is the value of the maximum weight set of $k$ elements for the +%function $f$. To ensure that $N_i$ is sufficiently large for each element, we +%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a +%classic union bound: + +%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq +%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq +%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation} + +%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i +%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$ + +%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in + %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > + %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a + %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with + %probability $XXX$. 
\end{proposition} + +\subsection{General submodular functions} + +\subsection{Parametric submodular functions} + +\textbf{Note:} all known examples of submodular functions are parametric. The +question is not parametric vs non-parametric but whether or not the number of +parameters is polynomial or exponential in $n$ (the size of the ground set). +For all examples I know except martroid rank functions, it seems to be +polynomial. + +\subsubsection{Influence Functions} + +Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we +can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$. +Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the +influence of the set $S \subseteq V$ in $G$ under the IC model with parameters +$p$. + +Recall from \cite{Kempe:03} that: +\begin{equation} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] +\end{equation} +where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with +probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of +\emph{there exists a path from $S$ to $v$ in $G_p$}. + +\begin{claim} +\label{cla:oracle} +If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v} +- \frac{1}{\alpha m}$, then: +\begin{displaymath} + \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1 + - 1/\alpha) \sigma_{G}(S,p) +\end{displaymath} +\end{claim} + +\begin{proof} + We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each + edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in + [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff + $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$). + It is clear from the construction that each edge $(u,v)$ will be present in + $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence: + \begin{displaymath} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] + \text{ and } + \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big] + \end{displaymath} + + By construction $G_{p'}$ is always a subgraph of $G_p$, i.e + $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand + side of the Claim's inequality. + + The probability that an edge is present in $G_p$ but not in $G_p'$ is at + most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha + m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that: + \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq + \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big] + \end{displaymath} + which proves the right-hand side of the claim after observing that + $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality + iff $m=1$. +\end{proof} + + We can use Claim~\ref{cla:oracle} to find a constant factor approximation + algorithm to maximising influence on $G$ by maximising influence on $G'$. For + $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and + $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous. + + +\begin{proposition} +\label{prop:approx_optim} +Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$ +and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$. 
+Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that +$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$ +\end{proposition} + +\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} - + \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle} + with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by + greedy maximisation on $\hat G$. It is a classic exercise to show then that + $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r). +\end{proof} + +A small note on the approximation factor: it is only $>0$ for $\alpha > 2$. +Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq +\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible +polynomial-time approximation ratio to influence maximisation unless $P = NP$. +Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m +\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 - +\exp(-\exp(-2/\alpha))$. + +\subsubsection{Active set selection of stationary Gaussian Processes} + +\section{Passive Optimization vs. Passive Learning} + +\subsection{Failed Attempt: returning max of observations} + +This doesn't work. Give examples as to why! Remember that there are strong +concentration results for submodular functions -> look at expected value of +observed sets + +\subsection{Example where optimization possible, learning impossible} + +Recall the matroid construction from~\cite{balcan2011learning}: +\begin{theorem} + For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A} + \subseteq2^{[n]}$ and a family of matroids $\cal{M} = \{M_{\cal{B}} : + \cal{B} \subseteq\cal{A} \}$ such that: + \begin{itemize} + \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in \cal{A}$ + \item For every $\cal{B} \subseteq\cal{A}$ and every $A \in \cal{A}$, we + have: + \begin{align*} + \text{rank}_{M_{\cal{B}}}(A) & = 8 \log k \ if A\in{\cal B} \\ + & = |A| \ if A\in {\cal A}\backslash{\cal B} + \end{align*} + \end{itemize} +\end{theorem} + +Consider the following subset of the above family of matroids: ${\cal +M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal +A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$. +Consider an \emph{unknown} function $f$, corresponding to the rank function of +one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long +as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can +optimize $f$ exactly! Indeed, the largest possible value for $f$ under +cardinality constraint $n^{1/3}$ is $\max_{A\in 2^{[n]}} |A| = n^{1/3}$. + +One example of a distribution under which this occurs with probability at least +a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$. For +$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$, +we will observe at least one element from $\cal{A}\backslash\cal{B}$ with +constant probability: + +\begin{equation} \nonumber \mathbb{P}(\bigcup_{S_i} S_i \in {\cal +A}\backslash{\cal B}) \geq 1 - (1 - \epsilon)^{n^c} \approx \epsilon n^c +\end{equation} + +However, it follows from the analysis of~\cite{balcan2011learning} that we +cannot learn $f$ under any distribution, even with active value queries! 
+Indeed, for any polynomially-sized set of observations, there exists a +super-polynomially number of functions in ${\cal M^1}$ which coincide on this +set of observations, but which take very different values outside of this set +of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal +A}\backslash {\cal B}$. + +{\bf TODO:} A cleaner simpler example would be nice. + +\section{Meeting notes: 04.03.2015} + +Consider the following function: +\begin{displaymath} + g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\} +\end{displaymath} +where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where +each element is included with probability $1/2$. Then with high probability all +the samples will have size roughly $n/2$, so you will observe $g(S)=|S|$ with +high probability. + +\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you +will be correct with high probability is $S$ is drawn from the same +distribution as above. + +\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you +never learn anything about $X$. + +\paragraph{Open Question:} Cook a similar example where $g$ is submodular and +where you are observing sets of the same size as your budget. + +\paragraph{Positive results:} Try to obtain guarantees about learning +parameters of parametric submodular functions and show whether or not these +guarantees are sufficient for optimization. First look at learning weights in +a cover function. Maybe facility location? Sums of concave over modular are +probably too hard because of the connection to neural networks. + +\section{Which learning guarantees imply optimization?} + +Here, we consider the following question: which learning guarantees imply +optimization? For example, \cite{TODO} provides the following guarantee for +cover functions: + +\begin{theorem} +There exists an algorithm such that for any $\epsilon>0$ given random and +uniform examples of a coverage function $c$ outputs a hypothesis, which is also +a coverage function $h$, such that with probability $2/3$: $\mathbb{E}_{\cal +U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n) +\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/epsilon)$ +and examples, where $s = \min\{size(c), (1/\epsilon)^{\log(1/\epsilon)}\}$. +\end{theorem} + +We would like to understand to what extent this $\ell1$-bound allows us to +optimize the coverage function $c$ under cardinality constraints using the +hypothesis $h$. Let us consider the simpler case of an additive function, and +suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) - +c(x)|]$ where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv \sum_i +\hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$? + +As it turns out, yes. Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let +$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection +of good sets ${\cal G} \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}| +\geq \frac{3}{4}\cdot2^n$. Suppose the contrary, there is at least a quarter of +the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal +U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| > +\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element +$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection +of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. 
Notice that +$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal +S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal +S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that +$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$ +and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$. + +\bibliography{optimize} \bibliographystyle{apalike} + + +\end{document} diff --git a/optimize.bib b/optimize.bib deleted file mode 100644 index 3fb5309..0000000 --- a/optimize.bib +++ /dev/null @@ -1,15 +0,0 @@ -@article{vershynin2010introduction, - title={Introduction to the non-asymptotic analysis of random matrices}, - author={Vershynin, Roman}, - journal={arXiv preprint arXiv:1011.3027}, - year={2010} -} - -@inproceedings{balcan2011learning, - title={Learning submodular functions}, - author={Balcan, Maria-Florina and Harvey, Nicholas JA}, - booktitle={Proceedings of the forty-third annual ACM symposium on Theory of computing}, - pages={793--802}, - year={2011}, - organization={ACM} -} \ No newline at end of file diff --git a/paper/main.tex b/paper/main.tex new file mode 100644 index 0000000..58b6b22 --- /dev/null +++ b/paper/main.tex @@ -0,0 +1,42 @@ +\documentclass[10pt]{article} +\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} +\usepackage[utf8x]{inputenc} + +\DeclareMathOperator{\E}{\mathbb{E}} +\let\P\relax +\DeclareMathOperator{\P}{\mathbb{P}} +\newcommand{\ex}[1]{\E\left[#1\right]} +\newcommand{\prob}[1]{\P\left[#1\right]} +\newcommand{\reals}{\mathbb{R}} +\newcommand{\ints}{\mathbb{N}} +\renewcommand{\O}{\mathcal{O}} + + +\newtheorem{proposition}{Proposition} +\newtheorem{corollary}{Corollary} +\newtheorem{problem}{Problem} +\newtheorem{theorem}{Theorem} +\newtheorem{claim}{Claim} +\newtheorem{remark}{Remark} + +\title{Optimizing Submodular Functions in the Face of Uncertainty} +\author{} +\date{} + +\begin{document} +\maketitle + +\section{Introduction} +\input{sections/introduction.tex} + +\section{Learning vs. Optimizing} +\input{sections/negative.tex} + +\section{Positive results} +\input{sections/positive.tex} + +\bibliographystyle{apalike} +\bibliography{optimize} + + +\end{document} diff --git a/paper/optimize.bib b/paper/optimize.bib new file mode 100644 index 0000000..3fb5309 --- /dev/null +++ b/paper/optimize.bib @@ -0,0 +1,15 @@ +@article{vershynin2010introduction, + title={Introduction to the non-asymptotic analysis of random matrices}, + author={Vershynin, Roman}, + journal={arXiv preprint arXiv:1011.3027}, + year={2010} +} + +@inproceedings{balcan2011learning, + title={Learning submodular functions}, + author={Balcan, Maria-Florina and Harvey, Nicholas JA}, + booktitle={Proceedings of the forty-third annual ACM symposium on Theory of computing}, + pages={793--802}, + year={2011}, + organization={ACM} +} \ No newline at end of file diff --git a/paper/sections/introduction.tex b/paper/sections/introduction.tex new file mode 100644 index 0000000..bcb406f --- /dev/null +++ b/paper/sections/introduction.tex @@ -0,0 +1,27 @@ +Let $\Omega$ be the universe of elements and $f$ a function defined on subsets +of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a +collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let +$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by +$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often +(but not always) equal to $|\Omega|$. 
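+
+As a concrete running example (used repeatedly in the following sections), one
+may take $K$ to be a cardinality constraint,
+\begin{displaymath}
+    K = \{S \subseteq \Omega : |S| \leq k\},
+\end{displaymath}
+in which case $S^*$ is simply a set of maximum value among the sets of size at
+most $k$.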
+
+In general, we say we can efficiently optimize a function $f$ under constraint
+$K$ when we have a polynomial-time algorithm making adaptive value queries to
+$f$, which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
+with high probability, for an absolute constant $\alpha$.
+
+Here, we consider the scenario where we cannot make adaptive value queries, and
+in fact, where we cannot make queries at all! Instead, we suppose that we
+observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is drawn
+from a known distribution $D$. We say we can efficiently \emph{passively
+optimize} $f$ under distribution $D$, or $D$-optimize $f$ under constraint $K$,
+when, after observing ${\cal O}(L^c)$ set-value pairs from $D$, where $c > 0$ is
+an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
+\geq \alpha f(S^*)$ with high probability, for an absolute constant $\alpha$.
+
+In the case of \emph{passive} observations of set-value pairs under a
+distribution $D$ for a function $f$, recent research has focused on whether we
+can efficiently and approximately \emph{learn} $f$. This was formalized in the
+PMAC model of \cite{balcan2011learning}. When thinking about passive
+optimization, it is necessary to understand the link between being able to
+$D$-PMAC learn $f$ and being able to $D$-optimize $f$.
diff --git a/paper/sections/negative.tex b/paper/sections/negative.tex
new file mode 100644
index 0000000..f88117a
--- /dev/null
+++ b/paper/sections/negative.tex
@@ -0,0 +1,113 @@
+\subsection{Learnable $\nRightarrow$ Optimizable}
+
+Consider the following function:
+\begin{displaymath}
+    g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\}
+\end{displaymath}
+where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where
+each element is included independently with probability $1/2$. Then with high
+probability all the samples have size roughly $n/2$, so you will observe
+$g(S) = (1+o(1))\,|S|$ with high probability (a back-of-the-envelope
+calculation is given at the end of this subsection).
+
+\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you
+will be approximately correct with high probability if $S$ is drawn from the
+same distribution as above.
+
+\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you
+never learn anything about $X$.
+
+\paragraph{Open Question:} Construct a similar example where $g$ is submodular
+and where the observed sets have the same size as the budget.
+
+\paragraph{Positive results:} Try to obtain guarantees about learning the
+parameters of parametric submodular functions and show whether or not these
+guarantees are sufficient for optimization. First look at learning weights in
+a coverage function. Maybe facility location? Sums of concave over modular are
+probably too hard because of the connection to neural networks.
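+
+The following back-of-the-envelope calculation supports both claims (the exact
+high-probability statements remain to be checked). Under the sampling
+distribution above, $|S| = n/2 + \O(\sqrt{n\log n})$ and $|X\cap S| =
+\sqrt{n}/2 + \O(n^{1/4}\sqrt{\log n})$ with high probability, so that
+\begin{displaymath}
+    g(S) = \max\left\{\frac{n}{2} + \O\big(n^{3/4}\sqrt{\log n}\big),\,
+    |S|\right\} = \big(1 + \O(n^{-1/4}\sqrt{\log n})\big)\,|S|,
+\end{displaymath}
+i.e.\ outputting $|S|$ is correct up to a $1+o(1)$ multiplicative factor. On
+the other hand, under budget $\sqrt{n}$ the optimum is $g(X) =
+\sqrt{n}\,|X\cap X| = n$, whereas a set of size $\sqrt{n}$ chosen without any
+information about $X$ typically satisfies $|X\cap S| = \O(\log n)$ and hence
+$g(S) = \O(\sqrt{n}\log n)$: an approximation gap of order $\sqrt{n}$ up to
+logarithmic factors.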
+
+\subsection{Optimizable $\nRightarrow$ Learnable}
+
+Recall the matroid construction from~\cite{balcan2011learning}:
+\begin{theorem}
+  For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
+  \subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{\cal B} :
+  {\cal B} \subseteq {\cal A} \}$ such that:
+  \begin{itemize}
+      \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$
+      \item For every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we
+          have:
+          \begin{align*}
+              \text{rank}_{M_{\cal B}}(A) & = 8 \log k && \text{if } A\in{\cal B} \\
+                                          & = |A| && \text{if } A\in {\cal A}\backslash{\cal B}
+          \end{align*}
+  \end{itemize}
+\end{theorem}
+
+Consider the following subset of the above family of matroids: ${\cal
+M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq {\cal A} \wedge |{\cal
+A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$.
+Consider an \emph{unknown} function $f$, corresponding to the rank function of
+one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long
+as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
+optimize $f$ exactly! Indeed, since a rank function satisfies $f(A) \leq |A|$,
+the largest possible value of $f$ under cardinality constraint $n^{1/3}$ is
+$n^{1/3}$, and it is attained by any $A \in {\cal A}\backslash{\cal B}$.
+
+One example of a distribution under which this occurs with high probability is
+$D_u$, the uniform distribution over the sets of ${\cal A}$. For $c>0$, after
+$n^c$ observations $O \equiv {(S_i, f(S_i))}_i$ with $S_i \sim D_u$, we observe
+at least one set from ${\cal A}\backslash{\cal B}$ with high probability:
+
+\begin{equation} \nonumber
+    \mathbb{P}\big(\exists i,\; S_i \in {\cal A}\backslash{\cal B}\big)
+    \geq 1 - (1 - \epsilon)^{n^c} \geq 1 - e^{-\epsilon n^c}.
+\end{equation}
+
+However, it follows from the analysis of~\cite{balcan2011learning} that we
+cannot learn $f$ under any distribution, even with active value queries!
+Indeed, for any polynomially-sized set of observations, there exists a
+super-polynomial number of functions in ${\cal M}^{\epsilon}$ which coincide on
+this set of observations, but which take very different values outside of it:
+$8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal
+A}\backslash {\cal B}$.
+
+{\bf TODO:} A cleaner, simpler example would be nice.
+
+\subsection{Which learning guarantees imply optimization?}
+
+Here, we consider the following question: which learning guarantees imply
+optimization? For example, \cite{TODO} provides the following guarantee for
+coverage functions:
+
+\begin{theorem}
+There exists an algorithm which, for any $\epsilon>0$, given random and
+uniform examples of a coverage function $c$, outputs a hypothesis $h$, which is
+also a coverage function, such that with probability $2/3$: $\mathbb{E}_{\cal
+U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n)
+\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/\epsilon)$
+examples, where $s = \min\{\text{size}(c), (1/\epsilon)^{\log(1/\epsilon)}\}$.
+\end{theorem}
+
+We would like to understand to what extent this $\ell_1$-bound allows us to
+optimize the coverage function $c$ under cardinality constraints using the
+hypothesis $h$. Let us consider the simpler case of an additive function, and
+suppose we have a similar guarantee $\mathbb{E}_{x \sim {\cal U}}[|h(x) -
+c(x)|] \leq \epsilon$, where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x)
+\equiv \sum_i \hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$?
+
+As it turns out, yes.
Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let +$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection +of good sets ${\cal G} \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}| +\geq \frac{3}{4}\cdot2^n$. Suppose the contrary, there is at least a quarter of +the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal +U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| > +\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element +$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection +of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. Notice that +$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal +S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal +S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that +$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$ +and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$. + + + diff --git a/paper/sections/positive.tex b/paper/sections/positive.tex new file mode 100644 index 0000000..c4e8e0f --- /dev/null +++ b/paper/sections/positive.tex @@ -0,0 +1,365 @@ +\subsection{Applications} + +\paragraph{TODO:} Add citations! + +\subsubsection{Building blocks} + +These are generic examples and serve as building blocks for the applications +below: +\begin{itemize} + \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave + $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note + that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not + matter for the purpose of optimization: the sets are in the same order + with or without $g$, the only things which matter are the weights which + serve as natural parameters for this class of functions. This class of + functions contains: + \begin{itemize} + \item additive functions (when $g$ is the identity function). + \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the + case where the weights are $0/1$ and $g$ is the identity + function. + \item symmetric submodular functions: when the weights are all one. + \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for + some $B$. + \end{itemize} + \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in + \reals_+^{|N|}$. This class of functions is also naturally parametrized + by the weights. + \item (weighted) matroid rank functions. Given a matroid $M$ over a ground + set $N$, we define its rank function to be: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I| + \end{displaymath} + more generally, given a weight function $w:N\to\reals_+$, we define the + weighted matroid rank function: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i + \end{displaymath} +\end{itemize} + +\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid +rank function for the $1$-uniform matroid (the matroid where the independent +sets are the sets of size at most one). + +\subsubsection{Examples} + +\begin{itemize} + \item \emph{Coverage functions:} they can be written as a positive linear + combination of matroid rank functions: + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S) + \end{displaymath} + where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset, + \{u\}\big\}$. 
+ \item \emph{Facility location:} (cite Bilmes) there is a universe + $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each + pair of locations. We pick a subset of locations $S$ and each point in + the universe is allocated to its closest location (the one with highest + proximity): + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v} + \end{displaymath} + This can be seen as a sum of weighted matroid rank functions: one for + each location in the universe associated with a $1$-uniform matroid + (other applications: job scheduling). + + \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases) + written as a graph cut function. For image segmentation the goal is to + minimize the cut. + \begin{displaymath} + f(S) = \sum_{e\in E} w_e c_e(S) + \end{displaymath} + where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think + this can be written as a matroid rank function. + \item \emph{Learning} (cite Krause) there is + a hypothesis $A$ (a random variable) which is “refined” when more + observations are made. Imagine that there is a finite set $X_1,\dots, + X_n$ of possible observations (random variables). Then, assuming that + the observations are independent conditioned on $A$, the information + gain: + \begin{displaymath} + f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big) + \end{displaymath} + is submodular. The $\log\det$ is the specific case of a linear + hypothesis observed with additional independent Gaussian noise. + \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots, + X_n)$ are random variables, then: + $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots, + X_n)$ are jointly multivariate gaussian, then: + \begin{displaymath} + f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T + \end{displaymath} + for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function. + \item \emph{data subset selection/summarization:} in statistical machine + translation, Bilmes used sum of concave over modular: + \begin{displaymath} + f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right) + \end{displaymath} + where each $f$ represents a feature, $w_f(e)$ represents how much of + $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when + we have a lot of a given feature. + Facility location functions are also commonly used for subset selection. + \item \emph{concave spectral functions} One would be tempted to say that + any multivariate concave function of a modular function is submodular. + This would be the natural generalization of concave over modular. + However \textbf{this is not true in general}. However, a possible nice + generalization is the following. Let $M$ be a symmetric $n\times + n$ matrix, and $g$ is a ``matrix concave'' function. Then: + \begin{displaymath} + f(S) = \mathrm{Tr}\big(g(M_S)\big) + \end{displaymath} + is submodular. This contains the $\log\det$ (when $g$ is the matrix + $\log$) but tons of other functions (like quantum entropy). +\end{itemize} +In summary, the two most general classes of submodular functions (which capture +all the examples known to man) are: sums of matrix concave functions and sums +of matroid rank functions. Sums of concave over modular are also nice if we +want to start with a simpler example. + +\subsection{Additive Functions} + +We consider here the simple case of additive functions. A function $f$ is +additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that +$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. 
We will sometimes identify $f$ with its weight vector $w \equiv
+(w_s)_{s\in\Omega}$.
+
+Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{1 \leq j \leq n^c}$
+with $S_j \sim D$, where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$
+whose rows are $A_j = \chi_{S_j}$, the indicator vector of set $S_j$, and let
+$B$ be the $n^c \times 1$ vector with $B_j \equiv f(S_j)$. It is easy to see
+that if $w$ is the corresponding $n \times 1$ weight vector of $f$, then:
+\begin{displaymath}
+    A w = B
+\end{displaymath}
+
+Note that if $A$ has full column rank, then we can solve for $w$ exactly and
+also optimize $f$ under any cardinality constraint. We can therefore cast the
+question of $D$-learning and $D$-optimizing $f$ as a random matrix theory
+problem: what is the probability that, after $n^c$ (for $c > 0$) independent
+samples from $D$, the matrix $A$ has full column rank? See
+Section~\ref{sec:matrix_theory}.
+
+\paragraph{Extension}
+Note that the previous reasoning can easily be extended to any \emph{almost}
+additive function. Consider a function $g$ such that there exist $\alpha > 0$,
+$\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S)
+\leq g(S) \leq \beta f(S)$; then by solving for $\max_{S\in K} f(S)$ we obtain an
+$\alpha/\beta$-approximation to the optimum of $g$ since:
+
+\begin{displaymath}
+g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
+\frac{\alpha}{\beta} g(OPT)
+\end{displaymath}
+
+where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
+considering a function $g$ such that there exist $\alpha, \beta >0$, an
+additive function $f$ and an increasing bijective univariate function $\phi$,
+such that $\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In
+this case, we solve the system $A w = \phi^{-1}(B)$ and obtain once again an
+$\alpha/\beta$-approximation to the optimum of $g$.
+
+
+\subsubsection{Average weight method}
+
+We consider here another method for $D$-optimizing $f$, which only requires
+$D$-learning $f$ approximately. Note that for every element $i \in \Omega$, and
+for a \emph{product} distribution $D$:
+\begin{displaymath}
+    \mathbb{E}_{S \sim D}(f(S)\,|\,i \in S) - \mathbb{E}_{S \sim D}(f(S)\,|\,i \notin
+    S) = \sum_{j \in \Omega,\, j \neq i} w_j \left(\mathbb{P}(j\in S\,|\,i \in S) -
+    \mathbb{P}(j \in S\,|\,i \notin S)\right) + w_i(1 - 0) = w_i
+\end{displaymath}
+since, for a product distribution, $\mathbb{P}(j\in S\,|\,i \in S) =
+\mathbb{P}(j \in S\,|\,i \notin S)$ for every $j \neq i$.
+
+Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and
+let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S
+\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator:
+\begin{displaymath}
+    \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|}
+    \sum_{S \in O_i^c} f(S)
+\end{displaymath}
+
+If $D$ is a product distribution such that $\exists c > 0, \forall i,
+\mathbb{P}(i \in S) \geq c$, it is easy to show that $\forall i \in
+\Omega,\ \hat W_i \rightarrow w_i$ as $|O| \rightarrow +\infty$.
+By standard concentration bounds, we can hope to estimate every weight $w_i$
+to within an additive $\epsilon$ from $\text{poly}(n, \frac{1}{\epsilon})$
+observations.
+
+\subsubsection{Generic Random Matrix Theory}
+\label{sec:matrix_theory}
+
+We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
+
+\begin{proposition}\label{prop:non_isotropic_isometry}
+Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
+sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
+$\Sigma$.
Then for every $t \geq 0$, the following inequality holds with
+probability at least $1- 2 \exp(-ct^2)$:
+\begin{equation}
+    \left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2)
+    \quad\text{where}\quad \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
+\end{equation}
+where $C$, $c$ depend only on the subgaussian norm of the rows
+$K \equiv \max_i \| A_i\|_{\psi_2}$.
+\end{proposition}
+
+The following result is a simple corollary of
+Proposition~\ref{prop:non_isotropic_isometry}:
+
+\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
+  $k \in ]0,n[$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
+  independent Bernoulli random vectors in $\mathbb{R}^n$ such that
+  $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let
+  $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. If $N > {(C+1)}^2
+  n/\sigma^2$, then the matrix $A$ has full column rank with probability at least
+  $1 - e^{-cn}$, where $C, c$ are constants depending only on the subgaussian norm
+  of the rows $K \equiv \sup_{p \geq 1}
+  {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$.\footnote{Is this true?
+  And how do the constants behave?}
+\end{corollary}
+
+\begin{proof} Comparing the kernels of $A$ and $A^TA$ shows
+  that $\mathrm{rank}(A) = \mathrm{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it
+  follows that if $A^TA$ is invertible, then $A$ has full column rank. In other
+  words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$
+  has full column rank. Apply Prop.~\ref{prop:non_isotropic_isometry} with $t
+  \equiv \sqrt{n}$ and $N > {(C+1)}^{2} n$: with probability $1 -
+  2\exp(-cn)$,
+  \begin{equation}
+      \left\|\frac{1}{N} A^T A - \sigma I \right\| \leq (C+1)\sqrt{\frac{n}{N}}.
+  \end{equation}
+
+It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2
+-\sigma \right| \leq (C+1)\sqrt{\frac{n}{N}}$, and hence $\|A x\|^2_2 \geq N\left(\sigma
+- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
+$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$.
+\end{proof}
+
+\subsubsection{Algebraic Approach}
+
+Here we work over $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
+a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
+uniformly at random among all binary rows of length $n$. A standard counting
+argument shows that the number of non-singular $n \times n$ matrices over
+$\mathbb{F}_2$ is:
+\begin{displaymath}
+    N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
+    = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
+\end{displaymath}
+
+Hence the probability that our random matrix $A$ is invertible is:
+\begin{displaymath}
+    P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
+\end{displaymath}
+It is easy to see that $P_n$ is a decreasing sequence. Its limit is
+$\phi(\frac{1}{2})$, where $\phi$ is the Euler function. We have
+$\phi(\frac{1}{2})\approx 0.289$, and $P_n$ converges
+exponentially fast to this constant (one way to see this is to use the
+pentagonal number theorem).
+
+Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the
+probability that they have full column rank is at least:
+\begin{displaymath}
+    P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
+\end{displaymath}
+
+Note that this is the probability of having full column rank over
+$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
+column rank over $\mathbb{R}$.
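+
+As a sanity check on the constants, the bound above can be obtained by
+splitting the $\ell\cdot n$ observed rows into $\ell$ independent $n\times n$
+blocks: the stacked matrix has full column rank as soon as a single block is
+invertible over $\mathbb{F}_2$, and each block is invertible with probability
+$P_n \geq \phi(\frac{1}{2})$. Numerically, $1 - \phi(\frac{1}{2}) \approx
+0.711$, so the failure probability satisfies
+\begin{displaymath}
+    1 - P_{\ell\cdot n} \leq {(0.711)}^{\ell},
+    \qquad\text{e.g.}\quad \ell = 10 \implies 1 - P_{\ell\cdot n} \leq 0.04,
+\end{displaymath}
+and in general $\ell = \O\big(\log(1/\delta)\big)$ blocks suffice for failure
+probability $\delta$.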
+ +\paragraph{TODO:} Study the case where we only observe sets of size exactly +$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above +by: +\begin{displaymath} +{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j} +\end{displaymath} + +Thinking about it, why do we assume that the sample sets are of size exactly +$k$. I think it would make more sense from the learning perspective to consider +uniformly random sets. In this case, the above approach allows us to conclude +directly. + +More generally, I think the “right” way to think about this is to look at $A$ +as the adjacency matrix of a random $k$-regular graph. There are tons of +results about the probability of the adjacency matrix being non singular. + +\subsection{Influence Functions} + +Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we +can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$. +Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the +influence of the set $S \subseteq V$ in $G$ under the IC model with parameters +$p$. + +Recall from \cite{Kempe:03} that: +\begin{equation} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] +\end{equation} +where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with +probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of +\emph{there exists a path from $S$ to $v$ in $G_p$}. + +\begin{claim} +\label{cla:oracle} +If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v} +- \frac{1}{\alpha m}$, then: +\begin{displaymath} + \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1 + - 1/\alpha) \sigma_{G}(S,p) +\end{displaymath} +\end{claim} + +\begin{proof} + We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each + edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in + [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff + $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$). + It is clear from the construction that each edge $(u,v)$ will be present in + $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence: + \begin{displaymath} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] + \text{ and } + \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big] + \end{displaymath} + + By construction $G_{p'}$ is always a subgraph of $G_p$, i.e + $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand + side of the Claim's inequality. + + The probability that an edge is present in $G_p$ but not in $G_p'$ is at + most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha + m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that: + \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq + \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big] + \end{displaymath} + which proves the right-hand side of the claim after observing that + $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality + iff $m=1$. +\end{proof} + + We can use Claim~\ref{cla:oracle} to find a constant factor approximation + algorithm to maximising influence on $G$ by maximising influence on $G'$. For + $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and + $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous. 
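+
+Note that if $|p_{u,v} - p'_{u,v}| \leq \frac{1}{\alpha m}$ for every pair
+$(u,v)$, then the shifted parameters $\hat p_{u,v} \equiv p'_{u,v} -
+\frac{1}{\alpha m}$ satisfy
+\begin{displaymath}
+    p_{u,v} \geq \hat p_{u,v} \geq p_{u,v} - \frac{2}{\alpha m}
+    = p_{u,v} - \frac{1}{(\alpha/2)\, m},
+\end{displaymath}
+so Claim~\ref{cla:oracle} applies to $(p, \hat p)$ with $\alpha/2$ in place of
+$\alpha$; this observation drives the proof of the following proposition.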
+ + +\begin{proposition} +\label{prop:approx_optim} +Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$ +and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$. +Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that +$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$ +\end{proposition} + +\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} - + \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle} + with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by + greedy maximisation on $\hat G$. It is a classic exercise to show then that + $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r). +\end{proof} + +A small note on the approximation factor: it is only $>0$ for $\alpha > 2$. +Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq +\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible +polynomial-time approximation ratio to influence maximisation unless $P = NP$. +Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m +\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 - +\exp(-\exp(-2/\alpha))$. diff --git a/results.tex b/results.tex deleted file mode 100644 index 31801fc..0000000 --- a/results.tex +++ /dev/null @@ -1,616 +0,0 @@ -\documentclass[10pt]{article} -\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} -\usepackage[utf8x]{inputenc} - -\DeclareMathOperator{\E}{\mathbb{E}} -\let\P\relax -\DeclareMathOperator{\P}{\mathbb{P}} -\newcommand{\ex}[1]{\E\left[#1\right]} -\newcommand{\prob}[1]{\P\left[#1\right]} -\newcommand{\reals}{\mathbb{R}} -\newcommand{\ints}{\mathbb{N}} -\renewcommand{\O}{\mathcal{O}} - - -\newtheorem{proposition}{Proposition} -\newtheorem{corollary}{Corollary} -\newtheorem{problem}{Problem} -\newtheorem{theorem}{Theorem} -\newtheorem{claim}{Claim} -\newtheorem{remark}{Remark} - -\title{Learn and/or Optimize} -\author{} -\date{} - -\begin{document} -\maketitle - -\section{Preliminary Results} -\label{sec:matrix_theory} - -\subsection{Generic Random Matrix Theory} -We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40): - -\begin{proposition}\label{prop:non_isotropic_isometry} -Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent -sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix -$\Sigma$. Then for every $t \geq 0$, the following inequality holds with -probability at least: $1- 2 \exp(-ct^2)$: \begin{equation} \|\frac{1}{N} A^T A -- \Sigma \| \leq \max(\delta, \delta^2) \ \text{where}\ \delta = -C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}} \end{equation} where $C$, $c$ depend -only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$. -\end{proposition} - -The following result is a simple corollary of -Proposition~\ref{prop:non_isotropic_isometry}: - -\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and - $k \in ]0,n[$. 
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are - independent Bernoulli variable vectors in $\mathbb{R}^n$ such that - $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$ and let - $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$, then if $N > {(C+1)}^2 - n/\sigma^2$, the matrix A has full row rank with probability at least $1 - - e^{-cn}$, where $C, c$ are constant depending only on the subgaussian norm - of the rows $K \equiv \sup_{p \geq 1} - {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$\footnote{Is this true? - And how do the constants behave?} \end{corollary} - -\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice - that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it - follows that if $A^TA$ is invertible, then $A$ has full row rank. In other - words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ - has full row rank. Consider Prop.~\ref{prop:non_isotropic_isometry} with $t - \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$, then with probability $1 - - 2\exp(-cn)$: \begin{equation} \|\frac{1}{N} A^T A - \sigma I \| \leq - (C+1)\sqrt{\frac{n}{N}} \end{equation} - -It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2 --\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma -- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then -$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ \end{proof} - -\subsection{More Direct Approach} - -Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as -a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen -uniformly at random among all binary rows of length $n$. A standard counting -arguments shows that the number of non-singular matrices over $\mathbb{F}_2$ -is: -\begin{displaymath} - N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1}) - = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right) -\end{displaymath} - -Hence the probability that our random matrix $A$ is invertible is: -\begin{displaymath} - P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right) -\end{displaymath} -It is easy to see that $P_n$ is a decreasing sequence. Its limit is -$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have -$\phi(\frac{1}{2})\approx -0.289$ and $P_n$ converges -exponentially fast to this constant (one way to see this is to use the -Pentagonal number theorem). - -Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the -probability that they will have full column rank is at least: -\begin{displaymath} - P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell} -\end{displaymath} - -Note that this is the probability of having full column rank over -$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full -column rank over $\mathbb{R}$. - -\paragraph{TODO:} Study the case where we only observe sets of size exactly -$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above -by: -\begin{displaymath} -{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j} -\end{displaymath} - -Thinking about it, why do we assume that the sample sets are of size exactly -$k$. I think it would make more sense from the learning perspective to consider -uniformly random sets. In this case, the above approach allows us to conclude -directly. - -More generally, I think the “right” way to think about this is to look at $A$ -as the adjacency matrix of a random $k$-regular graph. 
There are tons of -results about the probability of the adjacency matrix to be non singular. - -\section{Examples} - -\subsection{Generic Functions} - -\paragraph{TODO:} Add citations! - -These are generic examples and serve as building blocks for the applications -below: -\begin{itemize} - \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave - $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note - that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not - matter for the purpose of optimization: the sets are in the same order - with or without $g$, the only things which matter are the weights which - serve as natural parameters for this class of functions. This class of - functions contains: - \begin{itemize} - \item additive functions (when $g$ is the identity function). - \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the - case where the weights are $0/1$ and $g$ is the identity - function. - \item symmetric submodular functions: when the weights are all one. - \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for - some $B$. - \end{itemize} - \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in - \reals_+^{|N|}$. This class of functions is also naturally parametrized - by the weights. - \item (weighted) matroid rank functions. Given a matroid $M$ over a ground - set $N$, we define its rank function to be: - \begin{displaymath} - \forall S\subseteq N,\; - r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I| - \end{displaymath} - more generally, given a weight function $w:N\to\reals_+$, we define the - weighted matroid rank function: - \begin{displaymath} - \forall S\subseteq N,\; - r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i - \end{displaymath} -\end{itemize} - -\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid -rank function for the $1$-uniform matroid (the matroid where the independent -sets are the sets of size at most one). - -\subsection{Applications} - -\begin{itemize} - \item \emph{Coverage functions:} they can be written as a positive linear - combination of matroid rank functions: - \begin{displaymath} - f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S) - \end{displaymath} - where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset, - \{u\}\big\}$. - \item \emph{Facility location:} (cite Bilmes) there is a universe - $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each - pair of locations. We pick a subset of locations $S$ and each point in - the universe is allocated to its closest location (the one with highest - proximity): - \begin{displaymath} - f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v} - \end{displaymath} - This can be seen as a sum of weighted matroid rank functions: one for - each location in the universe associated with a $1$-uniform matroid - (other applications: job scheduling). - - \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases) - written as a graph cut function. For image segmentation the goal is to - minimize the cut. - \begin{displaymath} - f(S) = \sum_{e\in E} w_e c_e(S) - \end{displaymath} - where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think - this can be written as a matroid rank function. - \item \emph{Learning} (cite Krause) there is - a hypothesis $A$ (a random variable) which is “refined” when more - observations are made. Imagine that there is a finite set $X_1,\dots, - X_n$ of possible observations (random variables). 
Then, assuming that - the observations are independent conditioned on $A$, the information - gain: - \begin{displaymath} - f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big) - \end{displaymath} - is submodular. The $\log\det$ is the specific case of a linear - hypothesis observed with additional independent Gaussian noise. - \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots, - X_n)$ are random variables, then: - $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots, - X_n)$ are jointly multivariate gaussian, then: - \begin{displaymath} - f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T - \end{displaymath} - for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function. - \item \emph{data subset selection/summarization:} in statistical machine - translation, Bilmes used sum of concave over modular: - \begin{displaymath} - f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right) - \end{displaymath} - where each $f$ represents a feature, $w_f(e)$ represents how much of - $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when - we have a lot of a given feature. - Facility location functions are also commonly used for subset selection. - \item \emph{concave spectral functions} One would be tempted to say that - any multivariate concave function of a modular function is submodular. - This would be the natural generalization of concave over modular. - However \textbf{this is not true in general}. However, a possible nice - generalization is the following. Let $M$ be a symmetric $n\times - n$ matrix, and $g$ is a ``matrix concave'' function. Then: - \begin{displaymath} - f(S) = \mathrm{Tr}\big(g(M_S)\big) - \end{displaymath} - is submodular. This contains the $\log\det$ (when $g$ is the matrix - $\log$) but tons of other functions (like quantum entropy). -\end{itemize} -In summary, the two most general classes of submodular functions (which capture -all the examples known to man) are: sums of matrix concave functions and sums -of matroid rank functions. Sums of concave over modular are also nice if we -want to start with a simpler example. - -\section{Passive Optimization} - -Let $\Omega$ be the universe of elements and $f$ a function defined on subsets -of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a -collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let -$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by -$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often -(but not always) equal to $|\Omega|$. - -In general, we say we can efficiently optimize a function $f$ under constraint -$K$ when we have a polynomial-time algorithm making adaptive value queries to -$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$ -with high probability and $\alpha$ an absolute constant. - -Here, we consider the scenario where we cannot make adaptive value queries, and -in fact, where we cannot make queries at all! Instead, we suppose that we -observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken -from a known distribution $D$. We say we can efficiently \emph{passively -optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$ -when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is -an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S) -\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant. 
- -In the case of \emph{passive} observations of set-value pairs under a -distribution $D$ for a function $f$, recent research has focused on whether we -can efficiently and approximately \emph{learn} $f$. This was formalized in the -PMAC model from \cite{balcan2011learning}. When thinking about passive -optimization, it is necessary to understand the link between being able to - $D-PMAC$ learn $f$ and being able to $D-$optimize $f$. - -\subsection{Additive function} - -We consider here the simple case of additive functions. A function $f$ is -additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that -$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes -adopt the notation $w \equiv f(\Omega)$. - -\subsubsection{Inverting the system}\label{subsec:invert_the_system} - -Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{j \in N}$ with -$S_j \sim D$ where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$ -where for all $i$, the row $A_i = \chi_{S_i}$, the indicator vector of set -$S_i$. Let $B$ be the $n^c \times 1$ vector such that $\forall i, b_j \equiv -f(S_j)$. It is easy to see that if $w$ is the $n \times 1$ corresponding weight -vector for $f$ then: -\begin{displaymath} - A w = B -\end{displaymath} - -Note that if $A$ has full column rank, then we can solve for $w$ exactly and -also optimize $f$ under any cardinality constraint. We can therefore cast the -question of $D-$learning and $D-$optimizing $f$ as a random matrix theory -problem: what is the probability that after $n^c$ for $c > 0$ independent -samples from $D$, the matrix $A$ will have full rank? See -section~\ref{sec:matrix_theory} - -\paragraph{Extension} -Note that the previous reasoning can easily be extended to any \emph{almost} -additive function. Consider a function $g$ such that there exists $\alpha > 0$ -and $\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S) -\leq g(S) \leq \beta f(S)$, then by solving for $\max_{S\in K} f(S)$ we have a -$\alpha/\beta$-approximation to the optimum of $g$ since: - -\begin{displaymath} -g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq -\frac{\alpha}{\beta} g(OPT) -\end{displaymath} - -where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by -considering a function $g$ such that there exists $\alpha, \beta >0$, an -additive function $f$ and a bijective univariate function $\phi$, such that -$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we -solve the system $A w = \phi^{-1}(B)$ and obtain once again an -$\alpha/\beta$-approximation to the optimum of $g$. - -\begin{remark} -Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We -would like to understand whether being able to $D-$learn $f$ is really -necessary to $D-$optimizing it. In fact, many results for PMAC-learning more -complex functions, such as general submodular functions, are negative. Can we -hope to find positive results in cases where PMAC-learning is impossible? -\end{remark} - -\subsubsection{Average weight method} - -We consider here another method to $D-$optimizing $f$, which only requires -$D-$learning $f$ approximately. 
Note that for every element $i \in \Omega$, and -for a \emph{product} distribution $D$: -\begin{displaymath} - \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin - S) = \sum_{j \neq i \in \Omega} w_s \left(\mathbb{P}(j\in S|i \in S) - - \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i -\end{displaymath} - -Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and -let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S -\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator: -\begin{displaymath} - \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|} - \sum_{S \in O_i^c} f(S) -\end{displaymath} - -If $D$ is a product distribution such that $ \exists c > 0, \forall i, -\mathbb{P}(i) \geq c$, it is -easy to show that $\forall i \in -\Omega,\ \hat W_i \rightarrow_{|O| \rightarrow +\infty} w_i$ -By using standard concentration bounds, we can hope to obtain within $poly(n, -\frac{1}{\epsilon})$ observations: - - -%For every node $i$, we compute the \emph{average weight of every set containing -%element $i$}. Let $w_i$ be -%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and -%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim -%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$ - -%Note that the average weight of every set containing element $i$ preserves the -%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in -%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator: - -%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S | -%i \in S} \frac{f(S) - pw}{1-p} \end{equation} - -%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We -%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's -%lemma: - -%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq -%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}} -%\end{equation} - -%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look -%at additive bounds for these zeros.} - -%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in -%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this -%assumption, if we choose the $k$ elements with largest estimated weight -%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, -%where OPT is the value of the maximum weight set of $k$ elements for the -%function $f$. To ensure that $N_i$ is sufficiently large for each element, we -%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a -%classic union bound: - -%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq -%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq -%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation} - -%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i -%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$ - -%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in - %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > - %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a - %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with - %probability $XXX$. 
\end{proposition} - -\subsection{General submodular functions} - -\subsection{Parametric submodular functions} - -\textbf{Note:} all known examples of submodular functions are parametric. The -question is not parametric vs non-parametric but whether or not the number of -parameters is polynomial or exponential in $n$ (the size of the ground set). -For all examples I know except martroid rank functions, it seems to be -polynomial. - -\subsubsection{Influence Functions} - -Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we -can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$. -Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the -influence of the set $S \subseteq V$ in $G$ under the IC model with parameters -$p$. - -Recall from \cite{Kempe:03} that: -\begin{equation} - \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] -\end{equation} -where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with -probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of -\emph{there exists a path from $S$ to $v$ in $G_p$}. - -\begin{claim} -\label{cla:oracle} -If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v} -- \frac{1}{\alpha m}$, then: -\begin{displaymath} - \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1 - - 1/\alpha) \sigma_{G}(S,p) -\end{displaymath} -\end{claim} - -\begin{proof} - We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each - edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in - [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff - $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$). - It is clear from the construction that each edge $(u,v)$ will be present in - $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence: - \begin{displaymath} - \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] - \text{ and } - \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big] - \end{displaymath} - - By construction $G_{p'}$ is always a subgraph of $G_p$, i.e - $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand - side of the Claim's inequality. - - The probability that an edge is present in $G_p$ but not in $G_p'$ is at - most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha - m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that: - \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq - \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big] - \end{displaymath} - which proves the right-hand side of the claim after observing that - $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality - iff $m=1$. -\end{proof} - - We can use Claim~\ref{cla:oracle} to find a constant factor approximation - algorithm to maximising influence on $G$ by maximising influence on $G'$. For - $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and - $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous. - - -\begin{proposition} -\label{prop:approx_optim} -Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$ -and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$. 
-Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that -$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$ -\end{proposition} - -\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} - - \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle} - with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by - greedy maximisation on $\hat G$. It is a classic exercise to show then that - $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r). -\end{proof} - -A small note on the approximation factor: it is only $>0$ for $\alpha > 2$. -Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq -\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible -polynomial-time approximation ratio to influence maximisation unless $P = NP$. -Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m -\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 - -\exp(-\exp(-2/\alpha))$. - -\subsubsection{Active set selection of stationary Gaussian Processes} - -\section{Passive Optimization vs. Passive Learning} - -\subsection{Failed Attempt: returning max of observations} - -This doesn't work. Give examples as to why! Remember that there are strong -concentration results for submodular functions -> look at expected value of -observed sets - -\subsection{Example where optimization possible, learning impossible} - -Recall the matroid construction from~\cite{balcan2011learning}: -\begin{theorem} - For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A} - \subseteq2^{[n]}$ and a family of matroids $\cal{M} = \{M_{\cal{B}} : - \cal{B} \subseteq\cal{A} \}$ such that: - \begin{itemize} - \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in \cal{A}$ - \item For every $\cal{B} \subseteq\cal{A}$ and every $A \in \cal{A}$, we - have: - \begin{align*} - \text{rank}_{M_{\cal{B}}}(A) & = 8 \log k \ if A\in{\cal B} \\ - & = |A| \ if A\in {\cal A}\backslash{\cal B} - \end{align*} - \end{itemize} -\end{theorem} - -Consider the following subset of the above family of matroids: ${\cal -M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal -A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$. -Consider an \emph{unknown} function $f$, corresponding to the rank function of -one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long -as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can -optimize $f$ exactly! Indeed, the largest possible value for $f$ under -cardinality constraint $n^{1/3}$ is $\max_{A\in 2^{[n]}} |A| = n^{1/3}$. - -One example of a distribution under which this occurs with probability at least -a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$. For -$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$, -we will observe at least one element from $\cal{A}\backslash\cal{B}$ with -constant probability: - -\begin{equation} \nonumber \mathbb{P}(\bigcup_{S_i} S_i \in {\cal -A}\backslash{\cal B}) \geq 1 - (1 - \epsilon)^{n^c} \approx \epsilon n^c -\end{equation} - -However, it follows from the analysis of~\cite{balcan2011learning} that we -cannot learn $f$ under any distribution, even with active value queries! 
-Indeed, for any polynomially-sized set of observations, there exists a -super-polynomially number of functions in ${\cal M^1}$ which coincide on this -set of observations, but which take very different values outside of this set -of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal -A}\backslash {\cal B}$. - -{\bf TODO:} A cleaner simpler example would be nice. - -\section{Meeting notes: 04.03.2015} - -Consider the following function: -\begin{displaymath} - g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\} -\end{displaymath} -where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where -each element is included with probability $1/2$. Then with high probability all -the samples will have size roughly $n/2$, so you will observe $g(S)=|S|$ with -high probability. - -\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you -will be correct with high probability is $S$ is drawn from the same -distribution as above. - -\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you -never learn anything about $X$. - -\paragraph{Open Question:} Cook a similar example where $g$ is submodular and -where you are observing sets of the same size as your budget. - -\paragraph{Positive results:} Try to obtain guarantees about learning -parameters of parametric submodular functions and show whether or not these -guarantees are sufficient for optimization. First look at learning weights in -a cover function. Maybe facility location? Sums of concave over modular are -probably too hard because of the connection to neural networks. - -\section{Which learning guarantees imply optimization?} - -Here, we consider the following question: which learning guarantees imply -optimization? For example, \cite{TODO} provides the following guarantee for -cover functions: - -\begin{theorem} -There exists an algorithm such that for any $\epsilon>0$ given random and -uniform examples of a coverage function $c$ outputs a hypothesis, which is also -a coverage function $h$, such that with probability $2/3$: $\mathbb{E}_{\cal -U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n) -\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/epsilon)$ -and examples, where $s = \min\{size(c), (1/\epsilon)^{\log(1/\epsilon)}\}$. -\end{theorem} - -We would like to understand to what extent this $\ell1$-bound allows us to -optimize the coverage function $c$ under cardinality constraints using the -hypothesis $h$. Let us consider the simpler case of an additive function, and -suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) - -c(x)|]$ where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv \sum_i -\hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$? - -As it turns out, yes. Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let -$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection -of good sets $\cal G \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}| -\geq \frac{3}{4}c\dot2^n$. Suppose the contrary, there is at least a quarter of -the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal -U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| > -\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element -$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection -of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. 
Notice that -$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal -S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal -S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that -$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$ -and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$. - -\bibliography{optimize} \bibliographystyle{apalike} - - -\end{document} -- cgit v1.2.3-70-g09d2 From 90211ba1847251c552361a7d0b2eef1c540ef72a Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Tue, 14 Apr 2015 15:27:24 -0400 Subject: Nips format and minor tweaks. We are already close to the page limit… MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paper/main.tex | 10 +- paper/nips15submit_e.sty | 236 ++++++++++++++++++++++++++++++++++++++++++++ paper/sections/negative.tex | 2 +- 3 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 paper/nips15submit_e.sty diff --git a/paper/main.tex b/paper/main.tex index 58b6b22..0b33bdd 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -1,6 +1,11 @@ \documentclass[10pt]{article} -\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} +\usepackage{nips15submit_e} +\usepackage{amsmath, amssymb, amsthm, bbm, microtype} \usepackage[utf8x]{inputenc} +\usepackage[capitalize, noabbrev]{cleveref} +\usepackage[pagebackref=false,breaklinks=true,colorlinks=true]{hyperref} +\usepackage{natbib} +\usepackage{url} \DeclareMathOperator{\E}{\mathbb{E}} \let\P\relax @@ -11,7 +16,6 @@ \newcommand{\ints}{\mathbb{N}} \renewcommand{\O}{\mathcal{O}} - \newtheorem{proposition}{Proposition} \newtheorem{corollary}{Corollary} \newtheorem{problem}{Problem} @@ -35,7 +39,7 @@ \section{Positive results} \input{sections/positive.tex} -\bibliographystyle{apalike} +\bibliographystyle{abbrv} \bibliography{optimize} diff --git a/paper/nips15submit_e.sty b/paper/nips15submit_e.sty new file mode 100644 index 0000000..75dc8a8 --- /dev/null +++ b/paper/nips15submit_e.sty @@ -0,0 +1,236 @@ +%%%% NIPS Macros (LaTex) +%%%% Style File +%%%% Dec 12, 1990 Rev Aug 14, 1991; Sept, 1995; April, 1997; April, 1999 + +% This file can be used with Latex2e whether running in main mode, or +% 2.09 compatibility mode. +% +% If using main mode, you need to include the commands +% \documentclass{article} +% \usepackage{nips10submit_e,times} +% as the first lines in your document. Or, if you do not have Times +% Roman font available, you can just use +% \documentclass{article} +% \usepackage{nips10submit_e} +% instead. +% +% If using 2.09 compatibility mode, you need to include the command +% \documentstyle[nips10submit_09,times]{article} +% as the first line in your document. Or, if you do not have Times +% Roman font available, you can include the command +% \documentstyle[nips10submit_09]{article} +% instead. + +% Change the overall width of the page. If these parameters are +% changed, they will require corresponding changes in the +% maketitle section. 
+% +\usepackage{eso-pic} % used by \AddToShipoutPicture + +\renewcommand{\topfraction}{0.95} % let figure take up nearly whole page +\renewcommand{\textfraction}{0.05} % let figure take up nearly whole page + +% Define nipsfinal, set to true if nipsfinalcopy is defined +\newif\ifnipsfinal +\nipsfinalfalse +\def\nipsfinalcopy{\nipsfinaltrue} +\font\nipstenhv = phvb at 8pt % *** IF THIS FAILS, SEE nips10submit_e.sty *** + +% Specify the dimensions of each page + +\setlength{\paperheight}{11in} +\setlength{\paperwidth}{8.5in} + +\oddsidemargin .5in % Note \oddsidemargin = \evensidemargin +\evensidemargin .5in +\marginparwidth 0.07 true in +%\marginparwidth 0.75 true in +%\topmargin 0 true pt % Nominal distance from top of page to top of +%\topmargin 0.125in +\topmargin -0.625in +\addtolength{\headsep}{0.25in} +\textheight 9.0 true in % Height of text (including footnotes & figures) +\textwidth 5.5 true in % Width of text line. +\widowpenalty=10000 +\clubpenalty=10000 + +% \thispagestyle{empty} \pagestyle{empty} +\flushbottom \sloppy + +% We're never going to need a table of contents, so just flush it to +% save space --- suggested by drstrip@sandia-2 +\def\addcontentsline#1#2#3{} + +% Title stuff, taken from deproc. +\def\maketitle{\par +\begingroup + \def\thefootnote{\fnsymbol{footnote}} + \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author + % name centering +% The footnote-mark was overlapping the footnote-text, +% added the following to fix this problem (MK) + \long\def\@makefntext##1{\parindent 1em\noindent + \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} + \@maketitle \@thanks +\endgroup +\setcounter{footnote}{0} +\let\maketitle\relax \let\@maketitle\relax +\gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax} + +% The toptitlebar has been raised to top-justify the first page + +% Title (includes both anonimized and non-anonimized versions) +\def\@maketitle{\vbox{\hsize\textwidth +\linewidth\hsize \vskip 0.1in \toptitlebar \centering +{\LARGE\bf \@title\par} \bottomtitlebar % \vskip 0.1in % minus +\ifnipsfinal + \def\And{\end{tabular}\hfil\linebreak[0]\hfil + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% + \def\AND{\end{tabular}\hfil\linebreak[4]\hfil + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\@author\end{tabular}% +\else + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt} +Anonymous Author(s) \\ +Affiliation \\ +Address \\ +\texttt{email} \\ +\end{tabular}% +\fi +\vskip 0.3in minus 0.1in}} + +\renewenvironment{abstract}{\vskip.075in\centerline{\large\bf +Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} + +% sections with less space +\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus + -0.5ex minus -.2ex}{1.5ex plus 0.3ex +minus0.2ex}{\large\bf\raggedright}} + +\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus +-0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}} +\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex +plus -0.5ex minus -.2ex}{0.5ex plus +.2ex}{\normalsize\bf\raggedright}} +\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus +0.5ex minus .2ex}{-1em}{\normalsize\bf}} +\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus + 0.5ex minus .2ex}{-1em}{\normalsize\bf}} +\def\subsubsubsection{\vskip +5pt{\noindent\normalsize\rm\raggedright}} + + +% Footnotes +\footnotesep 6.65pt % +\skip\footins 9pt plus 4pt minus 2pt +\def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt } +\setcounter{footnote}{0} + +% 
Lists and paragraphs +\parindent 0pt +\topsep 4pt plus 1pt minus 2pt +\partopsep 1pt plus 0.5pt minus 0.5pt +\itemsep 2pt plus 1pt minus 0.5pt +\parsep 2pt plus 1pt minus 0.5pt +\parskip .5pc + + +%\leftmargin2em +\leftmargin3pc +\leftmargini\leftmargin \leftmarginii 2em +\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em + +%\labelsep \labelsep 5pt + +\def\@listi{\leftmargin\leftmargini} +\def\@listii{\leftmargin\leftmarginii + \labelwidth\leftmarginii\advance\labelwidth-\labelsep + \topsep 2pt plus 1pt minus 0.5pt + \parsep 1pt plus 0.5pt minus 0.5pt + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii\advance\labelwidth-\labelsep + \topsep 1pt plus 0.5pt minus 0.5pt + \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt + \itemsep \topsep} +\def\@listiv{\leftmargin\leftmarginiv + \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} +\def\@listv{\leftmargin\leftmarginv + \labelwidth\leftmarginv\advance\labelwidth-\labelsep} +\def\@listvi{\leftmargin\leftmarginvi + \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} + +\abovedisplayskip 7pt plus2pt minus5pt% +\belowdisplayskip \abovedisplayskip +\abovedisplayshortskip 0pt plus3pt% +\belowdisplayshortskip 4pt plus3pt minus3pt% + +% Less leading in most fonts (due to the narrow columns) +% The choices were between 1-pt and 1.5-pt leading +%\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} % got rid of @ (MK) +\def\normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} +\def\small{\@setsize\small{10pt}\ixpt\@ixpt} +\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} +\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} +\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} +\def\large{\@setsize\large{14pt}\xiipt\@xiipt} +\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} +\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} +\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} +\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} + +\def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip} + +\def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip +.09in} % +%Reduced second vskip to compensate for adding the strut in \@author + +% Vertical Ruler +% This code is, largely, from the CVPR 2010 conference style file +% ----- define vruler +\makeatletter +\newbox\nipsrulerbox +\newcount\nipsrulercount +\newdimen\nipsruleroffset +\newdimen\cv@lineheight +\newdimen\cv@boxheight +\newbox\cv@tmpbox +\newcount\cv@refno +\newcount\cv@tot +% NUMBER with left flushed zeros \fillzeros[] +\newcount\cv@tmpc@ \newcount\cv@tmpc +\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi +\cv@tmpc=1 % +\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi + \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat +\ifnum#2<0\advance\cv@tmpc1\relax-\fi +\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat +\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% +% \makevruler[][][][][] +\def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip +\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% +\global\setbox\nipsrulerbox=\vbox to \textheight{% +{\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight +\cv@lineheight=#1\global\nipsrulercount=#2% +\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% +\cv@refno1\vskip-\cv@lineheight\vskip1ex% +\loop\setbox\cv@tmpbox=\hbox to0cm{{\nipstenhv\hfil\fillzeros[#4]\nipsrulercount}}% +\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 
+\advance\cv@refno1\global\advance\nipsrulercount#3\relax +\ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% +\makeatother +% ----- end of vruler + +% \makevruler[][][][][] +\def\nipsruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\nipsrulerbox}} +\AddToShipoutPicture{% +\ifnipsfinal\else +\nipsruleroffset=\textheight +\advance\nipsruleroffset by -3.7pt + \color[rgb]{.7,.7,.7} + \AtTextUpperLeft{% + \put(\LenToUnit{-35pt},\LenToUnit{-\nipsruleroffset}){%left ruler + \nipsruler{\nipsrulercount}} + } +\fi +} diff --git a/paper/sections/negative.tex b/paper/sections/negative.tex index f88117a..2eaeb47 100644 --- a/paper/sections/negative.tex +++ b/paper/sections/negative.tex @@ -25,7 +25,7 @@ guarantees are sufficient for optimization. First look at learning weights in a cover function. Maybe facility location? Sums of concave over modular are probably too hard because of the connection to neural networks. -\subsection{Optimizable $n\Rightarrow$ Learnable} +\subsection{Optimizable $\nRightarrow$ Learnable} Recall the matroid construction from~\cite{balcan2011learning}: \begin{theorem} -- cgit v1.2.3-70-g09d2 From ead8ef85b50848f078a360b040f59ea962821cb4 Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Tue, 14 Apr 2015 16:46:34 -0400 Subject: Sketch of the introduction --- paper/sections/introduction.tex | 81 +++++++++++++++++++++++++++-------------- paper/sections/negative.tex | 30 +++++++++++++++ 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/paper/sections/introduction.tex b/paper/sections/introduction.tex index bcb406f..6eda59d 100644 --- a/paper/sections/introduction.tex +++ b/paper/sections/introduction.tex @@ -1,27 +1,54 @@ -Let $\Omega$ be the universe of elements and $f$ a function defined on subsets -of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a -collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let -$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by -$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often -(but not always) equal to $|\Omega|$. - -In general, we say we can efficiently optimize a function $f$ under constraint -$K$ when we have a polynomial-time algorithm making adaptive value queries to -$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$ -with high probability and $\alpha$ an absolute constant. - -Here, we consider the scenario where we cannot make adaptive value queries, and -in fact, where we cannot make queries at all! Instead, we suppose that we -observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken -from a known distribution $D$. We say we can efficiently \emph{passively -optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$ -when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is -an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S) -\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant. - -In the case of \emph{passive} observations of set-value pairs under a -distribution $D$ for a function $f$, recent research has focused on whether we -can efficiently and approximately \emph{learn} $f$. This was formalized in the -PMAC model from \cite{balcan2011learning}. When thinking about passive -optimization, it is necessary to understand the link between being able to - $D-PMAC$ learn $f$ and being able to $D-$optimize $f$. 
+The natural “diminishing return” property of submodular functions makes them
+suitable to model many real-life scenarios. However, even when they are
+suitable as a model, it is not realistic to assume that they are entirely
+known. For example, most submodular functions depend on unknown parameters that
+have to be learned from observations.
+
+A recent line of work has focused on learning submodular functions. The
+PMAC learning framework formalizes what it means to learn a submodular
+function in an “overall” sense. For specific examples of submodular functions,
+results are known on how well the parameters of the function can be learned
+from observations.
+
+When the experimenter wishes to optimize the unknown submodular function, the
+well-known approximation algorithms cannot be readily applied because they rely
+on a value query oracle, which is much stronger than what the present scenario
+suggests (at best, random independent samples). The results on learning
+submodular functions also seem to be insufficient in that they are myopic to
+the ultimate use which will be made of the submodular function (optimizing).
+The experimenter only cares about learning the function to the
+extent that what he knows is sufficient to optimize it. As such, the main
+question underlying the present work is the following:
+
+\begin{center}
+    \emph{What is a good model of learning for submodular functions when the
+    ultimate goal is to be able to optimize them?}
+\end{center}
+
+We make the following contributions:
+\begin{itemize}
+    \item We show that the PMAC learning framework does not provide a satisfying
+        framework for optimizing under uncertainty. In particular, we show that
+        there are functions which are PMAC-learnable but not optimizable and
+        vice-versa.
+    \item For specific classes of submodular functions, we show how to optimize
+        them under uncertainty.
+\end{itemize}
+
+\subsection{Related Work}
+
+\paragraph{Maximizing submodular functions} There is a vast literature on
+maximizing submodular functions under many kinds of constraints. All of these
+algorithms rely on some kind of oracle (e.g.\ value queries or demand queries)
+which is stronger than what we have: access to polynomially many observations
+coming from a distribution. In particular, an algorithm cannot make arbitrary
+(and certainly not adaptive) queries to the oracle.
+
+\paragraph{PMAC-learning} An interesting line of work, but one which is
+oblivious to the ultimate goal.
+
+\paragraph{Bandits} This line of work is interesting in that it ties together
+learning and optimizing. However, it assumes that arbitrary queries can be made
+to an oracle, allowing the experimenter to explore more efficiently (even
+though the standard regret benchmark means that he will incur some cost for
+exploring vs.\ exploiting).
diff --git a/paper/sections/negative.tex b/paper/sections/negative.tex
index 2eaeb47..555aa1d 100644
--- a/paper/sections/negative.tex
+++ b/paper/sections/negative.tex
@@ -1,3 +1,33 @@
+\subsection{Model}
+
+Let $\Omega$ be the universe of elements and $f$ a function defined on subsets
+of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a
+collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let
+$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by
+$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often
+(but not always) equal to $|\Omega|$.
+
+In general, we say we can efficiently optimize a function $f$ under constraint
+$K$ when we have a polynomial-time algorithm making adaptive value queries to
+$f$, which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
+with high probability, where $\alpha$ is an absolute constant.
+
+Here, we consider the scenario where we cannot make adaptive value queries, and
+in fact, where we cannot make queries at all! Instead, we suppose that we
+observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is drawn
+from a known distribution $D$. We say we can efficiently \emph{passively
+optimize} $f$ under distribution $D$, or $D-$optimize $f$ under constraints $K$,
+when, after observing ${\cal O}(L^c)$ set-value pairs from $D$, where $c > 0$ is
+an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
+\geq \alpha f(S^*)$ with high probability, for an absolute constant $\alpha$.
+
+In the case of \emph{passive} observations of set-value pairs under a
+distribution $D$ for a function $f$, recent research has focused on whether we
+can efficiently and approximately \emph{learn} $f$. This was formalized in the
+PMAC model of \cite{balcan2011learning}. When thinking about passive
+optimization, it is necessary to understand the link between being able to
+$D-$PMAC learn $f$ and being able to $D-$optimize $f$.
+
 \subsection{Learnable $\nRightarrow$ Optimizable}
 
 Consider the following function:
-- 
cgit v1.2.3-70-g09d2