\subsection{Model}
We consider a submodular function $f$ defined over a ground set $N$ of size
$n$. We assume that $f$ is non-negative and normalized:
\begin{displaymath}
f:2^N\to \reals^+,\quad f(\emptyset) = 0
\end{displaymath}
Given a collection $K \subseteq 2^N$ representing our constraints (e.g., $K$ is
an independence system), we wish to approximate a solution $S^*$ of the
following optimization problem:
\begin{displaymath}
S^*\in\argmax_{S\in K} f(S)
\end{displaymath}
Unlike the standard submodular optimization framework, we do not assume access
to a value query oracle for $f$. Denoting by $U_K$ the uniform distribution
over $K$, we assume instead that we are given access to polynomially many
independent samples drawn from $U_K$; these samples are the only information
we observe about $f$.
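To make this access model concrete, here is a minimal Python sketch (the names
\texttt{draw\_samples}, \texttt{sample\_uniform\_subset} and the stand-in
objective are ours, purely for illustration): the algorithm receives the pairs
$(S_i, f(S_i))$ and nothing else.
\begin{verbatim}
import random

def draw_samples(f, sample_from_K, num_samples):
    """All the algorithm ever sees: i.i.d. pairs (S_i, f(S_i)), S_i ~ U_K."""
    return [(S, f(S)) for S in (sample_from_K() for _ in range(num_samples))]

# Example with K = 2^N: including each element independently with
# probability 1/2 is exactly the uniform distribution over K.
n = 100

def sample_uniform_subset():
    return frozenset(i for i in range(n) if random.random() < 0.5)

def f_example(S):
    # Stand-in submodular objective (concave of cardinality), illustrative only.
    return len(S) ** 0.5

data = draw_samples(f_example, sample_uniform_subset, num_samples=1000)
\end{verbatim}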
The goal is to understand whether or not it is possible to optimize $f$ given
these observations. This leads to the following definition.
\begin{definition}
Let $\mathcal{F}$ be a family of functions $2^N\to\reals^+$. For
$\alpha:\reals^+\to\reals^+_*$ and constraints $K$, we say that
$\mathcal{F}$ is $(\alpha, K)$-optimizable iff for any $f\in\mathcal{F}$
and $\eps>0$, there exists $\ell = \mathrm{poly}(n,\frac{1}{\eps})$ and an
algorithm $A$ running in time $\mathrm{poly}(\ell)$ such that given random
inputs $\mathcal{S} = \{(S_i, f(S_i))\}_{1\leq i\leq \ell}$, where the
variables $S_1,\dots, S_\ell$ are drawn independently from $U_K$, $A$
outputs a set $S$ such that:
\begin{displaymath}
\P_\mathcal{S}\big[f(S)\geq \alpha(\eps) f(S^*)\big]\geq 1-\eps
\end{displaymath}
\end{definition}
One of the goals of the present work is to compare this notion of $(\alpha,
K)$-optimizability to the following notion of learnability.
\begin{definition}
\label{def:learnable}
Let $\mathcal{F}$ be a family of functions $2^N\to\reals^+$. For
$\alpha:\reals^+\to\reals^+_*$ and constraints $K$, we say that
$\mathcal{F}$ is $(\alpha, K)$-learnable iff for any $f\in\mathcal{F}$,
$\eps>0$ and $\delta>0$, there exists $\ell
= \mathrm{poly}(n,\frac{1}{\eps},\frac{1}{\delta})$ and an
algorithm $A$ such that given random inputs $\mathcal{S} = \{(S_i,
f(S_i))\}_{1\leq i\leq \ell}$, where the variables $S_1,\dots, S_\ell$ are
drawn independently from $U_K$, $A$ outputs a function $\tilde{f}$ such that:
\begin{displaymath}
\P_\mathcal{S}\left[\P_{S\sim U_K} [f(S)\geq \tilde{f}(S)\geq
\alpha(\eps) f(S)]\geq 1-\eps\right]\geq 1-\delta
\end{displaymath}
\end{definition}
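Schematically, the two notions differ in the object the algorithm outputs: an
optimizer maps the samples to a single feasible set, while a learner maps them
to a value predictor. A minimal (purely illustrative) Python contrast:
\begin{verbatim}
# (alpha, K)-optimizability: samples -> one feasible set.
def optimizer(samples):
    # samples = [(S_1, f(S_1)), ..., (S_l, f(S_l))]; e.g. return the
    # best observed set (a valid strategy only for some K and F).
    return max(samples, key=lambda pair: pair[1])[0]

# (alpha, K)-learnability: samples -> a function predicting f on fresh draws.
def learner(samples):
    avg = sum(value for _, value in samples) / len(samples)
    return lambda S: avg  # constant predictor, illustrative only
\end{verbatim}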
\subsection{Learnable $\nRightarrow$ Optimizable}
Consider the following function:
\begin{displaymath}
g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\}
\end{displaymath}
where $X$ is a hidden set with $|X|=\sqrt{n}$. Assume we are given polynomially
many samples in which each element is included independently with probability
$1/2$. Then with high probability every sample $S$ satisfies $|S| =
(1\pm o(1))\,n/2$ and $\sqrt{n}\,|X\cap S| = (1\pm o(1))\,n/2$, so we observe
$g(S) = (1\pm o(1))\,|S|$.
\paragraph{Claim 1:} $g$ is PAC-learnable: outputting the hypothesis $\tilde
g(S) = |S|$ is correct up to a $1 \pm o(1)$ factor with high probability if $S$
is drawn from the same distribution as above.
\paragraph{Claim 2:} $g$ is not optimizable under a cardinality budget of
$\sqrt{n}$ because the samples reveal nothing about $X$: the optimum is $g(X)
= \sqrt{n}\,|X| = n$, while any budget-feasible set $S$ chosen independently
of $X$ has $|X \cap S| = O(\log n)$ with high probability, and hence value
$O(\sqrt{n}\log n)$.
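The following Python simulation (a sketch; it samples the relevant
cardinalities directly rather than materializing the sets) illustrates both
claims numerically:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 10**8
sqrt_n = int(n ** 0.5)  # |X| = sqrt(n)

# For S uniform over 2^N, |X cap S| ~ Bin(sqrt_n, 1/2) and
# |S \ X| ~ Bin(n - sqrt_n, 1/2) are independent.
inter = rng.binomial(sqrt_n, 0.5, size=1000)             # |X cap S|
size = inter + rng.binomial(n - sqrt_n, 0.5, size=1000)  # |S|
g_val = np.maximum(sqrt_n * inter, size)

# Claim 1: g(S) = (1 +/- o(1)) |S| on uniform samples.
print("max relative gap:", np.abs(g_val / size - 1).max())  # a few percent

# Claim 2: a budget-sqrt(n) set chosen independently of X has
# |X cap S| ~ Bin(sqrt_n, sqrt_n / n), i.e. O(log n) w.h.p., so its
# value is O(sqrt(n) log n), while the optimum is g(X) = n.
blind = rng.binomial(sqrt_n, sqrt_n / n, size=1000)
print("best blind value:", sqrt_n * max(blind.max(), 1), "vs optimum:", n)
\end{verbatim}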
\paragraph{Open Question:} Construct a similar example in which $g$ is
submodular and the observed sets have the same size as the budget.
\paragraph{Positive results:} Try to obtain guarantees for learning the
parameters of parametric submodular functions, and show whether or not these
guarantees suffice for optimization. First look at learning the weights of a
coverage function. Maybe facility location? Sums of concave functions of
modular functions are probably too hard because of the connection to neural
networks.
\subsection{Optimizable $\nRightarrow$ Learnable}
Recall the matroid construction from~\cite{balcan2011learning}:
\begin{theorem}
For any $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
\subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{\cal B} :
{\cal B} \subseteq {\cal A}\}$ such that:
\begin{itemize}
\item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$
\item For every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we
have:
\begin{displaymath}
\text{rank}_{M_{\cal B}}(A) =
\begin{cases}
8 \log k & \text{if } A \in {\cal B} \\
|A| & \text{if } A \in {\cal A}\backslash{\cal B}
\end{cases}
\end{displaymath}
\end{itemize}
\end{theorem}
Consider the following subfamily of the above family of matroids: ${\cal
M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq {\cal A} \wedge |{\cal
A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$, with $k = 2^{n^{1/6}}$.
Consider an \emph{unknown} function $f$ corresponding to the rank function of
one of the matroids $M_{\cal B}$ in ${\cal M}^{\epsilon}$. Note that as soon
as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
optimize $f$ exactly! Indeed, since $f$ is a matroid rank function, its
largest possible value under the cardinality constraint $n^{1/3}$ is
$n^{1/3}$, and any observed $A \in {\cal A}\backslash{\cal B}$ attains it:
$f(A) = |A| = n^{1/3}$.
One distribution under which this occurs with high probability is $D_u$, the
uniform distribution over the sets of ${\cal A}$. For $c>0$, after $n^c$
observations $O \equiv (S_i, f(S_i))_i$ with $S_i \sim D_u$, we observe at
least one set from ${\cal A}\backslash{\cal B}$ with probability:
\begin{equation} \nonumber
\mathbb{P}\big(\exists i : S_i \in {\cal A}\backslash{\cal B}\big)
= 1 - (1 - \epsilon)^{n^c} \geq 1 - e^{-\epsilon n^c}
\end{equation}
However, it follows from the analysis of~\cite{balcan2011learning} that we
cannot learn $f$ under any distribution, even with active value queries!
Indeed, for any polynomially-sized set of observations, there exists a
super-polynomial number of functions in ${\cal M}^{\epsilon}$ which coincide
on this set of observations, but which take very different values outside of
it: $8 \log k = 8n^{1/6}$ for $A \in {\cal B}$ versus $n^{1/3}$ for $A \in
{\cal A}\backslash {\cal B}$.
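As a sanity check, here is a toy Python simulation; note that it only mimics
the \emph{values} the rank function takes on ${\cal A}$ (indexing the sets of
${\cal A}$ by integers) and is not the actual matroid construction:
\begin{verbatim}
import random

n = 10**6
m = round(n ** (1 / 3))        # |A| = n^{1/3} = 100 for each A in the family
k = 2 ** round(n ** (1 / 6))   # |family| = k = 2^{n^{1/6}} = 1024
low = 8 * round(n ** (1 / 6))  # 8 log k = 8 n^{1/6} = 80 < n^{1/3}

eps = 0.25
family = range(k)              # index the sets of the family by 0..k-1
B = set(random.sample(family, int((1 - eps) * k)))  # hidden B, |A \ B| >= eps k

def f(idx):                    # value of the idx-th set under rank_{M_B}
    return low if idx in B else m

# Draw poly many samples from D_u (uniform over the family): any sample
# with value m = n^{1/3} is an exact maximizer under the budget n^{1/3}.
samples = [random.choice(family) for _ in range(200)]
print("found a maximizer:", any(f(i) == m for i in samples))
# True with probability >= 1 - (1 - eps)^200.
\end{verbatim}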
{\bf TODO:} A cleaner, simpler example would be nice.
\subsection{Which learning guarantees imply optimization?}
Here, we consider the following question: which learning guarantees imply
optimization? For example, \cite{TODO} provides the following guarantee for
cover functions:
\begin{theorem}
There exists an algorithm which, for any $\epsilon>0$, given random and
uniform examples of a coverage function $c$, outputs a hypothesis $h$, which
is also a coverage function, such that with probability $2/3$: $\mathbb{E}_{x
\sim {\cal U}}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time
$\tilde {\cal O}(n) \cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot
\text{poly}(s/\epsilon)$ examples, where $s = \min\{\mathrm{size}(c),
(1/\epsilon)^{\log(1/\epsilon)}\}$.
\end{theorem}
We would like to understand to what extent this $\ell_1$-bound allows us to
optimize the coverage function $c$ under cardinality constraints using the
hypothesis $h$. Let us consider the simpler case of an additive function and
suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) -
c(x)|] \leq \epsilon$, where $c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv
\sum_i \hat w_i x_i$ for all $x$. Can we find a bound on $\|w - \hat
w\|_{\infty}$?
As it turns out, yes. Let $\alpha_i \equiv w_i - \hat w_i$ for all $i$, and
for a set $S$ write $v(S) \equiv |h(S) - c(S)| = |\sum_{i \in S} \alpha_i|$.
Consider the collection of good sets ${\cal G} \equiv \{S : v(S) <
4\epsilon\}$. We claim that $|{\cal G}| \geq \frac{3}{4}\cdot 2^n$. Suppose
the contrary: more than a quarter of the sets satisfy $v(S) \geq 4\epsilon$,
so that $\mathbb{E}_{S \sim {\cal U}}[v(S)] \geq \frac{1}{2^n}\sum_{S \in
{\cal G}^c} v(S) > \frac{1}{4}\cdot4\epsilon = \epsilon$, a contradiction.
Now let $i$ be an element such that $|\alpha_i| = \|\alpha\|_{\infty}$, and
consider the collection of sets which contain $i$: ${\cal S}_i \equiv \{S : i
\in S\}$. Notice that $|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$, so $|{\cal G}
\cap {\cal S}_i^c| \geq |{\cal G}| - |{\cal S}_i| \geq \frac{1}{4}\cdot 2^n$.
For every set $S$ in ${\cal G} \cap {\cal S}_i^c$, the triangle inequality
gives $v(S\cup\{i\}) \geq |\alpha_i| - v(S) \geq |\alpha_i| - 4\epsilon$, and
the map $S \mapsto S\cup\{i\}$ is injective on ${\cal S}_i^c$. It follows that
$\epsilon \geq \mathbb{E}_{S \sim {\cal U}}[v(S)] \geq
\frac{1}{4}(|\alpha_i| - 4\epsilon)$, and therefore we have $\|w - \hat
w\|_{\infty} \leq 8 \epsilon$.
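A quick numerical check of this bound, exhaustively enumerating the $2^n$ sets
for a small $n$ (the weights and the perturbation are arbitrary):
\begin{verbatim}
import itertools
import numpy as np

rng = np.random.default_rng(1)
n = 12
w = rng.uniform(0.0, 1.0, size=n)
w_hat = w + rng.uniform(-0.05, 0.05, size=n)  # arbitrary hypothesis weights
alpha = w - w_hat

# eps = E_{S ~ U}[ v(S) ], computed exactly over all 2^n indicator vectors.
masks = np.array(list(itertools.product([0, 1], repeat=n)))
eps = np.abs(masks @ alpha).mean()

print("||w - w_hat||_inf =", np.abs(alpha).max())
print("8 * eps           =", 8 * eps)
assert np.abs(alpha).max() <= 8 * eps  # the bound proved above
\end{verbatim}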