\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm, bbm}
\usepackage[utf8x]{inputenc}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\ints}{\mathbb{N}}
\renewcommand{\O}{\mathcal{O}}


\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\newtheorem{claim}{Claim}
\newtheorem{remark}{Remark}

\title{Learn and/or Optimize}
\author{}
\date{}

\begin{document}
\maketitle

\section{Preliminary Results}
\label{sec:matrix_theory}

\subsection{Generic Random Matrix Theory}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):

\begin{proposition}\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
$\Sigma$. Then for every $t \geq 0$, the following inequality holds with
probability at least $1 - 2\exp(-ct^2)$:
\begin{equation}
  \left\|\frac{1}{N} A^T A - \Sigma\right\| \leq \max(\delta, \delta^2)
  \quad\text{where}\quad \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}},
\end{equation}
and $C$, $c$ depend only on the sub-gaussian norm of the rows, $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}

The following result is a simple corollary of
Proposition~\ref{prop:non_isotropic_isometry}:

\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
  $k \in (0,n)$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
  independent random vectors in $\mathbb{R}^n$ with Bernoulli entries such that
  $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let
  $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. Then if $N > {(C+1)}^2
  n/\sigma^2$, the matrix $A$ has full column rank with probability at least $1 -
  e^{-cn}$, where $C, c$ are constants depending only on the subgaussian norm
  of the rows $K \equiv \sup_{p \geq 1}
  {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$\footnote{Is this true?
  And how do the constants behave?}. \end{corollary}

\begin{proof} Comparing the kernels of $A$ and $A^TA$ shows that
  $\operatorname{rank}(A) = \operatorname{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it
  follows that if $A^TA$ is invertible, then $A$ has full column rank. In other
  words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$
  has full column rank. Applying Prop.~\ref{prop:non_isotropic_isometry} with $t
  \equiv \sqrt{n}$ and $N > {(C+1)}^{2} n$, we get that with probability at least $1 -
  2\exp(-cn)$: \begin{equation} \left\|\frac{1}{N} A^T A - \sigma I \right\| \leq
    (C+1)\sqrt{\frac{n}{N}} \end{equation}

It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2
-\sigma \right| \leq (C+1)\sqrt{\frac{n}{N}}$, hence $\|A x\|^2_2 \geq N\left(\sigma
- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$.
\end{proof}
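
As a quick numerical sanity check of Corollary~\ref{cor:number_of_rows_needed}
(a sketch only: the dimensions, the oversampling factor and the number of trials
below are arbitrary choices, not the constants of the corollary), the following
Python snippet estimates how often a tall Bernoulli$(k/n)$ matrix has full
column rank:
\begin{verbatim}
# Sketch: empirically estimate the probability that an N x n matrix with
# i.i.d. Bernoulli(k/n) entries has full column rank. The oversampling
# factor and trial count are illustrative, not the corollary's constants.
import numpy as np

def full_column_rank_freq(n=50, k=5, oversample=4, trials=200, seed=0):
    rng = np.random.default_rng(seed)
    N = oversample * n
    hits = 0
    for _ in range(trials):
        A = (rng.random((N, n)) < k / n).astype(float)
        hits += int(np.linalg.matrix_rank(A) == n)
    return hits / trials

print(full_column_rank_freq())  # typically 1.0 for these parameters
\end{verbatim}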

\subsection{More Direct Approach}

Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
uniformly at random among all binary rows of length $n$. A standard counting
argument shows that the number of non-singular $n\times n$ matrices over $\mathbb{F}_2$
is:
\begin{displaymath}
    N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
    = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}

Hence the probability that our random matrix $A$ is invertible is:
\begin{displaymath}
    P_n  = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
It is easy to see that $P_n$ is a decreasing sequence. Its limit is
$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have
$\phi(\frac{1}{2})\approx
0.289$ and $P_n$ converges
exponentially fast to this constant (one way to see this is to use the
Pentagonal number theorem).

Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the
probability that they have full column rank is at least
\begin{displaymath}
    P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell},
\end{displaymath}
since it suffices that one of the $\ell$ disjoint blocks of $n$ rows forms an
invertible $n\times n$ matrix, and each block is invertible with probability
$P_n \geq \phi(\frac{1}{2})$.

Note that this is the probability of having full column rank over
$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
column rank over $\mathbb{R}$.
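
The quantities above are easy to compute numerically; as a minimal sketch (the
truncation of the infinite product at 100 factors is an arbitrary choice), the
following Python snippet evaluates $P_n$, its limit $\phi(\tfrac{1}{2})$, and
the lower bound $1-(1-\phi(\tfrac{1}{2}))^{\ell}$:
\begin{verbatim}
# Sketch: P_n = prod_{i=1}^n (1 - 2^{-i}), its limit phi(1/2) ~ 0.2888, and
# the lower bound on the probability of full column rank after l*n rows.
from functools import reduce

def P(n):
    return reduce(lambda acc, i: acc * (1 - 0.5 ** i), range(1, n + 1), 1.0)

phi_half = P(100)  # the product converges exponentially fast
print(P(5), P(20), phi_half)
for l in (1, 2, 5, 10):
    print(l, 1 - (1 - phi_half) ** l)
\end{verbatim}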

\paragraph{TODO:} Study the case where we only observe sets of size exactly
$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above
by:
\begin{displaymath}
{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j}
\end{displaymath}

Thinking about it, why do we assume that the sample sets are of size exactly
$k$? I think it would make more sense from the learning perspective to consider
uniformly random sets, in which case the above approach allows us to conclude
directly.

More generally, I think the “right” way to think about this is to look at $A$
as the adjacency matrix of a random $k$-regular graph. There are many
results on the probability that the adjacency matrix of such a graph is non-singular.

\section{Examples}

\subsection{Generic Functions}

\paragraph{TODO:} Add citations!

These are generic examples and serve as building blocks for the applications
below:
\begin{itemize}
    \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave
        $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note
        that $f$ is monotone iff $g$ is monotone. When $g$ is monotone, it does not
        matter for the purpose of optimization: the sets are ranked in the same order
        with or without $g$, so the only things that matter are the weights, which
        serve as natural parameters for this class of functions. This class of
        functions contains:
        \begin{itemize}
            \item additive functions (when $g$ is the identity function).
            \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the
                case where the weights are $0/1$ and $g$ is the identity
                function.
            \item symmetric submodular functions: when the weights are all one.
            \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for
                some $B$.
        \end{itemize}
    \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in
        \reals_+^{|N|}$. This class of functions is also naturally parametrized
        by the weights.
    \item (weighted) matroid rank functions. Given a matroid $M$ over a ground
        set $N$, we define its rank function to be:
        \begin{displaymath}
            \forall S\subseteq N,\;
            r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I|
        \end{displaymath}
        More generally, given a weight function $w:N\to\reals_+$, we define the
        weighted matroid rank function:
        \begin{displaymath}
            \forall S\subseteq N,\;
            r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i
        \end{displaymath}
\end{itemize}

\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid
rank function for the $1$-uniform matroid (the matroid where the independent
sets are the sets of size at most one).
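
As a minimal illustration of these building blocks (the ground set, the
weights, the concave $g$ and the budget $B$ below are made-up placeholders),
here is a short Python sketch:
\begin{verbatim}
# Sketch of the generic building blocks as plain Python set functions.
# The ground set, weights, concave g and budget B are made-up placeholders.
import math

w = {1: 3.0, 2: 1.0, 3: 2.5, 4: 0.5}  # hypothetical weights on N = {1, 2, 3, 4}

def concave_over_modular(S, g=math.sqrt):
    """f(S) = g(sum_{i in S} w_i) for a concave g (additive when g = identity)."""
    return g(sum(w[i] for i in S))

def budget_additive(S, B=4.0):
    """Special case g = min(B, .)."""
    return min(B, sum(w[i] for i in S))

def max_weight(S):
    """f(S) = max_{i in S} w_i: the weighted rank of the 1-uniform matroid."""
    return max((w[i] for i in S), default=0.0)

print(concave_over_modular({1, 3}), budget_additive({1, 3}), max_weight({1, 3}))
\end{verbatim}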

\subsection{Applications}

\begin{itemize}
    \item \emph{Coverage functions:} they can be written as a positive linear
        combination of matroid rank functions:
        \begin{displaymath}
            f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S)
        \end{displaymath}
        where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset,
        \{u\}\big\}$.
    \item \emph{Facility location:} (cite Bilmes) there is a universe
        $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each
        pair of locations.  We pick a subset of locations $S$ and each point in
        the universe is allocated to its closest location (the one with highest
        proximity):
        \begin{displaymath}
            f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v}
        \end{displaymath}
        This can be seen as a sum of weighted matroid rank functions: one for
        each location in the universe associated with a $1$-uniform matroid
        (other applications: job scheduling).

    \item \emph{Image segmentation:} (cite Jegelka) can in some cases be
        written as a graph cut function; for image segmentation, the goal is to
        minimize the cut:
        \begin{displaymath}
            f(S) = \sum_{e\in E} w_e c_e(S)
        \end{displaymath}
        where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think
        this can be written as a matroid rank function.
    \item \emph{Learning/value of information}: (cite Krause) there is
        a hypothesis $A$ (a random variable) which is “refined” when more
        observations are made.  Imagine that there is a finite set $X_1,\dots,
        X_n$ of possible observations (random variables). Then, assuming that
        the observations are independent conditioned on $A$, the information
        gain:
        \begin{displaymath}
            f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big)
        \end{displaymath}
        is submodular. The $\log\det$ is the specific case of a linear
        hypothesis observed with additional independent Gaussian noise.
        This can be written as multivariate concave over modular
        (\textbf{TODO:} I think multivariate concave over modular is not
        submodular in general, it is for $\log\det$. Understand this better).
    \item \emph{Data subset selection/summarization:} in statistical machine
        translation, Bilmes used sums of concave over modular functions:
        \begin{displaymath}
            f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right)
        \end{displaymath}
        where each $f$ represents a feature, $w_f(e)$ represents how much of
        $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when
        we have a lot of a given feature.
        Facility location functions are also commonly used for subset selection.
\end{itemize}
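
As a minimal sketch of the first two applications (the universe, weights and
proximity scores below are toy values), coverage and facility location can be
written directly as sums of (weighted) matroid rank terms:
\begin{verbatim}
# Sketch: coverage and facility location as sums of (weighted) rank terms.
# The universe, weights and proximity scores are made-up toy values.
universe = ["u1", "u2", "u3"]
covers = {1: {"u1", "u2"}, 2: {"u2"}, 3: {"u3"}}   # elements covered by each item
w_u = {"u1": 1.0, "u2": 2.0, "u3": 0.5}
prox = {"u1": {1: 0.9, 2: 0.1, 3: 0.0},
        "u2": {1: 0.2, 2: 0.8, 3: 0.1},
        "u3": {1: 0.0, 2: 0.3, 3: 0.7}}

def coverage(S):
    return sum(w_u[u] for u in universe if any(u in covers[i] for i in S))

def facility_location(S):
    return sum(max((prox[u][i] for i in S), default=0.0) for u in universe)

print(coverage({1, 3}), facility_location({1, 3}))
\end{verbatim}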

\section{Passive Optimization}

Let $\Omega$ be the universe of elements and $f$ a function defined on subsets
of $\Omega$: $f : S \in 2^{\Omega} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a
collection of sets in $2^{\Omega}$, which we call the \emph{constraints}. Let
$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by
$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often
(but not always) equal to $|\Omega|$.

In general, we say we can efficiently optimize a function $f$ under constraint
$K$ when we have a polynomial-time algorithm making adaptive value queries to
$f$ which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
with high probability, where $\alpha$ is an absolute constant.

Here, we consider the scenario where we cannot make adaptive value queries, and
in fact, where we cannot make queries at all! Instead, we suppose that we
observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is drawn
from a known distribution $D$. We say we can efficiently \emph{passively
optimize} $f$ under distribution $D$, or $D$-optimize $f$ under constraints $K$,
when, after observing ${\cal O}(L^c)$ set-value pairs from $D$, where $c > 0$ is
an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
\geq \alpha f(S^*)$ with high probability, where $\alpha$ is an absolute constant.

In the case of \emph{passive} observations of set-value pairs under a
distribution $D$ for a function $f$, recent research has focused on whether we
can efficiently and approximately \emph{learn} $f$. This was formalized in the
PMAC model from \cite{balcan2011learning}. When thinking about passive
optimization, it is necessary to understand the link between being able to
$D$-PMAC learn $f$ and being able to $D$-optimize $f$.

\subsection{Additive function}

We consider here the simple case of additive functions. A function $f$ is
additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that
$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes
adopt the notation $w \equiv f(\Omega)$.

\subsubsection{Inverting the system}\label{subsec:invert_the_system}

Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{j \in N}$ with
$S_j \sim D$, where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$ whose $j$-th row
is $A_j = \chi_{S_j}$, the indicator vector of set $S_j$. Let $B$ be the $n^c
\times 1$ vector such that $\forall j, B_j \equiv f(S_j)$. It is easy to see
that if $w$ is the corresponding $n \times 1$ weight vector for $f$ then:
\begin{displaymath}
  A w = B
\end{displaymath}

Note that if $A$ has full column rank, then we can solve for $w$ exactly and
also optimize $f$ under any cardinality constraint. We can therefore cast the
question of $D$-learning and $D$-optimizing $f$ as a random matrix theory
problem: what is the probability that, after $n^c$ independent samples from $D$
for some $c > 0$, the matrix $A$ has full column rank? See
Section~\ref{sec:matrix_theory}.
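
As a minimal sketch of this approach (with $D$ taken, for illustration only, to
be i.i.d.\ Bernoulli$(\tfrac{1}{2})$ membership for each element, and with
arbitrary sizes $n$, $N$), the following Python snippet recovers the weights by
solving the linear system and then optimizes under a cardinality constraint:
\begin{verbatim}
# Sketch: recover the weights of an additive f from passive observations by
# solving A w = B, then optimize under |S| <= k. The choice D = i.i.d.
# Bernoulli(1/2) memberships and the sizes n, N, k are illustrative.
import numpy as np

rng = np.random.default_rng(0)
n, N, k = 20, 200, 5
w_true = rng.random(n)

A = (rng.random((N, n)) < 0.5).astype(float)  # rows are indicator vectors chi_S
B = A @ w_true                                # observed values f(S_j)

if np.linalg.matrix_rank(A) == n:             # full column rank: exact recovery
    w_hat, *_ = np.linalg.lstsq(A, B, rcond=None)
    top_k = np.argsort(w_hat)[-k:]            # optimal set under |S| <= k
    print(np.allclose(w_hat, w_true), sorted(top_k.tolist()))
\end{verbatim}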

\paragraph{Extension}
Note that the previous reasoning can easily be extended to any \emph{almost}
additive function. Consider a function $g$ such that there exists $\alpha > 0$
and $\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S)
\leq g(S) \leq \beta f(S)$. Then, by solving $\max_{S\in K} f(S)$, we obtain an
$\alpha/\beta$-approximation to the optimum of $g$ since:

\begin{displaymath}
g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
\frac{\alpha}{\beta} g(OPT)
\end{displaymath}

where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
considering a function $g$ such that there exists $\alpha, \beta >0$, an
additive function $f$ and a bijective univariate function $\phi$, such that
$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we solve the system $A w = \phi^{-1}(B)$ and obtain once again an $\alpha/\beta$-approximation to the optimum of $g$.

\begin{remark}
Note that here $D$-optimizing $f$ is easy because $D$-learning $f$ is easy. We
would like to understand whether being able to $D$-learn $f$ is really
necessary to $D$-optimize it. In fact, many results for PMAC-learning more
complex functions, such as general submodular functions, are negative. Can we
hope to find positive results in cases where PMAC-learning is impossible?
\end{remark}

\subsubsection{Average weight method}

We consider here another method for $D$-optimizing $f$, which only requires
$D$-learning $f$ approximately. Note that for every element $i \in \Omega$, and
for a \emph{product} distribution $D$:
\begin{displaymath}
  \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin
  S) = \sum_{j \in \Omega,\, j \neq i} w_j \left(\mathbb{P}(j\in S|i \in S) -
  \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i
\end{displaymath}

Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and
let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S
\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator:
\begin{displaymath}
  \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|}
  \sum_{S \in O_i^c} f(S)
\end{displaymath}

If $D$ is a product distribution such that $\exists c > 0, \forall i,
\mathbb{P}(i \in S) \geq c$, it is easy to show that $\forall i \in
\Omega,\ \hat W_i \rightarrow w_i$ as $|O| \rightarrow +\infty$.
By using standard concentration bounds, we can hope to obtain, within
$\mathrm{poly}(n, \frac{1}{\epsilon})$ observations, estimates $\hat W_i$ within a
multiplicative factor $(1 \pm \epsilon)$ of the true weights $w_i$.
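
A minimal sketch of this estimator (again with $D$ taken, for illustration
only, to be i.i.d.\ Bernoulli$(\tfrac{1}{2})$ memberships, and with arbitrary
sizes):
\begin{verbatim}
# Sketch of the average weight estimator \hat W_i under a product distribution
# (here each element is included independently with probability 1/2).
import numpy as np

rng = np.random.default_rng(1)
n, N = 10, 5000
w_true = rng.random(n)

memberships = rng.random((N, n)) < 0.5   # row j is the indicator vector of S_j
values = memberships @ w_true            # observed values f(S_j)

W_hat = np.empty(n)
for i in range(n):
    in_i = memberships[:, i]
    # assumes both O_i and O_i^c are non-empty, which holds w.h.p. for large N
    W_hat[i] = values[in_i].mean() - values[~in_i].mean()

print(np.max(np.abs(W_hat - w_true)))    # small for large N
\end{verbatim}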


%For every node
%$i$, we compute the \emph{average weight of every set containing element $i$}. Let $w_i$ be
%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and
%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim
%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$

%Note that the average weight of every set containing element $i$ preserves the
%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in
%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator:

%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S |
%i \in S} \frac{f(S) - pw}{1-p} \end{equation}

%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's
%lemma:

%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle|  \geq
%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
%\end{equation}

%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look
%at additive bounds for these zeros.}

%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
%assumption, if we choose the $k$ elements with largest estimated weight
%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
%where OPT is the value of the maximum weight set of $k$ elements for the
%function $f$. To ensure that $N_i$ is sufficiently large for each element, we
%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
%classic union bound:

%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}

%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$

%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in
  %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
  %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
  %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
  %probability $XXX$.  \end{proposition}

\subsection{General submodular functions}

\subsection{Parametric submodular functions}

\textbf{Note:} all known examples of submodular functions are parametric. The
question is not parametric vs non-parametric but whether or not the number of
parameters is polynomial or exponential in $n$ (the size of the ground set).
For all examples I know except matroid rank functions, it seems to be
polynomial.

\subsubsection{Influence Functions}

Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we
can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$.
Let $m$ be the number of edges of $G$ with non-zero weight. Let $\sigma_G(S, p)$ be the
influence of the set $S \subseteq V$ in $G$ under the IC model with parameters
$p$.

Recall from \cite{Kempe:03} that:
\begin{equation}
    \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big]
\end{equation}
where $G_p$ is a random graph in which each edge $(u,v)\in E$ appears independently with
probability $p_{u,v}$, and $r_{G_p}(S\leadsto v)$ is the indicator variable of
the event \emph{there exists a path from $S$ to $v$ in $G_p$}.

\begin{claim}
\label{cla:oracle}
If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v}
- \frac{1}{\alpha m}$, then:
\begin{displaymath}
    \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1
    - 1/\alpha) \sigma_{G}(S,p)
\end{displaymath}
\end{claim}

\begin{proof}
    We define two coupled random graphs $G_p$ and $G_{p'}$ as follows: for each
    edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in
    [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff
    $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$).
    It is clear from the construction that each edge $(u,v)$ will be present in
    $G_p$ (resp. $G_{p'}$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence:
    \begin{displaymath}
        \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big]
        \text{  and  }
        \sigma_G(S, p') = \sum_{v\in V} \P_{G_{p'}}\big[r_{G_{p'}}(S\leadsto v)\big]
    \end{displaymath}

    By construction, $G_{p'}$ is always a subgraph of $G_p$, i.e.\
    $r_{G_{p'}}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand
    side of the Claim's inequality.

    The probability that an edge is present in $G_p$ but not in $G_{p'}$ is at
    most $\frac{1}{\alpha m}$, so with probability at least $\left(1-\frac{1}{\alpha
    m}\right)^m$, $G_p$ and $G_{p'}$ have the same set of edges. This implies that:
    \begin{displaymath} \P_{G_{p'}}\big[r_{G_{p'}}(S\leadsto v)\big]\geq
    \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big]
  \end{displaymath}
    which proves the right-hand side of the claim after observing that
    $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality
    iff $m=1$.
\end{proof}

We can use Claim~\ref{cla:oracle} to obtain a constant-factor approximation
algorithm for maximising influence on $G$ by maximising influence on an
approximate graph $G'$. For $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S, p)$ and
$\sigma_{G}^* = \sigma_G(O_k, p)$, where we omit the $k$ when it is unambiguous.


\begin{proposition}
\label{prop:approx_optim}
Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$
and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq  \frac{1}{\alpha m}$.
Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that
$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$.
\end{proposition}

\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p_{u,v} = p'_{u,v} -
  \frac{1}{\alpha m}$, so that $p_{u,v} \geq \hat p_{u,v} \geq p_{u,v} -
  \frac{2}{\alpha m}$: we are now in the conditions of Claim~\ref{cla:oracle}
  with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by
  greedy maximisation on the graph $\hat G$ with parameters $\hat p$. It is a
  classic exercise to show then that $\sigma_G(\hat O_k) \geq (1 -
  e^{\frac{2}{\alpha} - 1})\sigma^*_G$ (see Pset 1, CS284r).
\end{proof}

A small note on the approximation factor: it is positive only for $\alpha > 2$.
Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq
\frac{1}{2}$, and that the factor converges to $1 - 1/e$ as $\alpha \to \infty$,
which is the best possible polynomial-time approximation ratio for influence
maximisation unless $P = NP$. Also note that in the limit of large $m$, $(1
-\frac{1}{\alpha m})^m \rightarrow \exp(-1/\alpha)$ and the approximation ratio
goes to $1 - \exp(-\exp(-2/\alpha))$.
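
As a minimal sketch of the two ingredients used above, Monte Carlo estimation
of $\sigma_G(S, p)$ through the random live-edge graph $G_p$ and greedy
maximisation on a (possibly perturbed) parameter vector, the following Python
snippet uses a toy graph and arbitrary sample counts:
\begin{verbatim}
# Sketch: Monte Carlo estimate of sigma_G(S, p) via the random live-edge graph
# G_p, and greedy maximisation. The toy graph, edge probabilities and sample
# count are arbitrary choices.
import random
from collections import defaultdict

nodes = {0, 1, 2, 3}
p = {(0, 1): 0.4, (1, 2): 0.3, (0, 2): 0.2, (2, 3): 0.5}   # p_{u,v}

def reachable(live, S):
    seen, stack = set(S), list(S)
    while stack:
        u = stack.pop()
        for v in live.get(u, ()):
            if v not in seen:
                seen.add(v)
                stack.append(v)
    return seen

def sigma(S, probs, samples=2000, seed=0):
    rng = random.Random(seed)
    total = 0
    for _ in range(samples):
        live = defaultdict(list)                 # sample G_p edge by edge
        for (u, v), puv in probs.items():
            if rng.random() < puv:
                live[u].append(v)
        total += len(reachable(live, S))         # nodes reachable from S
    return total / samples

def greedy(probs, k):
    S = set()
    for _ in range(k):
        S.add(max(nodes - S, key=lambda v: sigma(S | {v}, probs)))
    return S

S_hat = greedy(p, 2)
print(S_hat, sigma(S_hat, p))
\end{verbatim}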

\subsubsection{Active set selection of stationary Gaussian Processes}

\section{Passive Optimization vs. Passive Learning}

\subsection{Failed Attempt: returning max of observations}

This doesn't work; give examples as to why! Remember that there are strong
concentration results for submodular functions: look at the expected value of
the observed sets.

\subsection{Example where optimization possible, learning impossible}

Recall the matroid construction from~\cite{balcan2011learning}:
\begin{theorem}
  For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
  \subseteq2^{[n]}$ and a family of matroids $\cal{M} = \{M_{\cal{B}} :
  \cal{B} \subseteq\cal{A} \}$ such that:
  \begin{itemize}
    \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in \cal{A}$
    \item For every $\cal{B} \subseteq\cal{A}$ and every $A \in \cal{A}$, we
      have:
      \begin{align*}
        \text{rank}_{M_{\cal{B}}}(A) & = 8 \log k && \text{if } A\in{\cal B} \\
                                     & = |A| && \text{if } A\in {\cal A}\backslash{\cal B}
      \end{align*}
  \end{itemize}
\end{theorem}

Consider the following subset of the above family of matroids: ${\cal
M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal
A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$.
Consider an \emph{unknown} function $f$, corresponding to the rank function of
one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$.  Note that as long
as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
optimize $f$ exactly! Indeed, under the cardinality constraint $|S| \leq n^{1/3}$
we have $f(S) \leq |S| \leq n^{1/3}$, and any $A \in {\cal A}\backslash{\cal B}$
attains this bound since $f(A) = |A| = n^{1/3}$.

One example of a distribution under which this occurs with probability at least
a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$.  For
$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$,
we will observe at least one set from $\cal{A}\backslash\cal{B}$ with
constant probability:

\begin{equation} \nonumber \mathbb{P}\big(\exists i:\ S_i \in {\cal
A}\backslash{\cal B}\big) \geq 1 - (1 - \epsilon)^{n^c} \geq 1 - e^{-\epsilon n^c}
\end{equation}

However, it follows from the analysis of~\cite{balcan2011learning} that we
cannot learn $f$ under any distribution, even with active value queries!
Indeed, for any polynomially-sized set of observations, there exists a
super-polynomial number of functions in ${\cal M}^1$ which coincide on this
set of observations, but which take very different values outside of this set
of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal
A}\backslash {\cal B}$.

{\bf TODO:} A cleaner, simpler example would be nice.



\bibliography{optimize} \bibliographystyle{apalike}


\end{document}