\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm, bbm}
\usepackage[utf8x]{inputenc}
\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\ints}{\mathbb{N}}
\renewcommand{\O}{\mathcal{O}}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\newtheorem{claim}{Claim}
\newtheorem{remark}{Remark}
\title{Learn and/or Optimize}
\author{}
\date{}
\begin{document}
\maketitle
\section{Preliminary Results}
\label{sec:matrix_theory}
\subsection{Generic Random Matrix Theory}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
\begin{proposition}\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
$\Sigma$. Then for every $t \geq 0$, the following inequality holds with
probability at least $1 - 2 \exp(-ct^2)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2)
\quad\text{where}\quad \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
\end{equation}
where $C$, $c$ depend only on the sub-gaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}
The following result is a simple corollary of
Proposition~\ref{prop:non_isotropic_isometry}:
\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
$k \in (0,n)$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
independent random vectors in $\mathbb{R}^n$ with i.i.d.\ Bernoulli entries such that
$\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let
$\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. If $N > {(C+1)}^2
n/\sigma^2$, then the matrix $A$ has full column rank with probability at least $1 -
e^{-cn}$, where $C, c$ are constants depending only on the sub-gaussian norm
of the rows $K \equiv \sup_{p \geq 1}
{\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$.\footnote{Is this true?
And how do the constants behave?} \end{corollary}
\begin{proof} Comparing the kernels of $A$ and $A^TA$ shows
that $\operatorname{rank}(A) = \operatorname{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it
follows that if $A^TA$ is invertible, then $A$ has full column rank. In other
words, if the smallest singular value satisfies $\sigma_{\min}(A) > 0$, then $A$
has full column rank. Apply Prop.~\ref{prop:non_isotropic_isometry} with $t
\equiv \sqrt{n}$ and $N > {(C+1)}^{2} n$; then with probability $1 -
2\exp(-cn)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \sigma I \right\| \leq (C+1)\sqrt{\frac{n}{N}}
\end{equation}
It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2
-\sigma \right| \leq (C+1)\sqrt{\frac{n}{N}}$, hence $\|A x\|^2_2 \geq N\left(\sigma
- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$.
\end{proof}
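The following is a minimal numerical sanity check of
Corollary~\ref{cor:number_of_rows_needed}; it is only a sketch assuming
\texttt{numpy}, and since the constants $C$, $c$ are not known explicitly, the
multiplier in front of $n/\sigma^2$ below is an arbitrary choice.
\begin{verbatim}
# Sanity check for the corollary: a tall random Bernoulli(k/n) matrix should
# have full column rank with overwhelming probability. The multiplier is a
# stand-in for the unknown (C+1)^2.
import numpy as np

def full_column_rank_frequency(n=50, k=5, multiplier=2, trials=50, seed=0):
    rng = np.random.default_rng(seed)
    p = k / n
    sigma = p * (1 - p)                      # the sigma of the corollary
    N = int(multiplier * n / sigma**2)       # number of observed rows
    successes = 0
    for _ in range(trials):
        A = (rng.random((N, n)) < p).astype(float)
        if np.linalg.matrix_rank(A) == n:    # full column rank?
            successes += 1
    return successes / trials

print(full_column_rank_frequency())          # expected to be very close to 1.0
\end{verbatim}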
\subsection{More Direct Approach}
Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
uniformly at random among all binary rows of length $n$. A standard counting
argument shows that the number of non-singular $n\times n$ matrices over $\mathbb{F}_2$
is:
\begin{displaymath}
N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
= (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
Hence the probability that our random matrix $A$ is invertible is:
\begin{displaymath}
P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
It is easy to see that $P_n$ is a decreasing sequence. Its limit is
$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have
$\phi(\frac{1}{2})\approx
0.289$ and $P_n$ converges
exponentially fast to this constant (one way to see this is to use the
Pentagonal number theorem).
Hence, if we observe $\ell\cdot n$ uniformly random binary rows, we can split
them into $\ell$ blocks of $n$ rows each; the observed matrix has full column
rank as soon as one block is invertible, so the probability of full column
rank is at least:
\begin{displaymath}
1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
\end{displaymath}
Note that this is the probability of having full column rank over
$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
column rank over $\mathbb{R}$.
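A short numerical sketch of the computation above (assuming \texttt{numpy};
the values of $n$, $\ell$ and the number of trials are arbitrary): it
evaluates $P_n$, compares it to $\phi(1/2)\approx 0.2888$, and estimates the
probability that $\ell\cdot n$ uniform binary rows have full column rank over
$\mathbb{F}_2$.
\begin{verbatim}
# P_n = prod_{i=1}^n (1 - 2^{-i}) decreases quickly to the Euler-function
# value phi(1/2) ~ 0.2888; rank over F_2 is computed by elimination mod 2.
import numpy as np

def P(n):
    return np.prod([1.0 - 2.0**(-i) for i in range(1, n + 1)])

def rank_mod2(A):
    A = A.copy() % 2
    rank, rows, cols = 0, A.shape[0], A.shape[1]
    for col in range(cols):
        pivot = next((r for r in range(rank, rows) if A[r, col]), None)
        if pivot is None:
            continue
        A[[rank, pivot]] = A[[pivot, rank]]      # move the pivot row up
        for r in range(rows):
            if r != rank and A[r, col]:
                A[r] ^= A[rank]                  # eliminate mod 2
        rank += 1
    return rank

print(P(10), P(30))                              # both close to 0.2888
rng = np.random.default_rng(0)
n, l, trials = 20, 3, 500
freq = np.mean([rank_mod2(rng.integers(0, 2, size=(l * n, n))) == n
                for _ in range(trials)])
print(freq, 1 - (1 - 0.2888) ** l)               # empirical frequency vs. bound
\end{verbatim}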
\paragraph{TODO:} Study the case where we only observe sets of size exactly
$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above
by:
\begin{displaymath}
{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j}
\end{displaymath}
Thinking about it, why do we assume that the sample sets are of size exactly
$k$? I think it would make more sense from the learning perspective to consider
uniformly random sets. In this case, the above approach allows us to conclude
directly.
More generally, I think the ``right'' way to think about this is to look at $A$
as the adjacency matrix of a random $k$-regular graph. There are many
results about the probability that such an adjacency matrix is non-singular.
\section{Examples}
\subsection{Generic Functions}
\paragraph{TODO:} Add citations!
These are generic examples and serve as building blocks for the applications
below:
\begin{itemize}
\item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave
$g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note
    that $f$ is monotone iff $g$ is monotone. When $g$ is non-decreasing, $g$ does not
    matter for the purpose of optimization: it preserves the ordering of the
    sets, so the only things that matter are the weights, which
    serve as natural parameters for this class of functions. This class of
    functions contains:
\begin{itemize}
\item additive functions (when $g$ is the identity function).
\item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the
case where the weights are $0/1$ and $g$ is the identity
function.
        \item symmetric functions, i.e., functions of the cardinality only
            ($f(S) = g(|S|)$): when the weights are all one.
\item budget-additive functions, when $g:x\mapsto \min(B, x)$ for
some $B$.
\end{itemize}
\item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in
\reals_+^{|N|}$. This class of functions is also naturally parametrized
by the weights.
\item (weighted) matroid rank functions. Given a matroid $M$ over a ground
set $N$, we define its rank function to be:
\begin{displaymath}
\forall S\subseteq N,\;
r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I|
\end{displaymath}
    More generally, given a weight function $w:N\to\reals_+$, we define the
    weighted matroid rank function:
\begin{displaymath}
\forall S\subseteq N,\;
r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i
\end{displaymath}
\end{itemize}
\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid
rank function for the $1$-uniform matroid (the matroid where the independent
sets are the sets of size at most one).
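The following small sketch evaluates some of the building blocks above on a
toy ground set; the weights and the choice $g = \sqrt{\cdot}$ are arbitrary
illustrative values.
\begin{verbatim}
# Concave-over-modular, max-weight (weighted rank of the 1-uniform matroid),
# and budget-additive functions on a toy ground set.
import math

weights = {"a": 3.0, "b": 1.0, "c": 2.0, "d": 0.5}

def concave_over_modular(S, w=weights, g=math.sqrt):
    """f(S) = g(sum_{i in S} w_i) for a concave g (here sqrt)."""
    return g(sum(w[i] for i in S))

def max_weight(S, w=weights):
    """f(S) = max_{i in S} w_i; weighted rank of the 1-uniform matroid."""
    return max((w[i] for i in S), default=0.0)

def budget_additive(S, w=weights, B=4.0):
    """f(S) = min(B, sum_{i in S} w_i); the special case g = min(B, .)."""
    return min(B, sum(w[i] for i in S))

S = {"a", "c"}
print(concave_over_modular(S), max_weight(S), budget_additive(S))
\end{verbatim}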
\subsection{Applications}
\begin{itemize}
\item \emph{Coverage functions:} they can be written as a positive linear
combination of matroid rank functions:
\begin{displaymath}
f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S)
\end{displaymath}
        where $c_u(S)$ is one iff $u$ is covered by some set in $S$; this is
        the rank function of a matroid whose independent sets are $\emptyset$
        and the singletons that cover $u$.
\item \emph{Facility location:} (cite Bilmes) there is a universe
$\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each
pair of locations. We pick a subset of locations $S$ and each point in
the universe is allocated to its closest location (the one with highest
proximity):
\begin{displaymath}
f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v}
\end{displaymath}
This can be seen as a sum of weighted matroid rank functions: one for
each location in the universe associated with a $1$-uniform matroid
(other applications: job scheduling).
\item \emph{Image segmentation:} (cite Jegelka) can be (in some cases)
written as a graph cut function. For image segmentation the goal is to
minimize the cut.
\begin{displaymath}
f(S) = \sum_{e\in E} w_e c_e(S)
\end{displaymath}
where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think
this can be written as a matroid rank function.
    \item \emph{Learning:} (cite Krause) there is
        a hypothesis $A$ (a random variable) which is ``refined'' when more
observations are made. Imagine that there is a finite set $X_1,\dots,
X_n$ of possible observations (random variables). Then, assuming that
the observations are independent conditioned on $A$, the information
gain:
\begin{displaymath}
f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big)
\end{displaymath}
        is submodular. The $\log\det$ is the specific case of a linear
        hypothesis observed with additive independent Gaussian noise.
    \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots,
        X_n)$ are random variables, then
        $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots,
        X_n)$ are jointly Gaussian with covariance matrix $\Sigma$, then:
        \begin{displaymath}
            f(S) = c|S| + \frac{1}{2}\log\det \Sigma_S
        \end{displaymath}
        for $c = \frac{1}{2}\log(2\pi e)$, where $\Sigma_S$ is the principal
        submatrix of $\Sigma$ indexed by $S$, and we fall back to the usual
        $\log\det$ function.
    \item \emph{Data subset selection/summarization:} in statistical machine
        translation, Bilmes used sums of concave over modular functions:
\begin{displaymath}
f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right)
\end{displaymath}
where each $f$ represents a feature, $w_f(e)$ represents how much of
$f$ element $e$ has, and $\phi$ captures decreasing marginal gain when
we have a lot of a given feature.
Facility location functions are also commonly used for subset selection.
    \item \emph{Concave spectral functions:} One would be tempted to say that
        any multivariate concave function of a modular function is submodular,
        which would be the natural generalization of concave over modular.
        However, \textbf{this is not true in general}. A possible nice
        generalization is the following. Let $M$ be a symmetric $n\times
        n$ matrix and let $g$ be a ``matrix concave'' function. Then:
        \begin{displaymath}
            f(S) = \mathrm{Tr}\big(g(M_S)\big)
        \end{displaymath}
        is submodular, where $M_S$ is the principal submatrix of $M$ indexed
        by $S$. This contains the $\log\det$ (when $g$ is the matrix
        $\log$) but also many other functions (such as the quantum entropy).
In summary, the two most general classes of submodular functions (which capture
all the examples we know of) are: sums of matrix concave functions and sums
of weighted matroid rank functions. Sums of concave over modular functions are
also a nice starting point if we want a simpler example.
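As a concrete illustration of the facility location example above (a sum of
weighted $1$-uniform matroid rank functions), here is a small sketch together
with the standard greedy algorithm for cardinality-constrained maximization;
the proximity scores are random toy values.
\begin{verbatim}
# Facility location f(S) = sum_u max_{v in S} s[u][v], plus plain greedy
# maximization under a cardinality constraint.
import random

random.seed(0)
n_locations, n_universe = 8, 20
s = [[random.random() for _ in range(n_locations)] for _ in range(n_universe)]

def facility_location(S):
    if not S:
        return 0.0
    return sum(max(s[u][v] for v in S) for u in range(n_universe))

def greedy(f, ground_set, k):
    """Pick k elements, each time adding the one with largest marginal gain."""
    S = []
    for _ in range(k):
        best = max((e for e in ground_set if e not in S),
                   key=lambda e: f(S + [e]) - f(S))
        S.append(best)
    return S

S = greedy(facility_location, list(range(n_locations)), k=3)
print(S, facility_location(S))
\end{verbatim}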
\section{Passive Optimization}
Let $\Omega$ be the universe of elements and $f$ a function defined on subsets
of $\Omega$: $f : 2^{\Omega} \to \mathbb{R}$. Let $K \subseteq 2^{\Omega}$ be a
collection of subsets of $\Omega$, which we call the \emph{constraint}. Let
$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by
$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often
(but not always) equal to $|\Omega|$.
In general, we say we can efficiently optimize a function $f$ under constraint
$K$ when we have a polynomial-time algorithm making adaptive value queries to
$f$, which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
with high probability, for an absolute constant $\alpha$.
Here, we consider the scenario where we cannot make adaptive value queries, and
in fact, where we cannot make queries at all! Instead, we suppose that we
observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is drawn
from a known distribution $D$. We say we can efficiently \emph{passively
optimize} $f$ under distribution $D$, or $D$-optimize $f$ under constraint $K$,
when, after observing ${\cal O}(L^c)$ set-value pairs from $D$, where $c > 0$ is
an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
\geq \alpha f(S^*)$ with high probability, for an absolute constant $\alpha$.
In the case of \emph{passive} observations of set-value pairs under a
distribution $D$ for a function $f$, recent research has focused on whether we
can efficiently and approximately \emph{learn} $f$. This was formalized in the
PMAC model of \cite{balcan2011learning}. When thinking about passive
optimization, it is necessary to understand the link between being able to
$D$-PMAC-learn $f$ and being able to $D$-optimize $f$.
\subsection{Additive functions}
We consider here the simple case of additive functions. A function $f$ is
additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that
$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes
write $f(\Omega) = \sum_{s \in \Omega} w_s$ for the total weight.
\subsubsection{Inverting the system}\label{subsec:invert_the_system}
Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{1 \leq j \leq n^c}$ with
$S_j \sim D$, where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$
where for all $j$, the row $A_j = \chi_{S_j}$, the indicator vector of set
$S_j$. Let $B$ be the $n^c \times 1$ vector such that $\forall j, B_j \equiv
f(S_j)$. It is easy to see that if $w$ is the corresponding $n \times 1$ weight
vector for $f$, then:
\begin{displaymath}
A w = B
\end{displaymath}
Note that if $A$ has full column rank, then we can solve for $w$ exactly and
also optimize $f$ under any cardinality constraint. We can therefore cast the
question of $D$-learning and $D$-optimizing $f$ as a random matrix theory
problem: what is the probability that, after $n^c$ (for $c > 0$) independent
samples from $D$, the matrix $A$ has full column rank? See
Section~\ref{sec:matrix_theory}.
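A minimal sketch of this approach (assuming \texttt{numpy}; the ground set
size, the inclusion probability and the number of observations are arbitrary
choices):
\begin{verbatim}
# Stack the indicator vectors of the observed sets into A, solve A w = B by
# least squares (exact when A has full column rank), then optimize under a
# cardinality constraint by taking the top-k recovered weights.
import numpy as np

rng = np.random.default_rng(0)
n, k, N = 30, 5, 900                    # ground set size, set size, #observations
w_true = rng.random(n)                  # hidden additive weights

A = (rng.random((N, n)) < k / n).astype(float)   # rows = indicator vectors chi_S
B = A @ w_true                                   # observed values f(S)

w_hat, *_ = np.linalg.lstsq(A, B, rcond=None)

budget = 5
S_hat = np.argsort(w_hat)[-budget:]              # top-k elements by estimated weight
S_opt = np.argsort(w_true)[-budget:]
print(np.allclose(w_hat, w_true), w_true[S_hat].sum(), w_true[S_opt].sum())
\end{verbatim}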
\paragraph{Extension}
Note that the previous reasoning can easily be extended to any \emph{almost}
additive function. Consider a function $g$ for which there exist $\alpha > 0$,
$\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S)
\leq g(S) \leq \beta f(S)$. Then, letting $S^* \in \arg\max_{S\in K} f(S)$, we have an
$\alpha/\beta$-approximation to the optimum of $g$ since:
\begin{displaymath}
g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
\frac{\alpha}{\beta} g(OPT)
\end{displaymath}
where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
considering a function $g$ for which there exist $\alpha, \beta >0$, an
additive function $f$ and an increasing bijective univariate function $\phi$, such that
$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we
solve the system $A w = \phi^{-1}(B)$ and obtain once again an
$\alpha/\beta$-approximation to the optimum of $g$.
\begin{remark}
Note that here $D$-optimizing $f$ is easy because $D$-learning $f$ is easy. We
would like to understand whether being able to $D$-learn $f$ is really
necessary for $D$-optimizing it. In fact, many results for PMAC-learning more
complex functions, such as general submodular functions, are negative. Can we
hope to find positive results in cases where PMAC-learning is impossible?
\end{remark}
\subsubsection{Average weight method}
We consider here another method for $D$-optimizing $f$, which only requires
$D$-learning $f$ approximately. Note that for every element $i \in \Omega$, and
for a \emph{product} distribution $D$:
\begin{displaymath}
    \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin
    S) = \sum_{j \in \Omega,\, j \neq i} w_j \left(\mathbb{P}(j\in S|i \in S) -
    \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i
\end{displaymath}
since, for a product distribution, $\mathbb{P}(j \in S \,|\, i \in S) =
\mathbb{P}(j \in S \,|\, i \notin S)$ for every $j \neq i$.
Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and
let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S
\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator:
\begin{displaymath}
\hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|}
\sum_{S \in O_i^c} f(S)
\end{displaymath}
If $D$ is a product distribution such that $\exists c > 0, \forall i,\
c \leq \mathbb{P}(i \in S) \leq 1 - c$, it is
easy to show that $\forall i \in
\Omega,\ \hat W_i \rightarrow w_i$ as $|O| \rightarrow +\infty$.
By using standard concentration bounds, we can hope to show that
$\mathrm{poly}(n, \frac{1}{\epsilon})$ observations suffice to estimate every
$w_i$ up to a small error, and hence to approximately optimize $f$ under a
cardinality constraint.
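A small simulation of the estimator $\hat W_i$ for an additive $f$ under a
product distribution (assuming \texttt{numpy}; all parameters below are
arbitrary):
\begin{verbatim}
# \hat W_i = mean of f(S) over observed sets containing i minus the mean over
# observed sets not containing i; for an additive f this converges to w_i.
import numpy as np

rng = np.random.default_rng(1)
n, N, p = 20, 20000, 0.3
w_true = rng.random(n)

masks = rng.random((N, n)) < p           # S_j ~ product distribution
values = masks @ w_true                  # f(S_j) for the additive f

W_hat = np.empty(n)
for i in range(n):
    in_i, out_i = values[masks[:, i]], values[~masks[:, i]]
    W_hat[i] = in_i.mean() - out_i.mean()

print(np.max(np.abs(W_hat - w_true)))    # small, roughly O(1/sqrt(N)) scale
\end{verbatim}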
%For every node $i$, we compute the \emph{average weight of every set containing
%element $i$}. Let $w_i$ be
%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and
%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim
%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$
%Note that the average weight of every set containing element $i$ preserves the
%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in
%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator:
%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S |
%i \in S} \frac{f(S) - pw}{1-p} \end{equation}
%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's
%lemma:
%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq
%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
%\end{equation}
%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look
%at additive bounds for these zeros.}
%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
%assumption, if we choose the $k$ elements with largest estimated weight
%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
%where OPT is the value of the maximum weight set of $k$ elements for the
%function $f$. To ensure that $N_i$ is sufficiently large for each element, we
%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
%classic union bound:
%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}
%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$
%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in
%N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
%XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
%$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
%probability $XXX$. \end{proposition}
\subsection{General submodular functions}
\subsection{Parametric submodular functions}
\textbf{Note:} all known examples of submodular functions are parametric. The
question is not parametric vs.\ non-parametric but whether the number of
parameters is polynomial or exponential in $n$ (the size of the ground set).
For all examples I know of, except matroid rank functions, it seems to be
polynomial.
\subsubsection{Influence Functions}
Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we
can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$.
Let $m$ be the number of edges of $G$ with non-zero weight. Let $\sigma_G(S, p)$ be the
influence of the set $S \subseteq V$ in $G$ under the IC model with parameters
$p$.
Recall from \cite{Kempe:03} that:
\begin{equation}
    \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v) = 1\big]
\end{equation}
where $G_p$ is a random graph in which each edge $(u,v)\in E$ appears independently with
probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of
the event \emph{there exists a path from $S$ to $v$ in $G_p$}.
\begin{claim}
\label{cla:oracle}
If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v}
- \frac{1}{\alpha m}$, then:
\begin{displaymath}
\forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1
- 1/\alpha) \sigma_{G}(S,p)
\end{displaymath}
\end{claim}
\begin{proof}
We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each
edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in
[0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff
$\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$).
It is clear from the construction that each edge $(u,v)$ will be present in
$G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence:
    \begin{displaymath}
        \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v) = 1\big]
        \text{ and }
        \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v) = 1\big]
    \end{displaymath}
    By construction $G_{p'}$ is always a subgraph of $G_p$, i.e.\
    $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand
    side of the Claim's inequality.
    The probability that an edge is present in $G_p$ but not in $G_p'$ is at
    most $\frac{1}{\alpha m}$, so with probability at least $\left(1-\frac{1}{\alpha
    m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that:
    \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v) = 1\big]\geq
        \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v) = 1\big]
    \end{displaymath}
which proves the right-hand side of the claim after observing that
$\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality
iff $m=1$.
\end{proof}
We can use Claim~\ref{cla:oracle} to obtain a constant-factor approximation
algorithm for maximising influence on $G$ by maximising influence on $G'$. For
$k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S, p)$ and
$\sigma_{G}^* = \sigma_G(O_k, p)$, where we omit the $k$ when it is unambiguous.
\begin{proposition}
\label{prop:approx_optim}
Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$
and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$.
Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ with $|\hat O_k| \leq k$ such that
$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$.
\end{proposition}
\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p_{u,v} = p'_{u,v} -
\frac{1}{\alpha m}$, so that $p_{u,v} \geq \hat p_{u,v} \geq p_{u,v} -
\frac{2}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle}
with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by
greedy maximisation on $\hat G$. It is a classic exercise to show that
$\sigma_G(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1})\,\sigma^*_{G}$ (see Pset 1, CS284r).
\end{proof}
A small note on the approximation factor: it is only $>0$ for $\alpha > 2$.
Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq
\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible
polynomial-time approximation ratio to influence maximisation unless $P = NP$.
Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m
\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 -
\exp(-\exp(-2/\alpha))$.
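The coupling used in the proof of Claim~\ref{cla:oracle} is easy to simulate.
The following sketch (on an arbitrary random toy graph) estimates
$\sigma_G(S,p)$ and $\sigma_G(S,p')$ from shared uniform draws and can be used
to check the inequalities of the claim empirically.
\begin{verbatim}
# Draw one uniform variable per edge and build G_p and G_{p'} from the same
# draw, then count reachable nodes to estimate the two influences.
import random
from collections import deque

random.seed(0)
n_nodes, alpha = 30, 10
edges = [(u, v) for u in range(n_nodes) for v in range(n_nodes)
         if u != v and random.random() < 0.1]
m = len(edges)
p = {e: random.uniform(0.2, 0.8) for e in edges}
p_lo = {e: p[e] - 1.0 / (alpha * m) for e in edges}   # p >= p' >= p - 1/(alpha m)

def reachable(S, live_edges):
    adj = {u: [] for u in range(n_nodes)}
    for (u, v) in live_edges:
        adj[u].append(v)
    seen, queue = set(S), deque(S)
    while queue:
        u = queue.popleft()
        for v in adj[u]:
            if v not in seen:
                seen.add(v)
                queue.append(v)
    return len(seen)

def estimate_influence_pair(S, trials=2000):
    tot_p, tot_lo = 0, 0
    for _ in range(trials):
        U = {e: random.random() for e in edges}        # shared uniforms = coupling
        tot_p += reachable(S, [e for e in edges if U[e] <= p[e]])
        tot_lo += reachable(S, [e for e in edges if U[e] <= p_lo[e]])
    return tot_p / trials, tot_lo / trials

sigma_p, sigma_lo = estimate_influence_pair(S=[0, 1, 2])
print(sigma_p, sigma_lo, (1 - 1 / alpha) * sigma_p)
# expect sigma_p >= sigma_lo >= (1 - 1/alpha) sigma_p (up to sampling noise)
\end{verbatim}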
\subsubsection{Active set selection of stationary Gaussian Processes}
\section{Passive Optimization vs. Passive Learning}
\subsection{Failed Attempt: returning max of observations}
This doesn't work. Give examples as to why! Remember that there are strong
concentration results for submodular functions; look at the expected value of
the observed sets.
\subsection{Example where optimization possible, learning impossible}
Recall the matroid construction from~\cite{balcan2011learning}:
\begin{theorem}
For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
\subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{\cal B} :
{\cal B} \subseteq {\cal A} \}$ such that:
\begin{itemize}
    \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$
    \item For every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we
        have:
        \begin{displaymath}
            \text{rank}_{M_{\cal B}}(A) =
            \begin{cases}
                8 \log k & \text{if } A \in {\cal B} \\
                |A| & \text{if } A \in {\cal A}\backslash{\cal B}
            \end{cases}
        \end{displaymath}
    \end{itemize}
\end{theorem}
Consider the following subset of the above family of matroids: ${\cal
M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal
A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$.
Consider an \emph{unknown} function $f$, corresponding to the rank function of
one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long
as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
optimize $f$ exactly! Indeed, since a rank function is bounded by the
cardinality, the largest possible value of $f$ under the cardinality constraint
$|S| \leq n^{1/3}$ is $n^{1/3}$, and it is attained by any observed $A \in
{\cal A} \backslash {\cal B}$, for which $f(A) = |A| = n^{1/3}$.
One example of a distribution under which this occurs with probability at least
a constant is $D_u$, the uniform distribution over the sets of ${\cal A}$. For
$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ with $S_i \sim D_u$,
we will observe at least one set from ${\cal A}\backslash{\cal B}$ with
probability:
\begin{equation} \nonumber \mathbb{P}\big(\exists i,\ S_i \in {\cal
    A}\backslash{\cal B}\big) \geq 1 - (1 - \epsilon)^{n^c}
\end{equation}
which tends to one for constant $\epsilon$.
However, it follows from the analysis of~\cite{balcan2011learning} that we
cannot learn $f$ under any distribution, even with active value queries!
Indeed, for any polynomially-sized set of observations, there exists a
super-polynomial number of functions in ${\cal M}^{\epsilon}$ which coincide on this
set of observations, but which take very different values outside of it:
$8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal
A}\backslash {\cal B}$.
{\bf TODO:} A cleaner, simpler example would be nice.
\section{Meeting notes: 04.03.2015}
Consider the following function:
\begin{displaymath}
g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\}
\end{displaymath}
where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where
each element is included with probability $1/2$. Then with high probability all
the samples will have size roughly $n/2$, so you will observe $g(S)=|S|$ with
high probability.
\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you
will be correct with high probability if $S$ is drawn from the same
distribution as above.
\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you
never learn anything about $X$. The set you output will necessarily be random
with respect to $X$: $\sqrt{n}|X \cap S| \approx
n^{\frac{1}{3}}\left(\frac{\sqrt{n}}{n}\right)\sqrt{n}$ and therefore $g(S) \approx
\sqrt{n}$ with high probability. However, if we had been able to output $X$, we
would have $g(X) = n^{\frac{5}{6}}$.
\paragraph{Open Question:} Cook up a similar example where $g$ is submodular and
where you are observing sets of the same size as your budget.
\paragraph{Positive results:} Try to obtain guarantees about learning
parameters of parametric submodular functions and show whether or not these
guarantees are sufficient for optimization. First look at learning the weights of
a coverage function. Maybe facility location? Sums of concave over modular are
probably too hard because of the connection to neural networks.
\paragraph{Notes}
Are you sure the example shouldn't have $n^{1/3}$ rather than $\sqrt{n}$?
Otherwise a random set $S$ will be such that $\sqrt{n}|X \cap S| \approx
\sqrt{n}\cdot\frac{\sqrt{n}}{2} = \frac{n}{2}$, which is roughly equal to $|S|$.
\section{Which learning guarantees imply optimization?}
Here, we consider the following question: which learning guarantees imply
optimization? For example, \cite{TODO} provides the following guarantee for
coverage functions:
\begin{theorem}
There exists an algorithm which, for any $\epsilon>0$, given random and
uniform examples of a coverage function $c$, outputs a hypothesis $h$, which is also
a coverage function, such that with probability $2/3$: $\mathbb{E}_{x \sim {\cal
U}}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n)
\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/\epsilon)$
examples, where $s = \min\{\mathrm{size}(c), (1/\epsilon)^{\log(1/\epsilon)}\}$.
\end{theorem}
We would like to understand to what extent this $\ell_1$-bound allows us to
optimize the coverage function $c$ under cardinality constraints using the
hypothesis $h$. Let us consider the simpler case of an additive function, and
suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) -
c(x)|] \leq \epsilon$, where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv \sum_i
\hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$?
As it turns out, yes. For all $i$, let $\alpha_i \equiv w_i - \hat w_i$ and let
$v(S) \equiv |h(\chi_S) - c(\chi_S)| = |\sum_{i \in S} \alpha_i |$. Consider the collection
of good sets ${\cal G} \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}|
\geq \frac{3}{4}\cdot2^n$. Suppose the contrary: then more than a quarter of
the sets have value $v(S) \geq 4\epsilon$, so that $\mathbb{E}_{x \sim {\cal
U}}[|h(x) - c(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} v(S) >
\frac{1}{4}\cdot4\epsilon = \epsilon$, which is a contradiction. Consider an element
$i$ such that $|\alpha_i| = \|\alpha\|_{\infty}$ and the collection
of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. Notice that
$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal
S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal
S}_i^c$, $v(S\cup\{i\}) \geq |\alpha_i| - 4\epsilon$, and the map $S \mapsto
S \cup \{i\}$ is injective on this collection. It follows that
$\mathbb{E}_{x \sim {\cal U}}[|h(x) - c(x)|] \geq \frac{1}{4}(|\alpha_i| - 4\epsilon)$
and therefore $\|w - \hat w\|_{\infty} \leq 8 \epsilon$.
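The following sketch checks this $\ell_1$-to-$\ell_\infty$ argument empirically
for a small additive function (assuming \texttt{numpy}; the weights and the
perturbation below are arbitrary):
\begin{verbatim}
# Enumerate all subsets of a small ground set, compute the exact l1-error
# under the uniform distribution and compare it with the l_inf weight error;
# the argument above says ||w - w_hat||_inf <= 8 * E_{x ~ U}[|h(x) - c(x)|].
import itertools
import numpy as np

rng = np.random.default_rng(2)
n = 10
w = rng.random(n)
w_hat = w + rng.uniform(-0.05, 0.05, size=n)     # some estimated weights

subsets = np.array(list(itertools.product([0, 1], repeat=n)), dtype=float)
l1_error = np.mean(np.abs(subsets @ (w_hat - w)))   # E_{x ~ U}[|h(x) - c(x)|]
linf_error = np.max(np.abs(w_hat - w))              # ||w - w_hat||_inf

print(linf_error <= 8 * l1_error, linf_error, 8 * l1_error)
\end{verbatim}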
\section{Tricky coverage function example}
Let $U = \{x_1, x_2\}$ be a universe of size $2$ and $w : U \to
\mathbb{R}$ a weight function on $U$. Consider a family ${\cal A}$ of $n$ sets
$A_1, A_2, \dots, A_n$ such that $A_1 \equiv \{x_1\}$, $A_2 \equiv \{x_2\}$ and
$\forall i \neq 1,2, A_i = \{x_1, x_2\}$. Let $k \in \mathbb{R}^+$ be any
positive constant and consider the family of coverage functions defined on $U$
and the family of sets ${\cal A}$:
$${\cal C} \equiv \{c : 2^{\cal A} \to \mathbb{R} : \forall S \subseteq {\cal
A},\ c(S) = w(\cup_{A_i \in S} A_i) \text{ and } w(U) = k \}$$
Note that any coverage function from this family is
constant almost everywhere, equal to $w(x_1)+w(x_2) = k$ (see the small sketch
after this list):
\begin{itemize}
    \item $\forall S \notin \big\{\emptyset, \{A_1\}, \{A_2\}\big\},\ c(S) = k$
    \item $c(\{A_1\}) = w(x_1)$, and $c(\{A_2\}) = w(x_2)$
\end{itemize}
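A small sketch of this construction (the particular weights are an arbitrary
choice with $w(x_1)+w(x_2)=k$), showing that only the two singletons $\{A_1\}$
and $\{A_2\}$ reveal the individual weights:
\begin{verbatim}
# Universe {x1, x2}, A_1 = {x1}, A_2 = {x2}, A_i = {x1, x2} for i >= 3, and
# c(S) = w(union of the chosen A_i). Almost every collection S has value k.
n, k = 10, 5.0
w = {"x1": 4.0, "x2": k - 4.0}
A = {1: {"x1"}, 2: {"x2"}, **{i: {"x1", "x2"} for i in range(3, n + 1)}}

def c(S):
    """Coverage value of a collection S of indices of sets in A."""
    covered = set().union(*(A[i] for i in S)) if S else set()
    return sum(w[x] for x in covered)

print(c({1}), c({2}), c({1, 2}), c({3}), c({4, 7}))  # only {1} and {2} differ from k
\end{verbatim}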
\subsection{Easy to learn under any distribution}
TODO
\subsection{Hard to optimize under cardinality constraint with $K=1$}
Given any distribution ${\cal D}$ on $2^{\cal A}$ such that
$\mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\}) = o(n^{-\gamma})$ for every constant $\gamma$,
i.e.\ $\exists f : n \mapsto \mathbb{R}^+ \text{ s.t. } \lim_{n\rightarrow
+\infty} f(n) = +\infty \text{ and } \mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\}) =
{\cal O}(n^{-f(n)})$, and any constant $C \in \mathbb{R}$ with $m \equiv
n^C$, for $n$ sufficiently large the set $\hat S$ returned by any one-shot
optimization algorithm after observing random sets $S_1, S_2, \dots, S_m$ from
${\cal D}$ is such that:
$$\mathbb{P}_{S_1, \dots S_m \sim {\cal D}}\left(\mathbb{P}_{\hat S}
\left(c(\hat S) > 0\right) \geq \frac{1}{n}\right) \leq
m\left(\mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\})\right) = {\cal O}\left(n^{C - f(n)}\right)
\rightarrow 0 \text{ when } n \rightarrow +\infty$$
The previous proposition can be reformulated in the following slightly weaker
but more intuitive form:
$$\mathbb{P}_{S_1, \dots S_m \sim {\cal D}}\left(\mathbb{E}(c(\hat S)) \leq
\frac{1}{n}OPT \right) \geq 1 - {\cal O}\left(n^{C - f(n)}\right) \rightarrow 1 \text{ when } n
\rightarrow +\infty$$
Note that the restriction on ${\cal D}$ is satisfied by many distributions. In
the case of the uniform distribution, $\mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\})
= \frac{1}{2^{n-1}}$, and in the case of observing sets of a certain fixed
size $k>1$ uniformly at random, $\mathbb{P}(\{A_1\}) + \mathbb{P}(\{A_2\}) = 0$.
This example is not fully satisfying for two reasons:
\begin{itemize}
\item the coverage function is not monotone.
\item the sets we observe most do not satisfy the constraint.
\end{itemize}
Note that even if we observe sets which \emph{almost} satisfy the cardinality
constraint, such as all sets of size 2 and only such sets, we would still have
no hope of outputting a set with expected value within a factor $\frac{1}{n}$ of OPT\@ under
the cardinality constraint $K=1$. This example can be modified to address the second
issue.
\subsection{Fixing second issue}
We extend the previous example by slightly modifying ${\cal A}$.
\bibliography{optimize} \bibliographystyle{apalike}
\end{document}
|