\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm, bbm}
\usepackage[utf8x]{inputenc}

\DeclareMathOperator{\E}{\mathbb{E}}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\newcommand{\ex}[1]{\E\left[#1\right]}
\newcommand{\prob}[1]{\P\left[#1\right]}


\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}
\newtheorem{theorem}{Theorem}
\newtheorem{claim}{Claim}

\title{Learn and/or Optimize}
\author{}
\date{}

\begin{document}
\maketitle

\section{Preliminary Results}

\subsection{Generic Random Matrix Theory}
We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):

\begin{proposition}\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
$\Sigma$. Then for every $t \geq 0$, the following inequality holds with
probability at least $1 - 2 \exp(-ct^2)$:
\begin{equation}
  \left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2)
  \quad \text{where} \quad \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}},
\end{equation}
and where $C$, $c$ depend only on the sub-gaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}

The following result is a simple corollary of
Proposition~\ref{prop:non_isotropic_isometry}:

\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
  $k \in ]0,n[$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
  independent random vectors in $\mathbb{R}^n$ with i.i.d.\ Bernoulli entries such that
  $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let
  $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. If $N > {(C+1)}^2
  n/\sigma^2$, then the matrix $A$ has full column rank with probability at least $1 -
  e^{-cn}$, where $C, c$ are constants depending only on the sub-gaussian norm
  of the rows $K \equiv \sup_{p \geq 1}
  \frac{1}{\sqrt{p}}{\left(\mathbb{E}|A_1|^p\right)}^{\frac{1}{p}} = k$.\footnote{Is this true?
  And how do the constants behave?} \end{corollary}

\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice
  that $\mathrm{rank}(A) = \mathrm{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it
  follows that if $A^TA$ is invertible, then $A$ has full column rank. In other
  words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$
  has full column rank. Consider Prop.~\ref{prop:non_isotropic_isometry} with $t
  \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$; then with probability $1 -
  2\exp(-cn)$: \begin{equation} \left\|\frac{1}{N} A^T A - \sigma I \right\| \leq
    (C+1)\sqrt{\frac{n}{N}} \end{equation}

It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2
-\sigma \right| \leq (C+1)\sqrt{\frac{n}{N}}$, which implies $\|A x\|^2_2 \geq N\left(\sigma
- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$. \end{proof}
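
As a quick sanity check (not part of the argument), the full-column-rank claim
is easy to test numerically. The following is a minimal sketch, assuming Python
with NumPy; the values of $n$, $k$ and $N$ are illustrative only.
\begin{verbatim}
# Empirical check of the corollary above: 0/1 matrices with i.i.d.
# Bernoulli(k/n) entries tend to have full column rank once N is large enough.
import numpy as np

rng = np.random.default_rng(0)
n, k = 50, 5
p = k / n
sigma = p * (1 - p)

for N in [n, 2 * n, int(np.ceil(n / sigma ** 2))]:
    trials = 100
    full = sum(np.linalg.matrix_rank(rng.binomial(1, p, size=(N, n))) == n
               for _ in range(trials))
    print(f"N = {N:6d}: {full}/{trials} trials with full column rank")
\end{verbatim}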

\subsection{More Direct Approach}

Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
uniformly at random among all binary rows of length $n$. A standard counting
argument shows that the number of non-singular $n \times n$ matrices over $\mathbb{F}_2$
is:
\begin{displaymath}
    N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
    = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}

Hence the probability that our random matrix $A$ is invertible is:
\begin{displaymath}
    P_n  = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
It is easy to see that $P_n$ is a decreasing sequence. Its limit is
$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have
$\phi(\frac{1}{2})\approx
0.289$ and $P_n$ converges
exponentially fast to this constant (one way to see this is to use the
Pentagonal number theorem).
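
For a quick numerical illustration of $P_n$ and its limit (a minimal Python
sketch; the constant $0.2887880951$ is the value of $\phi(\frac{1}{2})$):
\begin{verbatim}
# P_n = prod_{i=1}^n (1 - 2^{-i}) decreases and converges very fast
# to phi(1/2) ~ 0.2887880951.
prod = 1.0
for i in range(1, 31):
    prod *= 1.0 - 2.0 ** (-i)
    if i in (1, 2, 5, 10, 20, 30):
        print(f"P_{i} = {prod:.10f}")
\end{verbatim}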

Hence, if we observe $\ell\cdot n$ uniformly random binary rows, we can
partition them into $\ell$ blocks of $n$ rows each; each block is invertible
independently with probability at least $\phi(\frac{1}{2})$, so the
probability that the rows have full column rank is at least:
\begin{displaymath}
    1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
\end{displaymath}

Note that this is the probability of having full column rank over
$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
column rank over $\mathbb{R}$: a rational linear dependency among the columns
could be rescaled to an integer one with coprime entries, and reducing it
modulo 2 would give a non-trivial dependency over $\mathbb{F}_2$.
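
The rank computation over $\mathbb{F}_2$ is itself elementary. Below is a
minimal sketch (Gaussian elimination modulo 2, assuming Python with NumPy;
the parameters are illustrative) that can be used to check the
$1 - {(1-\phi(\frac{1}{2}))}^{\ell}$ bound empirically.
\begin{verbatim}
# Rank of a 0/1 matrix over F_2 via Gaussian elimination modulo 2.
import numpy as np

def rank_gf2(M):
    M = M.copy() % 2
    rank, rows, cols = 0, M.shape[0], M.shape[1]
    for col in range(cols):
        pivot = next((r for r in range(rank, rows) if M[r, col]), None)
        if pivot is None:
            continue
        M[[rank, pivot]] = M[[pivot, rank]]   # move the pivot row up
        for r in range(rows):
            if r != rank and M[r, col]:
                M[r] ^= M[rank]               # clear the rest of the column
        rank += 1
    return rank

rng = np.random.default_rng(0)
n, ell, trials = 30, 3, 200
hits = sum(rank_gf2(rng.integers(0, 2, size=(ell * n, n))) == n
           for _ in range(trials))
print(f"full column rank over F_2: {hits}/{trials}",
      f"(lower bound {1 - (1 - 0.2888) ** ell:.3f})")
\end{verbatim}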

\paragraph{TODO:} Study the case where we only observe sets of size exactly
$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above
by:
\begin{displaymath}
{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j}
\end{displaymath}

Thinking about it, why do we assume that the sample sets are of size exactly
$k$? I think it would make more sense from the learning perspective to consider
uniformly random sets. In this case, the above approach allows us to conclude
directly.

More generally, I think the “right” way to think about this is to look at $A$
as the adjacency matrix of a random $k$-regular graph. There are tons of
results about the probability that the adjacency matrix is non-singular.

\section{Passive Optimization}

In general, we say we can efficiently optimize a function $f$ under constraints
$K$ when we have a polynomial-time algorithm making adaptive value queries to
$f$, which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
with high probability, where $\alpha$ is an absolute constant and $S^* \in \arg
\max_{S \in K} f(S)$.

Here, we consider the scenario where we cannot make adaptive value queries, and
in fact, where we cannot make queries at all! Instead, we suppose that we
observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken
from a known distribution $D$. Let $L$ be the problem size. We say we can
efficiently \emph{passively optimize} $f$ under distribution $D$ and
constraints $K$ when, after observing ${\cal O}(L^c)$ set-value pairs from $D$
where $c > 0$ is an absolute constant, we can return a set $S$ such that:  $S
\in K$ and $f(S) \geq \alpha f(S^*)$ with high probability, where $\alpha$ is
an absolute constant and $S^* \in \arg \max_{S \in K} f(S)$.

\subsection{Additive function}

\subsubsection{Inverting the system}\label{subsec:invert_the_system}

Note that for observations ${(S_j, f(S_j))}_{j \in [N]}$ we can write the system $A
w = b$, where row $A_j$ of $A$ is the indicator vector of the set
$S_j$ and $b_j \equiv f(S_j)$. From Corollary~\ref{cor:number_of_rows_needed},
it is easy to see that ${(C_k+1)}^2 \frac{n}{\sigma^2} = {(C_k+1)}^2
\frac{n^5}{k^2{(n - k)}^2}$ rows are sufficient for
$A$ to have full column rank with probability $1-e^{-c_k n}$. If $A$ has full column
rank, then it is easy to solve the system in polynomial time, and both learn
and optimize $f$ for any cardinality constraint.

\begin{proposition} Assume that $N$ set-value pairs ${(S_j, f(S_j))}_{j \in
  [N]}$ are observed, where $S_j \sim D_p$ (each element included independently
  with probability $p$) and $p \equiv \frac{k}{n}$. If $N >
  {(C_k +1)}^2 \frac{n}{\sigma^2}$, then we can learn $f$ exactly, and therefore
  optimize it under any cardinality constraint, with probability $1-e^{-cn}$.
\end{proposition}
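
For concreteness, here is a minimal end-to-end sketch of this approach
(assuming Python with NumPy; the additive function, the sampling distribution
$D_p$ and all parameter values are illustrative): build the indicator matrix
$A$, solve the system in the least-squares sense, and take the $k$ largest
recovered weights.
\begin{verbatim}
# Learn an additive f from passive observations (S_j, f(S_j)) by solving
# A w = b, then optimize under a cardinality constraint via the top-k weights.
import numpy as np

rng = np.random.default_rng(0)
n, k = 40, 5
p = k / n
w_true = rng.uniform(0, 1, size=n)        # unknown additive weights

N = 4 * n                                 # enough rows in practice
A = rng.binomial(1, p, size=(N, n))       # row j = indicator vector of S_j
b = A @ w_true                            # observed values f(S_j)

# exact recovery whenever A has full column rank
w_hat, *_ = np.linalg.lstsq(A, b, rcond=None)
S_hat = np.argsort(w_hat)[-k:]            # best set of size k
opt = np.sort(w_true)[-k:].sum()
print("recovery error     :", np.abs(w_hat - w_true).max())
print("approximation ratio:", w_true[S_hat].sum() / opt)
\end{verbatim}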

\subsubsection{Average weight method}

Another method is to compute, for every element $i$, the \emph{average value of
the sets containing element $i$}. Let $w_i$ be the weight of element $i$, let $w
\equiv f(\Omega) = \sum_{i \in \Omega} w_i$, and let $p \equiv \frac{k}{n}$. Then
$$\forall i \in \Omega, \quad \mathbb{E}_{S \sim D_p}\left[f(S) \mid i \in S\right] = pw
+ (1 -p)w_i$$
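
For completeness, this identity follows from linearity of expectation, since
under $D_p$ (elements included independently with probability $p$) every
$j \neq i$ belongs to $S$ independently of the event $i \in S$:
\begin{displaymath}
\mathbb{E}_{S \sim D_p}\left[f(S) \mid i \in S\right]
= w_i + \sum_{j \neq i} w_j\, \mathbb{P}(j \in S \mid i \in S)
= w_i + p \sum_{j \neq i} w_j
= w_i + p(w - w_i)
= pw + (1-p)w_i.
\end{displaymath}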

Note that the average value of the sets containing element $i$ is an increasing
function of $w_i$, so it preserves the ranking of the weights of the elements.
For observations ${(S_j, f(S_j))}_{j \in [N]}$ and for $N_i \equiv |\{j : i \in
S_j\}|$, we define the following estimator:

\begin{equation} \forall i \in \Omega, \quad w_i^{N_i} \equiv \frac{1}{N_i}\sum_{j \,:\,
i \in S_j} \frac{f(S_j) - pw}{1-p} \end{equation}
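
A minimal simulation of this estimator (assuming Python with NumPy; the
weights and sample sizes are illustrative, and $w = f(\Omega)$ is assumed
known, as in the definition above):
\begin{verbatim}
# Average-weight estimator: w_i^{N_i} = mean over observed sets containing i
# of (f(S) - p*w) / (1 - p).
import numpy as np

rng = np.random.default_rng(0)
n, k = 40, 5
p = k / n
w_true = rng.uniform(0, 1, size=n)
w_total = w_true.sum()                    # w = f(Omega), assumed known

N = 20000
A = rng.binomial(1, p, size=(N, n))       # membership indicators
f_vals = A @ w_true                       # observed f(S_j)

w_hat = np.array([
    ((f_vals[A[:, i] == 1] - p * w_total) / (1 - p)).mean()
    for i in range(n)
])
print("max estimation error:", np.abs(w_hat - w_true).max())
\end{verbatim}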

As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We
can obtain a concentration bound of $w_i^{N_i}$ around $w_i$ using Hoeffding's
inequality:

\begin{equation} \mathbb{P}\left(\left|w_i^{N_i} - w_i \right|  \geq
\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
\end{equation}

\emph{TODO: multiplicative bounds are very bad for zero weights\dots Need to look
at additive bounds for these zeros.}

For $N_i$ sufficiently large for each element $i$, we have $\forall i \in
\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this
assumption, if we choose the $k$ elements with largest estimated weight
$w_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT,
where OPT is the value of the maximum weight set of $k$ elements for the
function $f$. To ensure that $N_i$ is sufficiently large for each element, we
note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a
classic union bound:

\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq
\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq
\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation}

As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i
\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$.
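
For concreteness, the Chernoff step above is the standard multiplicative
lower-tail bound $\mathbb{P}(X \leq (1-\delta)\mu) \leq e^{-\delta^2\mu/2}$
applied with $X = N_i$, $\mu = pN$ and $\delta = \frac{1}{2}$:
\begin{displaymath}
\mathbb{P}\left(N_i \leq \frac{pN}{2}\right) \leq e^{-\frac{pN}{8}},
\qquad\text{and}\qquad
n\, e^{-\frac{pN}{8}} \leq n\, e^{-(C+1)\log n} = \frac{1}{n^{C}}
\quad\text{when } N \geq (C+1)\frac{8}{p}\log n.
\end{displaymath}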

\begin{proposition} Assume that $N$ set-value pairs ${(S_j, f(S_j))}_{j \in
  [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N >
  XXX$, then we can $\epsilon$-learn $f$ and optimize it to a
  $(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with
  probability $XXX$.  \end{proposition}

\subsection{General submodular functions}

\subsubsection{Inverting the system}

A result by~\cite{balcan2011learning} states that:

\begin{proposition}\label{prop:balcan_learned} Let ${\cal F}$ be the class of
  non-negative, monotone, submodular functions over $\Omega$. There is an
  algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$,
  by outputting a function $\tilde f: S\mapsto \sqrt{\frac{1}{(n+1)z}w^T
  \chi(S)}$.  \end{proposition}

{\bf TODO}: Does this imply a good guarantee if we solve the system
$Ax=f^{-1}(b)$? Probably not: while we might not care about failing to learn
the zeros of the function perfectly, not being able to correctly estimate a
fraction of the non-zero sets is a big problem.

\subsubsection{Average weight method}


\subsection{Parametric submodular functions}

\subsubsection{Influence Functions}

Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we
can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$.
Let $m$ be the number of edges of $G$ with non-zero weight. Let $\sigma_G(S, p)$ be the
influence of the set $S \subseteq V$ in $G$ under the IC model with parameters
$p$. 

Recall from \cite{Kempe:03} that:
\begin{equation}
    \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big]
\end{equation}
where $G_p$ is a random graph in which each edge $(u,v)\in E$ appears independently with
probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of
the event \emph{there exists a path from $S$ to $v$ in $G_p$}.
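
Although not needed for what follows, this representation immediately gives
the standard Monte Carlo estimator of $\sigma_G(S, p)$: sample live-edge
graphs $G_p$ and average the number of nodes reachable from $S$. A minimal
sketch, assuming Python (the toy graph and parameters are illustrative):
\begin{verbatim}
# Monte Carlo estimate of sigma_G(S, p) under the IC model:
# sample live-edge graphs and count the nodes reachable from S.
import random
from collections import defaultdict, deque

def influence(probs, S, samples=1000, rng=random.Random(0)):
    total = 0
    for _ in range(samples):
        live = defaultdict(list)
        for (u, v), p_uv in probs.items():   # keep edge (u, v) w.p. p_uv
            if rng.random() <= p_uv:
                live[u].append(v)
        seen, queue = set(S), deque(S)
        while queue:                          # BFS from the seed set S
            u = queue.popleft()
            for v in live[u]:
                if v not in seen:
                    seen.add(v)
                    queue.append(v)
        total += len(seen)
    return total / samples

# toy example: path 0 -> 1 -> 2 -> 3 with edge probability 1/2;
# expected influence of {0} is 1 + 1/2 + 1/4 + 1/8 = 1.875
probs = {(0, 1): 0.5, (1, 2): 0.5, (2, 3): 0.5}
print(influence(probs, S=[0]))
\end{verbatim}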

\begin{claim}
\label{cla:oracle}
If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v}
- \frac{1}{\alpha m}$, then:
\begin{displaymath}
    \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1
    - 1/\alpha) \sigma_{G}(S,p)
\end{displaymath}
\end{claim}

\begin{proof}
    We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each
    edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in
    [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff
    $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$).
    It is clear from the construction that each edge $(u,v)$ will be present in
    $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence:
    \begin{displaymath}
        \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big]
        \text{  and  }
        \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big]
    \end{displaymath}

    By construction, $G_{p'}$ is always a subgraph of $G_p$, i.e.\
    $r_{G_{p'}}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand
    side of the Claim's inequality.

    The probability that an edge is present in $G_p$ but not in $G_{p'}$ is at
    most $\frac{1}{\alpha m}$, so with probability at least $\left(1-\frac{1}{\alpha
    m}\right)^m$, $G_p$ and $G_{p'}$ have the same set of edges. This implies that:
    \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq
    \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big]
  \end{displaymath}
    which proves the right-hand side of the claim after observing that
    $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality
    iff $m=1$.
\end{proof}

We can use Claim~\ref{cla:oracle} to obtain a constant-factor approximation
algorithm for maximising influence on $G$ by maximising influence on a known
approximation $G'$ of $G$ (see Proposition~\ref{prop:approx_optim} below). For
$k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and
$\sigma_{G}^* = \sigma_G(O_k)$, where we omit the $k$ when it is unambiguous.


\begin{proposition}
\label{prop:approx_optim}
Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$
and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq  \frac{1}{\alpha m}$.
Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that
$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$.
\end{proposition}

\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p_{u,v} = p'_{u,v} -
  \frac{1}{\alpha m}$. Since $p_{u,v} \geq \hat p_{u,v} \geq p_{u,v} -
  \frac{2}{\alpha m}$, we are now in the conditions of Claim~\ref{cla:oracle}
  with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by
  greedy maximisation on $\hat G$, the graph $G$ with parameters $\hat p$. It
  is a classic exercise to show then that
  $\sigma_G(\hat O_k) \geq \left(1 - e^{\frac{2}{\alpha} - 1}\right)\sigma^*_G$ (see Pset 1, CS284r).
\end{proof}
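
A minimal sketch of the procedure in this proof (assuming Python and the
\texttt{influence} estimator sketched earlier; the graph, $\alpha$ and the
number of samples are illustrative): shift the known probabilities down by
$\frac{1}{\alpha m}$ and run the standard greedy algorithm with a Monte Carlo
value oracle.
\begin{verbatim}
# Greedy influence maximisation on the shifted graph G_hat from the proof.
def greedy_influence(nodes, probs_known, k, alpha, samples=500):
    m = len(probs_known)
    # p_hat = p' - 1/(alpha*m), clipped at 0 for the sketch
    probs_hat = {e: max(0.0, q - 1.0 / (alpha * m))
                 for e, q in probs_known.items()}
    chosen = []
    for _ in range(k):
        best, best_val = None, -1.0
        for v in nodes:              # pick the element giving the largest value
            if v in chosen:
                continue
            val = influence(probs_hat, chosen + [v], samples)
            if val > best_val:
                best, best_val = v, val
        chosen.append(best)
    return chosen

print(greedy_influence(list(range(4)),
                       {(0, 1): 0.5, (1, 2): 0.5, (2, 3): 0.5},
                       k=1, alpha=10))
\end{verbatim}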

A small note on the approximation factor: it is only positive for $\alpha > 2$.
Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq
\frac{1}{2}$, and that as $\alpha \rightarrow \infty$ it converges to $1 - 1/e$,
which is the best possible
polynomial-time approximation ratio for influence maximisation unless $P = NP$.
Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m
\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 -
\exp(-\exp(-2/\alpha))$.

\subsubsection{Active set selection of stationary Gaussian Processes}

\section{Passive Optimization vs. Passive Learning}

\subsection{Failed Attempt: returning max of observations}

This doesn't work. Give examples as to why! Remember that there are strong
concentration results for submodular functions; look at the expected value of
the observed sets.

\subsection{Example where optimization possible, learning impossible}

Recall the matroid construction from~\cite{balcan2011learning}:
\begin{theorem}
  For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
  \subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{\cal B} :
  {\cal B} \subseteq {\cal A} \}$ such that:
  \begin{itemize}
    \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$
    \item For every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we
      have:
      \begin{align*}
        \text{rank}_{M_{\cal B}}(A) & = 8 \log k & & \text{if } A\in{\cal B} \\
                                    & = |A| & & \text{if } A\in {\cal A}\backslash{\cal B}
      \end{align*}
  \end{itemize}
\end{theorem}

Consider the following subset of the above family of matroids: ${\cal
M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal
A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$.
Consider an \emph{unknown} function $f$, corresponding to the rank function of
one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$.  Note that as long
as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
optimize $f$ exactly! Indeed, the largest possible value of $f$ under the
cardinality constraint $n^{1/3}$ is $n^{1/3}$, and any $A \in {\cal
A}\backslash{\cal B}$ achieves it, since $f(A) = |A| = n^{1/3}$.

One example of a distribution under which this occurs with probability at least
a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$.  For
$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$,
we will observe at least one set from ${\cal A}\backslash{\cal B}$ with
probability at least:

\begin{equation} \nonumber \mathbb{P}\left(\exists i : S_i \in {\cal
A}\backslash{\cal B}\right) \geq 1 - (1 - \epsilon)^{n^c} \geq 1 - e^{-\epsilon n^c},
\end{equation}
which tends to $1$ as $n$ grows.

However, it follows from the analysis of~\cite{balcan2011learning} that we
cannot learn $f$ under any distribution, even with active value queries!
Indeed, for any polynomially-sized set of observations, there exists a
super-polynomial number of functions in ${\cal M}^{\epsilon}$ which coincide on this
set of observations, but which take very different values outside of this set
of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal
A}\backslash {\cal B}$.

{\bf TODO:} A cleaner simpler example would be nice.



\bibliography{optimize} \bibliographystyle{apalike}


\end{document}