\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm}
\usepackage[utf8x]{inputenc}

\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}

\title{Learn and/or Optimize}
\author{}
\date{}

\begin{document}
\maketitle

\section{Preliminary Results}

\subsection{Generic Random Matrix Theory}
We cite the following result from \cite{vershynin2010introduction} (Remark 5.40):

\begin{proposition}
\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix $\Sigma$. Then for every $t \geq 0$, the following inequality holds with probability at least $1 - 2 \exp(-ct^2)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2) \text{ where } \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
\end{equation}
where $C$, $c$ depend only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}

The following result is a simple corollary of Proposition~\ref{prop:non_isotropic_isometry}:

\begin{corollary}
\label{cor:number_of_rows_needed}
Let $n \in \mathbb{N}$ and $k \in ]0,n[$. Assume that $A$ is an $N \times n$ 
matrix whose rows $A_i$ are independent random vectors in 
$\mathbb{R}^n$ with i.i.d.\ Bernoulli entries such that $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - 
\mathbb{P}(A_{i,j} = 0)$, and let $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) 
\neq 0$. If $N > (C+1)^2 n/\sigma^2$, then the matrix $A$ has full column rank 
with probability at least $1 - e^{-cn}$, where $C, c$ are constants depending only on the subgaussian norm of the rows $K \equiv \sup_{p \geq 1} \frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)^\frac{1}{p} = k$\footnote{Is this true? And how do the constants behave?}.
\end{corollary}
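As a quick instantiation of the bound: if $k = n/2$ then $\sigma = \frac{1}{4}$ and the corollary asks for
\begin{displaymath}
N > (C+1)^2 \frac{n}{\sigma^2} = 16\,(C+1)^2\, n
\end{displaymath}
rows, i.e.\ a number of rows linear in $n$; the dependence on $\sigma$ only becomes costly for very sparse or very dense rows ($k$ small or $k$ close to $n$).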

\begin{proof}
It is easy to compare the kernels of $A$ and $A^TA$ and notice that $\operatorname{rank}(A) = \operatorname{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it follows that if $A^TA$ is invertible, then $A$ has full column rank. In other words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ has full column rank.
Consider Prop.~\ref{prop:non_isotropic_isometry} with $t \equiv \sqrt{n}$ and with $N > (C+1)^2 n$; then with probability at least $1 - 2\exp(-cn)$:
\begin{equation}
\left\|\frac{1}{N} A^T A - \sigma I \right\| \leq (C+1)\sqrt{\frac{n}{N}}
\end{equation}

It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2 -\sigma \right| \leq (C+1)\sqrt{\frac{n}{N}}$, hence $\|A x\|^2_2 \geq N\left(\sigma - (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > (C+1)^2n/\sigma^2$, this lower bound is strictly positive, so $\sigma_{\min}(A) > 0$ and $A$ has full column rank, with probability at least $1 - e^{-cn}$.
\end{proof}

Note that this bound is clearly not optimal. For example, for $k=1$, the coupon collector problem shows that ${\cal O}(n\log n)$ rows suffice for the matrix $A$ to have full column rank with high probability, and not the roughly ${\cal O}(n^3)$ rows that Corollary~\ref{cor:number_of_rows_needed} requires for $k=1$.

\paragraph{Note:} $\Theta(n\log n)$ is the \emph{expected} number of rows for the coupon collector
problem. If one wants an exponentially small probability of failure, then
I think results on the coupon collector problem also imply that a quadratic
number of rows is required.
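A quick back-of-the-envelope check of this (assuming rows of weight exactly one, as in the coupon collector setting): a fixed column $j$ is missed by all $N$ rows with probability $(1-\frac{1}{n})^N$, and a missed column already rules out full column rank, so a failure probability of at most $e^{-cn}$ forces
\begin{displaymath}
\left(1 - \frac{1}{n}\right)^N \leq e^{-cn},
\quad\text{i.e.}\quad
N \geq \frac{cn}{\log\frac{n}{n-1}} = \Theta(n^2).
\end{displaymath}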

\subsection{More Direct Approach}

Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
uniformly at random among all binary rows of length $n$. A standard counting
argument shows that the number of non-singular $n\times n$ matrices over $\mathbb{F}_2$
is:
\begin{displaymath}
    N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
    = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}

Hence the probability that our random matrix $A$ is invertible is:
\begin{displaymath}
    P_n  = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
\end{displaymath}
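For instance, the first few values are already close to the limiting value discussed below:
\begin{displaymath}
P_1 = \frac{1}{2}, \qquad
P_2 = \frac{1}{2}\cdot\frac{3}{4} = \frac{3}{8}, \qquad
P_3 = \frac{1}{2}\cdot\frac{3}{4}\cdot\frac{7}{8} = \frac{21}{64} \approx 0.33 .
\end{displaymath}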
It is easy to see that $P_n$ is a decreasing sequence. Its limit is
$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have
$\phi(\frac{1}{2})\approx
0.2887880950866024212788997219292307800889119048406857$ and $P_n$ converges
exponentially fast to this constant (one way to see this is to use the
Pentagonal number theorem).

Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the
probability that they have full column rank is at least:
\begin{displaymath}
    1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
\end{displaymath}
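One way to obtain this bound is a simple block argument: split the $\ell\cdot n$ rows into $\ell$ disjoint blocks of $n$ rows. Each block is an independent uniformly random $n\times n$ binary matrix, hence invertible over $\mathbb{F}_2$ with probability $P_n \geq \phi(\frac{1}{2})$ (since $P_n$ decreases to $\phi(\frac{1}{2})$), and a single invertible block already gives full column rank. The failure probability is therefore at most
\begin{displaymath}
\left(1 - P_n\right)^{\ell} \leq \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}.
\end{displaymath}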

Note that this is the probability of having full column rank over
$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
column rank over $\mathbb{R}$.
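A sketch of that argument: since $A$ has entries in $\{0,1\}$, any linear dependence among its columns over $\mathbb{R}$ implies one over $\mathbb{Q}$, which can be rescaled to integer coefficients $c_j$ with $\gcd_j c_j = 1$ (in particular, not all even):
\begin{displaymath}
\sum_j c_j A_{\cdot j} = 0 .
\end{displaymath}
Reducing this relation modulo $2$ then gives a non-trivial dependence over $\mathbb{F}_2$.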

\paragraph{TODO:} Study the case where we only observe sets of size exactly
$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above
by:
\begin{displaymath}
{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j}
\end{displaymath}

Thinking about it, why do we assume that the sample sets are of size exactly
$k$? I think it would make more sense from the learning perspective to consider
uniformly random sets. In this case, the above approach allows us to conclude
directly.

More generally, I think the “right” way to think about this is to look at $A$
as the adjacency matrix of a random $k$-regular graph. There are many
results on the probability that such an adjacency matrix is non-singular.

\section{Passive Optimization of Additive Functions}

Let $\Omega$ be a universe of elements of size $n$. For $k \in [n]$, let $D_k$ be the uniform probability distribution over all sets of size $k$, and let $D_p$ be the product probability distribution over sets under which each element is included independently, i.e.\ $\forall j \in \Omega,\ \mathbb{P}_{S \sim D_p}(j \in S) = p$. Note that $D_{k}$ and $D_{p}$ for $p \equiv \frac{k}{n}$ are very similar, and will be used almost interchangeably throughout these notes.

\textbf{TODO:} Make the argument about independently choosing elements with
probability $k/n$ vs.\ choosing a set of size $k$ rigorous. The “more direct” approach
above should allow us to consider the case of sets of size $k$ directly.

Consider the following problem:

\begin{problem}
Let $f$ be an unknown additive function given by weights $(w_i)_{i \in \Omega}$, i.e.\ $f(S) \equiv \sum_{u \in S} w_u$. How many (set, value) pairs $(S, f(S))$ with $S \sim D_k$ must be observed before we can optimize $f$ exactly, or to within a constant approximation factor, under a cardinality constraint $k$?
\end{problem}


\subsection{Inverting the system}
\label{subsec:invert_the_system}

Note that for observations $(S_j, f(S_j))_{j \in [N]}$ we can write the system $A w = b$, where row $A_j$ of $A$ is the indicator vector of the set $S_j$ and $b_j \equiv f(S_j)$. From Corollary~\ref{cor:number_of_rows_needed}, $(C_K+1)^2 \frac{n}{\sigma^2}$ rows, with $\sigma \equiv \frac{k}{n}(1 - \frac{k}{n})$, are sufficient for $A$ to have full column rank with probability $1-e^{-c_K n}$. If $A$ has full column rank, then the system can be solved in polynomial time, which lets us both learn and optimize $f$ for any cardinality constraint.
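As a toy illustration (with hypothetical observations), take $n = 3$ and suppose we observe $S_1 = \{1,2\}$, $S_2 = \{2,3\}$, $S_3 = \{1,3\}$. The system reads
\begin{displaymath}
\underbrace{\begin{pmatrix} 1 & 1 & 0 \\ 0 & 1 & 1 \\ 1 & 0 & 1 \end{pmatrix}}_{A}
\begin{pmatrix} w_1 \\ w_2 \\ w_3 \end{pmatrix}
=
\begin{pmatrix} f(S_1) \\ f(S_2) \\ f(S_3) \end{pmatrix},
\end{displaymath}
and since $\det A = 2 \neq 0$, the weights are recovered exactly.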

\begin{proposition}
Assume that $N$ (set, value) pairs $(S_j, f(S_j))_{j \in [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > (C_K +1)^2 \frac{n}{\sigma^2}$, with $\sigma \equiv \frac{k}{n}(1 - \frac{k}{n})$, then we can learn $f$ exactly, and therefore optimize it under any cardinality constraint, with probability at least $1-e^{-c_K n}$.
\end{proposition}

\subsection{Average weight method}

Another method is to compute, for every element $i$, the \emph{average value of the observed sets containing $i$}. Let $w_i$ be the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$, and $p \equiv \frac{k}{n}$; then
$$\forall i \in \Omega, \quad \mathbb{E}_{S \sim D_p}\left[f(S) \mid i \in S\right] = pw + (1 -p)w_i$$
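Indeed, conditioned on $i \in S$, each other element $j \neq i$ belongs to $S$ independently with probability $p$, so
\begin{displaymath}
\mathbb{E}_{S \sim D_p}\left[f(S) \mid i \in S\right] = w_i + p\sum_{j \neq i} w_j = w_i + p(w - w_i) = pw + (1-p)w_i .
\end{displaymath}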

Note that since $1 - p > 0$, this conditional average preserves the ranking of the elements' weights. For observations $(S_j, f(S_j))_{j \in [N]}$ and for $N_i \equiv |\{j \in [N] : i \in S_j\}|$, we define the following estimator:

\begin{equation}
\forall i \in \Omega, \quad w_i^{N_i} \equiv \frac{1}{N_i}\sum_{j : i \in S_j} \frac{f(S_j) - pw}{1-p}
\end{equation}
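By the conditional expectation computed above, each summand is an unbiased estimate of $w_i$:
\begin{displaymath}
\mathbb{E}\left[\frac{f(S_j) - pw}{1-p} \,\Big|\, i \in S_j\right] = \frac{\bigl(pw + (1-p)w_i\bigr) - pw}{1-p} = w_i .
\end{displaymath}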

As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We can obtain a concentration bound for $w_i^{N_i}$ around $w_i$ using Hoeffding's inequality:

\begin{equation}
\mathbb{P}\left(\left|w_i^{N_i} - w_i \right|  \geq \epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}}
\end{equation}

\emph{TODO: multiplicative bounds are very bad for zero weights\ldots\ Need to look at additive bounds for these zeros.}

For $N_i$ sufficiently large for each element $i$, we have $\forall i \in \Omega,\ (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this assumption, if we choose the set $\hat S$ of the $k$ elements with largest estimated weights $w_i^{N_i}$, and $S^*$ denotes a maximum-weight set of $k$ elements, then $f(\hat S) \geq \frac{1}{1+\epsilon}\sum_{i \in \hat S} w_i^{N_i} \geq \frac{1}{1+\epsilon}\sum_{i \in S^*} w_i^{N_i} \geq \frac{1-\epsilon}{1+\epsilon} f(S^*)$, i.e.\ a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT. To ensure that $N_i$ is sufficiently large for each element, we note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a classic union bound:

\begin{equation}
\mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq \frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq \frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}}
\end{equation}

As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i \in \Omega,\ N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$.
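Indeed, plugging this choice of $N$ into the union bound above gives
\begin{displaymath}
n e^{-\frac{pN}{8}} \leq n e^{-(C+1)\log n} = n \cdot n^{-(C+1)} = \frac{1}{n^C} .
\end{displaymath}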

\begin{proposition}
Assume that $N$ (set, value) pairs $(S_j, f(S_j))_{j \in [N]}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > XXX$, then we can $\epsilon$-learn $f$ and optimize it to a factor $\frac{1-\epsilon}{1+\epsilon}$ under any cardinality constraint with probability $XXX$.
\end{proposition}

\section{Passive Optimization of Submodular functions}

\subsection{Inverting the system}

The following result is due to \cite{balcan2011learning}:

\begin{proposition}
\label{prop:balcan_learned}
Let ${\cal F}$ be the class of non-negative, monotone, submodular functions over $\Omega$. There is an algorithm that PMAC-learns ${\cal F}$ with approximation factor $\sqrt{n+1}$, by outputting a function $\tilde f: S\mapsto \sqrt{\frac{1}{(n+1)z}w^T \chi(S)}$, where $\chi(S) \in \{0,1\}^n$ denotes the indicator vector of $S$.
\end{proposition}

By adapting Section~\ref{subsec:invert_the_system}, we get the following result:

\begin{proposition}
Any non-negative monotone submodular function can be passively optimized under cardinality constraint $k \in \mathbb{N}$ from $D_p$ for \emph{any} $p \in ]0, 1[$ with polynomially many samples up to a factor $1/\sqrt{n+1}$. 
\end{proposition}

\begin{proof}
We optimize $\tilde f$ from Proposition~\ref{prop:balcan_learned}, which can easily be done by solving the system $Ax = b^2$, where $(b^2)_i \equiv (b_i)^2$. TODO: complete proof.
\end{proof}



\subsection{Average weight method}

The same can probably be said for the average weight method.

\bibliography{optimize}
\bibliographystyle{apalike}


\end{document}