aboutsummaryrefslogtreecommitdiffstats
path: root/results.tex
blob: 648f8f65baf0117cbdf2d1671639ff5e73590d13 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
\documentclass[10pt]{article}
\usepackage{fullpage, amsmath, amssymb, amsthm}

\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{problem}{Problem}

\title{Learn and/or Optimize}
\author{}
\date{}

\begin{document}
\maketitle

\section{Preliminary Results}

We cite the following result from \cite{vershynin2010introduction} (Remark 5.40):

\begin{proposition}
\label{prop:non_isotropic_isometry}
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix $\Sigma$. Then for every $t \geq 0$, the following inequality holds with probability at least: $1- 2 \exp(-ct^2)$:
\begin{equation}
\|\frac{1}{N} A^T A - \Sigma \| \leq \max(\delta, \delta^2) \text{ where } \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
\end{equation}
where $C$, $c$ depend only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$.
\end{proposition}

The following result is a simple corollary of Proposition~\ref{prop:non_isotropic_isometry}:

\begin{corollary}
\label{cor:number_of_rows_needed}
Let $n \in \mathbb{N}$ and $k \in ]0,n[$. Assume that $A$ is an $N \times n$ 
matrix whose rows $A_i$ are independent Bernoulli variable vectors in 
$\mathbb{R}^n$ such that $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - 
\mathbb{P}(A_{i,j} = 0)$ and let $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) 
\neq 0$, then if $N > (C+1)^2 n/\sigma^2$, the matrix A has full row rank 
with probability at least $1 - e^{-cn}$, where $C, c$ are constant depending only on  the subgaussian norm of the rows $K \equiv \sup_{p \geq 1} \frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)^\frac{1}{p} = k$\footnote{Is this true? And how do the constants behave?}
\end{corollary}

\begin{proof}
It is easy to compare the kernels of $A$ and $A^TA$ and notice that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it follows that if $A^TA$ is invertible, then $A$ has full row rank. In other words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ has full row rank. 
Consider Prop.~\ref{prop:non_isotropic_isometry} with $t \equiv \sqrt{n}$ and with $N > (C+1)^2 n$, then with probability $1 - 2\exp(-cn)$:
\begin{equation}
\|\frac{1}{N} A^T A - \sigma I \| \leq (C+1)\sqrt{\frac{n}{N}}
\end{equation}

It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2 -\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma - (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > (C+1)^2n/\sigma^2$, then $\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$
\end{proof}

Note that this is clearly not optimal. For example, note that for $k=1$, the solution to the coupon collector problem states that only ${\cal O}(n\log n)$ rows are sufficient before the matrix $A$ has full row rank with high probability, and not ${\cal O}(n^2)$ as stated by Corollary~\ref{cor:number_of_rows_needed}.

\section{Optimizing Additive Functions}

Let $\Omega$ be a universe of elements of size $n$. For $k \in [n]$, let $D_k$ be the uniform probability distribution over all sets of size $k$, and let $D_p$ be the product probability distribution over sets such that $\forall j \in \Omega, \mathbb{P}_{S \sim D_p}(j \in S) = p$. Note that $D_{k}$ and $D_{p}$ for $p \equiv \frac{k}{n}$ are very similar, and will be considered almost interchangeably throughout the following notes. Consider the following problem:

\begin{problem}
Let $f = (w_i)_{i \in \Omega}$ be an unknown additive function: $f(S) \equiv \sum_{u \in S} w_u$. How many (set, value) pairs $(S, f(S))$ where $S \sim D_k$ must be observed until we can optimize $f$ exactly or to a constant approximation factor under cardinality constraint $k$?
\end{problem}

Note that for observations $(S_i, f(S_i))_{i \in N}$ we can write the system $A w = b$, where each row of $A_i$ corresponds to the indicator vector of the set $S_i$ and $b_i \equiv f(S_i)$. From Corollary~\ref{cor:number_of_rows_needed}, it is easy to see that $\Omega(n^2)$ rows are sufficient for the system $Aw =b$ to be non-degenerate. Therefore, with $\Omega(n^2)$ observations from $D_{p}$ where $p \equiv \frac{k}{n}$, we can optimize any additive function exactly.

\bibliography{optimize}
\bibliographystyle{plain}


\end{document}