From d8a68c6917f5b6053117e0145f6d4d80a8bec26b Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Tue, 14 Apr 2015 15:20:19 -0400 Subject: Starting paper draft --- brainstorm/optimize.bib | 15 + brainstorm/results.tex | 616 ++++++++++++++++++++++++++++++++++++++++ optimize.bib | 15 - paper/main.tex | 42 +++ paper/optimize.bib | 15 + paper/sections/introduction.tex | 27 ++ paper/sections/negative.tex | 113 ++++++++ paper/sections/positive.tex | 365 ++++++++++++++++++++++++ results.tex | 616 ---------------------------------------- 9 files changed, 1193 insertions(+), 631 deletions(-) create mode 100644 brainstorm/optimize.bib create mode 100644 brainstorm/results.tex delete mode 100644 optimize.bib create mode 100644 paper/main.tex create mode 100644 paper/optimize.bib create mode 100644 paper/sections/introduction.tex create mode 100644 paper/sections/negative.tex create mode 100644 paper/sections/positive.tex delete mode 100644 results.tex diff --git a/brainstorm/optimize.bib b/brainstorm/optimize.bib new file mode 100644 index 0000000..3fb5309 --- /dev/null +++ b/brainstorm/optimize.bib @@ -0,0 +1,15 @@ +@article{vershynin2010introduction, + title={Introduction to the non-asymptotic analysis of random matrices}, + author={Vershynin, Roman}, + journal={arXiv preprint arXiv:1011.3027}, + year={2010} +} + +@inproceedings{balcan2011learning, + title={Learning submodular functions}, + author={Balcan, Maria-Florina and Harvey, Nicholas JA}, + booktitle={Proceedings of the forty-third annual ACM symposium on Theory of computing}, + pages={793--802}, + year={2011}, + organization={ACM} +} \ No newline at end of file diff --git a/brainstorm/results.tex b/brainstorm/results.tex new file mode 100644 index 0000000..4f2a70d --- /dev/null +++ b/brainstorm/results.tex @@ -0,0 +1,616 @@ +\documentclass[10pt]{article} +\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} +\usepackage[utf8x]{inputenc} + +\DeclareMathOperator{\E}{\mathbb{E}} +\let\P\relax +\DeclareMathOperator{\P}{\mathbb{P}} +\newcommand{\ex}[1]{\E\left[#1\right]} +\newcommand{\prob}[1]{\P\left[#1\right]} +\newcommand{\reals}{\mathbb{R}} +\newcommand{\ints}{\mathbb{N}} +\renewcommand{\O}{\mathcal{O}} + + +\newtheorem{proposition}{Proposition} +\newtheorem{corollary}{Corollary} +\newtheorem{problem}{Problem} +\newtheorem{theorem}{Theorem} +\newtheorem{claim}{Claim} +\newtheorem{remark}{Remark} + +\title{Learn and/or Optimize} +\author{} +\date{} + +\begin{document} +\maketitle + +\section{Preliminary Results} +\label{sec:matrix_theory} + +\subsection{Generic Random Matrix Theory} +We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40): + +\begin{proposition}\label{prop:non_isotropic_isometry} +Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent +sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix +$\Sigma$. Then for every $t \geq 0$, the following inequality holds with +probability at least: $1- 2 \exp(-ct^2)$: \begin{equation} \|\frac{1}{N} A^T A +- \Sigma \| \leq \max(\delta, \delta^2) \ \text{where}\ \delta = +C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}} \end{equation} where $C$, $c$ depend +only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$. +\end{proposition} + +The following result is a simple corollary of +Proposition~\ref{prop:non_isotropic_isometry}: + +\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and + $k \in ]0,n[$. 
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are + independent Bernoulli variable vectors in $\mathbb{R}^n$ such that + $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$ and let + $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$, then if $N > {(C+1)}^2 + n/\sigma^2$, the matrix A has full row rank with probability at least $1 - + e^{-cn}$, where $C, c$ are constant depending only on the subgaussian norm + of the rows $K \equiv \sup_{p \geq 1} + {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$\footnote{Is this true? + And how do the constants behave?} \end{corollary} + +\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice + that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it + follows that if $A^TA$ is invertible, then $A$ has full row rank. In other + words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ + has full row rank. Consider Prop.~\ref{prop:non_isotropic_isometry} with $t + \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$, then with probability $1 - + 2\exp(-cn)$: \begin{equation} \|\frac{1}{N} A^T A - \sigma I \| \leq + (C+1)\sqrt{\frac{n}{N}} \end{equation} + +It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2 +-\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma +- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then +$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ \end{proof} + +\subsection{More Direct Approach} + +Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as +a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen +uniformly at random among all binary rows of length $n$. A standard counting +arguments shows that the number of non-singular matrices over $\mathbb{F}_2$ +is: +\begin{displaymath} + N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1}) + = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right) +\end{displaymath} + +Hence the probability that our random matrix $A$ is invertible is: +\begin{displaymath} + P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right) +\end{displaymath} +It is easy to see that $P_n$ is a decreasing sequence. Its limit is +$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have +$\phi(\frac{1}{2})\approx +0.289$ and $P_n$ converges +exponentially fast to this constant (one way to see this is to use the +Pentagonal number theorem). + +Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the +probability that they will have full column rank is at least: +\begin{displaymath} + P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell} +\end{displaymath} + +Note that this is the probability of having full column rank over +$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full +column rank over $\mathbb{R}$. + +\paragraph{TODO:} Study the case where we only observe sets of size exactly +$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above +by: +\begin{displaymath} +{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j} +\end{displaymath} + +Thinking about it, why do we assume that the sample sets are of size exactly +$k$. I think it would make more sense from the learning perspective to consider +uniformly random sets. In this case, the above approach allows us to conclude +directly. + +More generally, I think the “right” way to think about this is to look at $A$ +as the adjacency matrix of a random $k$-regular graph. 
There are tons of +results about the probability of the adjacency matrix to be non singular. + +\section{Examples} + +\subsection{Generic Functions} + +\paragraph{TODO:} Add citations! + +These are generic examples and serve as building blocks for the applications +below: +\begin{itemize} + \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave + $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note + that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not + matter for the purpose of optimization: the sets are in the same order + with or without $g$, the only things which matter are the weights which + serve as natural parameters for this class of functions. This class of + functions contains: + \begin{itemize} + \item additive functions (when $g$ is the identity function). + \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the + case where the weights are $0/1$ and $g$ is the identity + function. + \item symmetric submodular functions: when the weights are all one. + \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for + some $B$. + \end{itemize} + \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in + \reals_+^{|N|}$. This class of functions is also naturally parametrized + by the weights. + \item (weighted) matroid rank functions. Given a matroid $M$ over a ground + set $N$, we define its rank function to be: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I| + \end{displaymath} + more generally, given a weight function $w:N\to\reals_+$, we define the + weighted matroid rank function: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i + \end{displaymath} +\end{itemize} + +\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid +rank function for the $1$-uniform matroid (the matroid where the independent +sets are the sets of size at most one). + +\subsection{Applications} + +\begin{itemize} + \item \emph{Coverage functions:} they can be written as a positive linear + combination of matroid rank functions: + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S) + \end{displaymath} + where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset, + \{u\}\big\}$. + \item \emph{Facility location:} (cite Bilmes) there is a universe + $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each + pair of locations. We pick a subset of locations $S$ and each point in + the universe is allocated to its closest location (the one with highest + proximity): + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v} + \end{displaymath} + This can be seen as a sum of weighted matroid rank functions: one for + each location in the universe associated with a $1$-uniform matroid + (other applications: job scheduling). + + \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases) + written as a graph cut function. For image segmentation the goal is to + minimize the cut. + \begin{displaymath} + f(S) = \sum_{e\in E} w_e c_e(S) + \end{displaymath} + where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think + this can be written as a matroid rank function. + \item \emph{Learning} (cite Krause) there is + a hypothesis $A$ (a random variable) which is “refined” when more + observations are made. Imagine that there is a finite set $X_1,\dots, + X_n$ of possible observations (random variables). 
Then, assuming that + the observations are independent conditioned on $A$, the information + gain: + \begin{displaymath} + f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big) + \end{displaymath} + is submodular. The $\log\det$ is the specific case of a linear + hypothesis observed with additional independent Gaussian noise. + \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots, + X_n)$ are random variables, then: + $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots, + X_n)$ are jointly multivariate gaussian, then: + \begin{displaymath} + f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T + \end{displaymath} + for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function. + \item \emph{data subset selection/summarization:} in statistical machine + translation, Bilmes used sum of concave over modular: + \begin{displaymath} + f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right) + \end{displaymath} + where each $f$ represents a feature, $w_f(e)$ represents how much of + $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when + we have a lot of a given feature. + Facility location functions are also commonly used for subset selection. + \item \emph{concave spectral functions} One would be tempted to say that + any multivariate concave function of a modular function is submodular. + This would be the natural generalization of concave over modular. + However \textbf{this is not true in general}. However, a possible nice + generalization is the following. Let $M$ be a symmetric $n\times + n$ matrix, and $g$ is a ``matrix concave'' function. Then: + \begin{displaymath} + f(S) = \mathrm{Tr}\big(g(M_S)\big) + \end{displaymath} + is submodular. This contains the $\log\det$ (when $g$ is the matrix + $\log$) but tons of other functions (like quantum entropy). +\end{itemize} +In summary, the two most general classes of submodular functions (which capture +all the examples known to man) are: sums of matrix concave functions and sums +of matroid rank functions. Sums of concave over modular are also nice if we +want to start with a simpler example. + +\section{Passive Optimization} + +Let $\Omega$ be the universe of elements and $f$ a function defined on subsets +of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a +collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let +$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by +$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often +(but not always) equal to $|\Omega|$. + +In general, we say we can efficiently optimize a function $f$ under constraint +$K$ when we have a polynomial-time algorithm making adaptive value queries to +$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$ +with high probability and $\alpha$ an absolute constant. + +Here, we consider the scenario where we cannot make adaptive value queries, and +in fact, where we cannot make queries at all! Instead, we suppose that we +observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken +from a known distribution $D$. We say we can efficiently \emph{passively +optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$ +when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is +an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S) +\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant. 
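+
+For concreteness, a canonical instantiation of this definition is the
+cardinality constraint $K = \{S \subseteq \Omega : |S| \leq k\}$ together with
+a product distribution $D$, e.g.\ one including each element independently
+with probability $k/n$; this is the setting underlying several of the examples
+below.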
+ +In the case of \emph{passive} observations of set-value pairs under a +distribution $D$ for a function $f$, recent research has focused on whether we +can efficiently and approximately \emph{learn} $f$. This was formalized in the +PMAC model from \cite{balcan2011learning}. When thinking about passive +optimization, it is necessary to understand the link between being able to + $D-PMAC$ learn $f$ and being able to $D-$optimize $f$. + +\subsection{Additive function} + +We consider here the simple case of additive functions. A function $f$ is +additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that +$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes +adopt the notation $w \equiv f(\Omega)$. + +\subsubsection{Inverting the system}\label{subsec:invert_the_system} + +Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{j \in N}$ with +$S_j \sim D$ where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$ +where for all $i$, the row $A_i = \chi_{S_i}$, the indicator vector of set +$S_i$. Let $B$ be the $n^c \times 1$ vector such that $\forall i, b_j \equiv +f(S_j)$. It is easy to see that if $w$ is the $n \times 1$ corresponding weight +vector for $f$ then: +\begin{displaymath} + A w = B +\end{displaymath} + +Note that if $A$ has full column rank, then we can solve for $w$ exactly and +also optimize $f$ under any cardinality constraint. We can therefore cast the +question of $D-$learning and $D-$optimizing $f$ as a random matrix theory +problem: what is the probability that after $n^c$ for $c > 0$ independent +samples from $D$, the matrix $A$ will have full rank? See +section~\ref{sec:matrix_theory} + +\paragraph{Extension} +Note that the previous reasoning can easily be extended to any \emph{almost} +additive function. Consider a function $g$ such that there exists $\alpha > 0$ +and $\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S) +\leq g(S) \leq \beta f(S)$, then by solving for $\max_{S\in K} f(S)$ we have a +$\alpha/\beta$-approximation to the optimum of $g$ since: + +\begin{displaymath} +g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq +\frac{\alpha}{\beta} g(OPT) +\end{displaymath} + +where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by +considering a function $g$ such that there exists $\alpha, \beta >0$, an +additive function $f$ and a bijective univariate function $\phi$, such that +$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we +solve the system $A w = \phi^{-1}(B)$ and obtain once again an +$\alpha/\beta$-approximation to the optimum of $g$. + +\begin{remark} +Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We +would like to understand whether being able to $D-$learn $f$ is really +necessary to $D-$optimizing it. In fact, many results for PMAC-learning more +complex functions, such as general submodular functions, are negative. Can we +hope to find positive results in cases where PMAC-learning is impossible? +\end{remark} + +\subsubsection{Average weight method} + +We consider here another method to $D-$optimizing $f$, which only requires +$D-$learning $f$ approximately. 
Note that for every element $i \in \Omega$, and +for a \emph{product} distribution $D$: +\begin{displaymath} + \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin + S) = \sum_{j \neq i \in \Omega} w_s \left(\mathbb{P}(j\in S|i \in S) - + \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i +\end{displaymath} + +Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and +let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S +\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator: +\begin{displaymath} + \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|} + \sum_{S \in O_i^c} f(S) +\end{displaymath} + +If $D$ is a product distribution such that $ \exists c > 0, \forall i, +\mathbb{P}(i) \geq c$, it is +easy to show that $\forall i \in +\Omega,\ \hat W_i \rightarrow_{|O| \rightarrow +\infty} w_i$ +By using standard concentration bounds, we can hope to obtain within $poly(n, +\frac{1}{\epsilon})$ observations: + + +%For every node $i$, we compute the \emph{average weight of every set containing +%element $i$}. Let $w_i$ be +%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and +%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim +%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$ + +%Note that the average weight of every set containing element $i$ preserves the +%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in +%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator: + +%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S | +%i \in S} \frac{f(S) - pw}{1-p} \end{equation} + +%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We +%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's +%lemma: + +%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq +%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}} +%\end{equation} + +%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look +%at additive bounds for these zeros.} + +%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in +%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this +%assumption, if we choose the $k$ elements with largest estimated weight +%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, +%where OPT is the value of the maximum weight set of $k$ elements for the +%function $f$. To ensure that $N_i$ is sufficiently large for each element, we +%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a +%classic union bound: + +%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq +%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq +%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation} + +%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i +%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$ + +%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in + %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > + %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a + %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with + %probability $XXX$. 
\end{proposition} + +\subsection{General submodular functions} + +\subsection{Parametric submodular functions} + +\textbf{Note:} all known examples of submodular functions are parametric. The +question is not parametric vs non-parametric but whether or not the number of +parameters is polynomial or exponential in $n$ (the size of the ground set). +For all examples I know except martroid rank functions, it seems to be +polynomial. + +\subsubsection{Influence Functions} + +Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we +can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$. +Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the +influence of the set $S \subseteq V$ in $G$ under the IC model with parameters +$p$. + +Recall from \cite{Kempe:03} that: +\begin{equation} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] +\end{equation} +where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with +probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of +\emph{there exists a path from $S$ to $v$ in $G_p$}. + +\begin{claim} +\label{cla:oracle} +If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v} +- \frac{1}{\alpha m}$, then: +\begin{displaymath} + \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1 + - 1/\alpha) \sigma_{G}(S,p) +\end{displaymath} +\end{claim} + +\begin{proof} + We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each + edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in + [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff + $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$). + It is clear from the construction that each edge $(u,v)$ will be present in + $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence: + \begin{displaymath} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] + \text{ and } + \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big] + \end{displaymath} + + By construction $G_{p'}$ is always a subgraph of $G_p$, i.e + $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand + side of the Claim's inequality. + + The probability that an edge is present in $G_p$ but not in $G_p'$ is at + most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha + m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that: + \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq + \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big] + \end{displaymath} + which proves the right-hand side of the claim after observing that + $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality + iff $m=1$. +\end{proof} + + We can use Claim~\ref{cla:oracle} to find a constant factor approximation + algorithm to maximising influence on $G$ by maximising influence on $G'$. For + $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and + $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous. + + +\begin{proposition} +\label{prop:approx_optim} +Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$ +and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$. 
+Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that +$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$ +\end{proposition} + +\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} - + \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle} + with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by + greedy maximisation on $\hat G$. It is a classic exercise to show then that + $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r). +\end{proof} + +A small note on the approximation factor: it is only $>0$ for $\alpha > 2$. +Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq +\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible +polynomial-time approximation ratio to influence maximisation unless $P = NP$. +Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m +\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 - +\exp(-\exp(-2/\alpha))$. + +\subsubsection{Active set selection of stationary Gaussian Processes} + +\section{Passive Optimization vs. Passive Learning} + +\subsection{Failed Attempt: returning max of observations} + +This doesn't work. Give examples as to why! Remember that there are strong +concentration results for submodular functions -> look at expected value of +observed sets + +\subsection{Example where optimization possible, learning impossible} + +Recall the matroid construction from~\cite{balcan2011learning}: +\begin{theorem} + For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A} + \subseteq2^{[n]}$ and a family of matroids $\cal{M} = \{M_{\cal{B}} : + \cal{B} \subseteq\cal{A} \}$ such that: + \begin{itemize} + \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in \cal{A}$ + \item For every $\cal{B} \subseteq\cal{A}$ and every $A \in \cal{A}$, we + have: + \begin{align*} + \text{rank}_{M_{\cal{B}}}(A) & = 8 \log k \ if A\in{\cal B} \\ + & = |A| \ if A\in {\cal A}\backslash{\cal B} + \end{align*} + \end{itemize} +\end{theorem} + +Consider the following subset of the above family of matroids: ${\cal +M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal +A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$. +Consider an \emph{unknown} function $f$, corresponding to the rank function of +one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long +as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can +optimize $f$ exactly! Indeed, the largest possible value for $f$ under +cardinality constraint $n^{1/3}$ is $\max_{A\in 2^{[n]}} |A| = n^{1/3}$. + +One example of a distribution under which this occurs with probability at least +a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$. For +$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$, +we will observe at least one element from $\cal{A}\backslash\cal{B}$ with +constant probability: + +\begin{equation} \nonumber \mathbb{P}(\bigcup_{S_i} S_i \in {\cal +A}\backslash{\cal B}) \geq 1 - (1 - \epsilon)^{n^c} \approx \epsilon n^c +\end{equation} + +However, it follows from the analysis of~\cite{balcan2011learning} that we +cannot learn $f$ under any distribution, even with active value queries! 
+Indeed, for any polynomially-sized set of observations, there exists a +super-polynomially number of functions in ${\cal M^1}$ which coincide on this +set of observations, but which take very different values outside of this set +of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal +A}\backslash {\cal B}$. + +{\bf TODO:} A cleaner simpler example would be nice. + +\section{Meeting notes: 04.03.2015} + +Consider the following function: +\begin{displaymath} + g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\} +\end{displaymath} +where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where +each element is included with probability $1/2$. Then with high probability all +the samples will have size roughly $n/2$, so you will observe $g(S)=|S|$ with +high probability. + +\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you +will be correct with high probability is $S$ is drawn from the same +distribution as above. + +\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you +never learn anything about $X$. + +\paragraph{Open Question:} Cook a similar example where $g$ is submodular and +where you are observing sets of the same size as your budget. + +\paragraph{Positive results:} Try to obtain guarantees about learning +parameters of parametric submodular functions and show whether or not these +guarantees are sufficient for optimization. First look at learning weights in +a cover function. Maybe facility location? Sums of concave over modular are +probably too hard because of the connection to neural networks. + +\section{Which learning guarantees imply optimization?} + +Here, we consider the following question: which learning guarantees imply +optimization? For example, \cite{TODO} provides the following guarantee for +cover functions: + +\begin{theorem} +There exists an algorithm such that for any $\epsilon>0$ given random and +uniform examples of a coverage function $c$ outputs a hypothesis, which is also +a coverage function $h$, such that with probability $2/3$: $\mathbb{E}_{\cal +U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n) +\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/epsilon)$ +and examples, where $s = \min\{size(c), (1/\epsilon)^{\log(1/\epsilon)}\}$. +\end{theorem} + +We would like to understand to what extent this $\ell1$-bound allows us to +optimize the coverage function $c$ under cardinality constraints using the +hypothesis $h$. Let us consider the simpler case of an additive function, and +suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) - +c(x)|]$ where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv \sum_i +\hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$? + +As it turns out, yes. Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let +$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection +of good sets ${\cal G} \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}| +\geq \frac{3}{4}\cdot2^n$. Suppose the contrary, there is at least a quarter of +the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal +U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| > +\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element +$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection +of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. 
Notice that +$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal +S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal +S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that +$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$ +and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$. + +\bibliography{optimize} \bibliographystyle{apalike} + + +\end{document} diff --git a/optimize.bib b/optimize.bib deleted file mode 100644 index 3fb5309..0000000 --- a/optimize.bib +++ /dev/null @@ -1,15 +0,0 @@ -@article{vershynin2010introduction, - title={Introduction to the non-asymptotic analysis of random matrices}, - author={Vershynin, Roman}, - journal={arXiv preprint arXiv:1011.3027}, - year={2010} -} - -@inproceedings{balcan2011learning, - title={Learning submodular functions}, - author={Balcan, Maria-Florina and Harvey, Nicholas JA}, - booktitle={Proceedings of the forty-third annual ACM symposium on Theory of computing}, - pages={793--802}, - year={2011}, - organization={ACM} -} \ No newline at end of file diff --git a/paper/main.tex b/paper/main.tex new file mode 100644 index 0000000..58b6b22 --- /dev/null +++ b/paper/main.tex @@ -0,0 +1,42 @@ +\documentclass[10pt]{article} +\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} +\usepackage[utf8x]{inputenc} + +\DeclareMathOperator{\E}{\mathbb{E}} +\let\P\relax +\DeclareMathOperator{\P}{\mathbb{P}} +\newcommand{\ex}[1]{\E\left[#1\right]} +\newcommand{\prob}[1]{\P\left[#1\right]} +\newcommand{\reals}{\mathbb{R}} +\newcommand{\ints}{\mathbb{N}} +\renewcommand{\O}{\mathcal{O}} + + +\newtheorem{proposition}{Proposition} +\newtheorem{corollary}{Corollary} +\newtheorem{problem}{Problem} +\newtheorem{theorem}{Theorem} +\newtheorem{claim}{Claim} +\newtheorem{remark}{Remark} + +\title{Optimizing Submodular Functions in the Face of Uncertainty} +\author{} +\date{} + +\begin{document} +\maketitle + +\section{Introduction} +\input{sections/introduction.tex} + +\section{Learning vs. Optimizing} +\input{sections/negative.tex} + +\section{Positive results} +\input{sections/positive.tex} + +\bibliographystyle{apalike} +\bibliography{optimize} + + +\end{document} diff --git a/paper/optimize.bib b/paper/optimize.bib new file mode 100644 index 0000000..3fb5309 --- /dev/null +++ b/paper/optimize.bib @@ -0,0 +1,15 @@ +@article{vershynin2010introduction, + title={Introduction to the non-asymptotic analysis of random matrices}, + author={Vershynin, Roman}, + journal={arXiv preprint arXiv:1011.3027}, + year={2010} +} + +@inproceedings{balcan2011learning, + title={Learning submodular functions}, + author={Balcan, Maria-Florina and Harvey, Nicholas JA}, + booktitle={Proceedings of the forty-third annual ACM symposium on Theory of computing}, + pages={793--802}, + year={2011}, + organization={ACM} +} \ No newline at end of file diff --git a/paper/sections/introduction.tex b/paper/sections/introduction.tex new file mode 100644 index 0000000..bcb406f --- /dev/null +++ b/paper/sections/introduction.tex @@ -0,0 +1,27 @@ +Let $\Omega$ be the universe of elements and $f$ a function defined on subsets +of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a +collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let +$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by +$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often +(but not always) equal to $|\Omega|$. 
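+
+As a concrete running example (used repeatedly in the following sections), one
+may take $K$ to be a cardinality constraint,
+\begin{displaymath}
+    K = \{S \subseteq \Omega : |S| \leq k\},
+\end{displaymath}
+in which case $S^*$ is simply a set of maximum value among the sets of size at
+most $k$.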
+
+In general, we say we can efficiently optimize a function $f$ under constraint
+$K$ when we have a polynomial-time algorithm making adaptive value queries to
+$f$, which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
+with high probability, for an absolute constant $\alpha$.
+
+Here, we consider the scenario where we cannot make adaptive value queries, and
+in fact, where we cannot make queries at all! Instead, we suppose that we
+observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is drawn
+from a known distribution $D$. We say we can efficiently \emph{passively
+optimize} $f$ under distribution $D$, or $D$-optimize $f$ under constraint $K$,
+when, after observing ${\cal O}(L^c)$ set-value pairs from $D$, where $c > 0$ is
+an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
+\geq \alpha f(S^*)$ with high probability, for an absolute constant $\alpha$.
+
+In the case of \emph{passive} observations of set-value pairs under a
+distribution $D$ for a function $f$, recent research has focused on whether we
+can efficiently and approximately \emph{learn} $f$. This was formalized in the
+PMAC model of \cite{balcan2011learning}. When thinking about passive
+optimization, it is necessary to understand the link between being able to
+$D$-PMAC learn $f$ and being able to $D$-optimize $f$.
diff --git a/paper/sections/negative.tex b/paper/sections/negative.tex
new file mode 100644
index 0000000..f88117a
--- /dev/null
+++ b/paper/sections/negative.tex
@@ -0,0 +1,113 @@
+\subsection{Learnable $\nRightarrow$ Optimizable}
+
+Consider the following function:
+\begin{displaymath}
+    g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\}
+\end{displaymath}
+where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where
+each element is included independently with probability $1/2$. Then with high
+probability all the samples have size roughly $n/2$, so you will observe
+$g(S) = (1+o(1))\,|S|$ with high probability (a back-of-the-envelope
+calculation is given at the end of this subsection).
+
+\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you
+will be approximately correct with high probability if $S$ is drawn from the
+same distribution as above.
+
+\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you
+never learn anything about $X$.
+
+\paragraph{Open Question:} Construct a similar example where $g$ is submodular
+and where the observed sets have the same size as the budget.
+
+\paragraph{Positive results:} Try to obtain guarantees about learning the
+parameters of parametric submodular functions and show whether or not these
+guarantees are sufficient for optimization. First look at learning weights in
+a coverage function. Maybe facility location? Sums of concave over modular are
+probably too hard because of the connection to neural networks.
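+
+The following back-of-the-envelope calculation supports both claims (the exact
+high-probability statements remain to be checked). Under the sampling
+distribution above, $|S| = n/2 + \O(\sqrt{n\log n})$ and $|X\cap S| =
+\sqrt{n}/2 + \O(n^{1/4}\sqrt{\log n})$ with high probability, so that
+\begin{displaymath}
+    g(S) = \max\left\{\frac{n}{2} + \O\big(n^{3/4}\sqrt{\log n}\big),\,
+    |S|\right\} = \big(1 + \O(n^{-1/4}\sqrt{\log n})\big)\,|S|,
+\end{displaymath}
+i.e.\ outputting $|S|$ is correct up to a $1+o(1)$ multiplicative factor. On
+the other hand, under budget $\sqrt{n}$ the optimum is $g(X) =
+\sqrt{n}\,|X\cap X| = n$, whereas a set of size $\sqrt{n}$ chosen without any
+information about $X$ typically satisfies $|X\cap S| = \O(\log n)$ and hence
+$g(S) = \O(\sqrt{n}\log n)$: an approximation gap of order $\sqrt{n}$ up to
+logarithmic factors.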
+
+\subsection{Optimizable $\nRightarrow$ Learnable}
+
+Recall the matroid construction from~\cite{balcan2011learning}:
+\begin{theorem}
+  For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A}
+  \subseteq 2^{[n]}$ and a family of matroids ${\cal M} = \{M_{\cal B} :
+  {\cal B} \subseteq {\cal A} \}$ such that:
+  \begin{itemize}
+      \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in {\cal A}$
+      \item For every ${\cal B} \subseteq {\cal A}$ and every $A \in {\cal A}$, we
+          have:
+          \begin{align*}
+              \text{rank}_{M_{\cal B}}(A) & = 8 \log k && \text{if } A\in{\cal B} \\
+                                          & = |A| && \text{if } A\in {\cal A}\backslash{\cal B}
+          \end{align*}
+  \end{itemize}
+\end{theorem}
+
+Consider the following subset of the above family of matroids: ${\cal
+M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq {\cal A} \wedge |{\cal
+A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$.
+Consider an \emph{unknown} function $f$, corresponding to the rank function of
+one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long
+as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can
+optimize $f$ exactly! Indeed, since a rank function satisfies $f(A) \leq |A|$,
+the largest possible value of $f$ under cardinality constraint $n^{1/3}$ is
+$n^{1/3}$, and it is attained by any $A \in {\cal A}\backslash{\cal B}$.
+
+One example of a distribution under which this occurs with high probability is
+$D_u$, the uniform distribution over the sets of ${\cal A}$. For $c>0$, after
+$n^c$ observations $O \equiv {(S_i, f(S_i))}_i$ with $S_i \sim D_u$, we observe
+at least one set from ${\cal A}\backslash{\cal B}$ with high probability:
+
+\begin{equation} \nonumber
+    \mathbb{P}\big(\exists i,\; S_i \in {\cal A}\backslash{\cal B}\big)
+    \geq 1 - (1 - \epsilon)^{n^c} \geq 1 - e^{-\epsilon n^c}.
+\end{equation}
+
+However, it follows from the analysis of~\cite{balcan2011learning} that we
+cannot learn $f$ under any distribution, even with active value queries!
+Indeed, for any polynomially-sized set of observations, there exists a
+super-polynomial number of functions in ${\cal M}^{\epsilon}$ which coincide on
+this set of observations, but which take very different values outside of it:
+$8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal
+A}\backslash {\cal B}$.
+
+{\bf TODO:} A cleaner, simpler example would be nice.
+
+\subsection{Which learning guarantees imply optimization?}
+
+Here, we consider the following question: which learning guarantees imply
+optimization? For example, \cite{TODO} provides the following guarantee for
+coverage functions:
+
+\begin{theorem}
+There exists an algorithm which, for any $\epsilon>0$, given random and
+uniform examples of a coverage function $c$, outputs a hypothesis $h$, which is
+also a coverage function, such that with probability $2/3$: $\mathbb{E}_{\cal
+U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n)
+\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/\epsilon)$
+examples, where $s = \min\{\text{size}(c), (1/\epsilon)^{\log(1/\epsilon)}\}$.
+\end{theorem}
+
+We would like to understand to what extent this $\ell_1$-bound allows us to
+optimize the coverage function $c$ under cardinality constraints using the
+hypothesis $h$. Let us consider the simpler case of an additive function, and
+suppose we have a similar guarantee $\mathbb{E}_{x \sim {\cal U}}[|h(x) -
+c(x)|] \leq \epsilon$, where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x)
+\equiv \sum_i \hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$?
+
+As it turns out, yes.
Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let +$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection +of good sets ${\cal G} \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}| +\geq \frac{3}{4}\cdot2^n$. Suppose the contrary, there is at least a quarter of +the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal +U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| > +\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element +$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection +of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. Notice that +$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal +S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal +S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that +$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$ +and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$. + + + diff --git a/paper/sections/positive.tex b/paper/sections/positive.tex new file mode 100644 index 0000000..c4e8e0f --- /dev/null +++ b/paper/sections/positive.tex @@ -0,0 +1,365 @@ +\subsection{Applications} + +\paragraph{TODO:} Add citations! + +\subsubsection{Building blocks} + +These are generic examples and serve as building blocks for the applications +below: +\begin{itemize} + \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave + $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note + that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not + matter for the purpose of optimization: the sets are in the same order + with or without $g$, the only things which matter are the weights which + serve as natural parameters for this class of functions. This class of + functions contains: + \begin{itemize} + \item additive functions (when $g$ is the identity function). + \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the + case where the weights are $0/1$ and $g$ is the identity + function. + \item symmetric submodular functions: when the weights are all one. + \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for + some $B$. + \end{itemize} + \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in + \reals_+^{|N|}$. This class of functions is also naturally parametrized + by the weights. + \item (weighted) matroid rank functions. Given a matroid $M$ over a ground + set $N$, we define its rank function to be: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I| + \end{displaymath} + more generally, given a weight function $w:N\to\reals_+$, we define the + weighted matroid rank function: + \begin{displaymath} + \forall S\subseteq N,\; + r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i + \end{displaymath} +\end{itemize} + +\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid +rank function for the $1$-uniform matroid (the matroid where the independent +sets are the sets of size at most one). + +\subsubsection{Examples} + +\begin{itemize} + \item \emph{Coverage functions:} they can be written as a positive linear + combination of matroid rank functions: + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S) + \end{displaymath} + where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset, + \{u\}\big\}$. 
+ \item \emph{Facility location:} (cite Bilmes) there is a universe + $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each + pair of locations. We pick a subset of locations $S$ and each point in + the universe is allocated to its closest location (the one with highest + proximity): + \begin{displaymath} + f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v} + \end{displaymath} + This can be seen as a sum of weighted matroid rank functions: one for + each location in the universe associated with a $1$-uniform matroid + (other applications: job scheduling). + + \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases) + written as a graph cut function. For image segmentation the goal is to + minimize the cut. + \begin{displaymath} + f(S) = \sum_{e\in E} w_e c_e(S) + \end{displaymath} + where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think + this can be written as a matroid rank function. + \item \emph{Learning} (cite Krause) there is + a hypothesis $A$ (a random variable) which is “refined” when more + observations are made. Imagine that there is a finite set $X_1,\dots, + X_n$ of possible observations (random variables). Then, assuming that + the observations are independent conditioned on $A$, the information + gain: + \begin{displaymath} + f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big) + \end{displaymath} + is submodular. The $\log\det$ is the specific case of a linear + hypothesis observed with additional independent Gaussian noise. + \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots, + X_n)$ are random variables, then: + $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots, + X_n)$ are jointly multivariate gaussian, then: + \begin{displaymath} + f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T + \end{displaymath} + for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function. + \item \emph{data subset selection/summarization:} in statistical machine + translation, Bilmes used sum of concave over modular: + \begin{displaymath} + f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right) + \end{displaymath} + where each $f$ represents a feature, $w_f(e)$ represents how much of + $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when + we have a lot of a given feature. + Facility location functions are also commonly used for subset selection. + \item \emph{concave spectral functions} One would be tempted to say that + any multivariate concave function of a modular function is submodular. + This would be the natural generalization of concave over modular. + However \textbf{this is not true in general}. However, a possible nice + generalization is the following. Let $M$ be a symmetric $n\times + n$ matrix, and $g$ is a ``matrix concave'' function. Then: + \begin{displaymath} + f(S) = \mathrm{Tr}\big(g(M_S)\big) + \end{displaymath} + is submodular. This contains the $\log\det$ (when $g$ is the matrix + $\log$) but tons of other functions (like quantum entropy). +\end{itemize} +In summary, the two most general classes of submodular functions (which capture +all the examples known to man) are: sums of matrix concave functions and sums +of matroid rank functions. Sums of concave over modular are also nice if we +want to start with a simpler example. + +\subsection{Additive Functions} + +We consider here the simple case of additive functions. A function $f$ is +additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that +$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. 
We will sometimes identify $f$ with its weight vector $w \equiv
+(w_s)_{s\in\Omega}$.
+
+Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{1 \leq j \leq n^c}$
+with $S_j \sim D$, where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$
+whose rows are $A_j = \chi_{S_j}$, the indicator vector of set $S_j$, and let
+$B$ be the $n^c \times 1$ vector with $B_j \equiv f(S_j)$. It is easy to see
+that if $w$ is the corresponding $n \times 1$ weight vector of $f$, then:
+\begin{displaymath}
+    A w = B
+\end{displaymath}
+
+Note that if $A$ has full column rank, then we can solve for $w$ exactly and
+also optimize $f$ under any cardinality constraint. We can therefore cast the
+question of $D$-learning and $D$-optimizing $f$ as a random matrix theory
+problem: what is the probability that, after $n^c$ (for $c > 0$) independent
+samples from $D$, the matrix $A$ has full column rank? See
+Section~\ref{sec:matrix_theory}.
+
+\paragraph{Extension}
+Note that the previous reasoning can easily be extended to any \emph{almost}
+additive function. Consider a function $g$ such that there exist $\alpha > 0$,
+$\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S)
+\leq g(S) \leq \beta f(S)$; then by solving for $\max_{S\in K} f(S)$ we obtain an
+$\alpha/\beta$-approximation to the optimum of $g$ since:
+
+\begin{displaymath}
+g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
+\frac{\alpha}{\beta} g(OPT)
+\end{displaymath}
+
+where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
+considering a function $g$ such that there exist $\alpha, \beta >0$, an
+additive function $f$ and an increasing bijective univariate function $\phi$,
+such that $\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In
+this case, we solve the system $A w = \phi^{-1}(B)$ and obtain once again an
+$\alpha/\beta$-approximation to the optimum of $g$.
+
+
+\subsubsection{Average weight method}
+
+We consider here another method for $D$-optimizing $f$, which only requires
+$D$-learning $f$ approximately. Note that for every element $i \in \Omega$, and
+for a \emph{product} distribution $D$:
+\begin{displaymath}
+    \mathbb{E}_{S \sim D}(f(S)\,|\,i \in S) - \mathbb{E}_{S \sim D}(f(S)\,|\,i \notin
+    S) = \sum_{j \in \Omega,\, j \neq i} w_j \left(\mathbb{P}(j\in S\,|\,i \in S) -
+    \mathbb{P}(j \in S\,|\,i \notin S)\right) + w_i(1 - 0) = w_i
+\end{displaymath}
+since, for a product distribution, $\mathbb{P}(j\in S\,|\,i \in S) =
+\mathbb{P}(j \in S\,|\,i \notin S)$ for every $j \neq i$.
+
+Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and
+let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S
+\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator:
+\begin{displaymath}
+    \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|}
+    \sum_{S \in O_i^c} f(S)
+\end{displaymath}
+
+If $D$ is a product distribution such that $\exists c > 0, \forall i,
+\mathbb{P}(i \in S) \geq c$, it is easy to show that $\forall i \in
+\Omega,\ \hat W_i \rightarrow w_i$ as $|O| \rightarrow +\infty$.
+By standard concentration bounds, we can hope to estimate every weight $w_i$
+to within an additive $\epsilon$ from $\text{poly}(n, \frac{1}{\epsilon})$
+observations.
+
+\subsubsection{Generic Random Matrix Theory}
+\label{sec:matrix_theory}
+
+We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40):
+
+\begin{proposition}\label{prop:non_isotropic_isometry}
+Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent
+sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix
+$\Sigma$.
Then for every $t \geq 0$, the following inequality holds with
+probability at least $1- 2 \exp(-ct^2)$:
+\begin{equation}
+    \left\|\frac{1}{N} A^T A - \Sigma \right\| \leq \max(\delta, \delta^2)
+    \quad\text{where}\quad \delta = C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}}
+\end{equation}
+where $C$, $c$ depend only on the subgaussian norm of the rows
+$K \equiv \max_i \| A_i\|_{\psi_2}$.
+\end{proposition}
+
+The following result is a simple corollary of
+Proposition~\ref{prop:non_isotropic_isometry}:
+
+\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and
+  $k \in ]0,n[$. Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are
+  independent Bernoulli random vectors in $\mathbb{R}^n$ such that
+  $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$, and let
+  $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$. If $N > {(C+1)}^2
+  n/\sigma^2$, then the matrix $A$ has full column rank with probability at least
+  $1 - e^{-cn}$, where $C, c$ are constants depending only on the subgaussian norm
+  of the rows $K \equiv \sup_{p \geq 1}
+  {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$.\footnote{Is this true?
+  And how do the constants behave?}
+\end{corollary}
+
+\begin{proof} Comparing the kernels of $A$ and $A^TA$ shows
+  that $\mathrm{rank}(A) = \mathrm{rank}(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it
+  follows that if $A^TA$ is invertible, then $A$ has full column rank. In other
+  words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$
+  has full column rank. Apply Prop.~\ref{prop:non_isotropic_isometry} with $t
+  \equiv \sqrt{n}$ and $N > {(C+1)}^{2} n$: with probability $1 -
+  2\exp(-cn)$,
+  \begin{equation}
+      \left\|\frac{1}{N} A^T A - \sigma I \right\| \leq (C+1)\sqrt{\frac{n}{N}}.
+  \end{equation}
+
+It follows that for any unit vector $x \in \mathbb{R}^n$, $\left|\frac{1}{N}\|A x\|^2_2
+-\sigma \right| \leq (C+1)\sqrt{\frac{n}{N}}$, and hence $\|A x\|^2_2 \geq N\left(\sigma
+- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then
+$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$.
+\end{proof}
+
+\subsubsection{Algebraic Approach}
+
+Here we work over $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as
+a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen
+uniformly at random among all binary rows of length $n$. A standard counting
+argument shows that the number of non-singular $n \times n$ matrices over
+$\mathbb{F}_2$ is:
+\begin{displaymath}
+    N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1})
+    = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
+\end{displaymath}
+
+Hence the probability that our random matrix $A$ is invertible is:
+\begin{displaymath}
+    P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right)
+\end{displaymath}
+It is easy to see that $P_n$ is a decreasing sequence. Its limit is
+$\phi(\frac{1}{2})$, where $\phi$ is the Euler function. We have
+$\phi(\frac{1}{2})\approx 0.289$, and $P_n$ converges
+exponentially fast to this constant (one way to see this is to use the
+pentagonal number theorem).
+
+Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the
+probability that they have full column rank is at least:
+\begin{displaymath}
+    P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell}
+\end{displaymath}
+
+Note that this is the probability of having full column rank over
+$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full
+column rank over $\mathbb{R}$.
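+
+As a sanity check on the constants, the bound above can be obtained by
+splitting the $\ell\cdot n$ observed rows into $\ell$ independent $n\times n$
+blocks: the stacked matrix has full column rank as soon as a single block is
+invertible over $\mathbb{F}_2$, and each block is invertible with probability
+$P_n \geq \phi(\frac{1}{2})$. Numerically, $1 - \phi(\frac{1}{2}) \approx
+0.711$, so the failure probability satisfies
+\begin{displaymath}
+    1 - P_{\ell\cdot n} \leq {(0.711)}^{\ell},
+    \qquad\text{e.g.}\quad \ell = 10 \implies 1 - P_{\ell\cdot n} \leq 0.04,
+\end{displaymath}
+and in general $\ell = \O\big(\log(1/\delta)\big)$ blocks suffice for failure
+probability $\delta$.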
+ +\paragraph{TODO:} Study the case where we only observe sets of size exactly +$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above +by: +\begin{displaymath} +{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j} +\end{displaymath} + +Thinking about it, why do we assume that the sample sets are of size exactly +$k$. I think it would make more sense from the learning perspective to consider +uniformly random sets. In this case, the above approach allows us to conclude +directly. + +More generally, I think the “right” way to think about this is to look at $A$ +as the adjacency matrix of a random $k$-regular graph. There are tons of +results about the probability of the adjacency matrix being non singular. + +\subsection{Influence Functions} + +Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we +can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$. +Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the +influence of the set $S \subseteq V$ in $G$ under the IC model with parameters +$p$. + +Recall from \cite{Kempe:03} that: +\begin{equation} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] +\end{equation} +where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with +probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of +\emph{there exists a path from $S$ to $v$ in $G_p$}. + +\begin{claim} +\label{cla:oracle} +If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v} +- \frac{1}{\alpha m}$, then: +\begin{displaymath} + \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1 + - 1/\alpha) \sigma_{G}(S,p) +\end{displaymath} +\end{claim} + +\begin{proof} + We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each + edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in + [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff + $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$). + It is clear from the construction that each edge $(u,v)$ will be present in + $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence: + \begin{displaymath} + \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] + \text{ and } + \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big] + \end{displaymath} + + By construction $G_{p'}$ is always a subgraph of $G_p$, i.e + $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand + side of the Claim's inequality. + + The probability that an edge is present in $G_p$ but not in $G_p'$ is at + most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha + m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that: + \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq + \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big] + \end{displaymath} + which proves the right-hand side of the claim after observing that + $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality + iff $m=1$. +\end{proof} + + We can use Claim~\ref{cla:oracle} to find a constant factor approximation + algorithm to maximising influence on $G$ by maximising influence on $G'$. For + $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and + $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous. 
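+
+Note that if $|p_{u,v} - p'_{u,v}| \leq \frac{1}{\alpha m}$ for every pair
+$(u,v)$, then the shifted parameters $\hat p_{u,v} \equiv p'_{u,v} -
+\frac{1}{\alpha m}$ satisfy
+\begin{displaymath}
+    p_{u,v} \geq \hat p_{u,v} \geq p_{u,v} - \frac{2}{\alpha m}
+    = p_{u,v} - \frac{1}{(\alpha/2)\, m},
+\end{displaymath}
+so Claim~\ref{cla:oracle} applies to $(p, \hat p)$ with $\alpha/2$ in place of
+$\alpha$; this observation drives the proof of the following proposition.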
+ + +\begin{proposition} +\label{prop:approx_optim} +Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$ +and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$. +Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that +$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$ +\end{proposition} + +\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} - + \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle} + with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by + greedy maximisation on $\hat G$. It is a classic exercise to show then that + $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r). +\end{proof} + +A small note on the approximation factor: it is only $>0$ for $\alpha > 2$. +Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq +\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible +polynomial-time approximation ratio to influence maximisation unless $P = NP$. +Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m +\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 - +\exp(-\exp(-2/\alpha))$. diff --git a/results.tex b/results.tex deleted file mode 100644 index 31801fc..0000000 --- a/results.tex +++ /dev/null @@ -1,616 +0,0 @@ -\documentclass[10pt]{article} -\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} -\usepackage[utf8x]{inputenc} - -\DeclareMathOperator{\E}{\mathbb{E}} -\let\P\relax -\DeclareMathOperator{\P}{\mathbb{P}} -\newcommand{\ex}[1]{\E\left[#1\right]} -\newcommand{\prob}[1]{\P\left[#1\right]} -\newcommand{\reals}{\mathbb{R}} -\newcommand{\ints}{\mathbb{N}} -\renewcommand{\O}{\mathcal{O}} - - -\newtheorem{proposition}{Proposition} -\newtheorem{corollary}{Corollary} -\newtheorem{problem}{Problem} -\newtheorem{theorem}{Theorem} -\newtheorem{claim}{Claim} -\newtheorem{remark}{Remark} - -\title{Learn and/or Optimize} -\author{} -\date{} - -\begin{document} -\maketitle - -\section{Preliminary Results} -\label{sec:matrix_theory} - -\subsection{Generic Random Matrix Theory} -We cite the following result from~\cite{vershynin2010introduction} (Remark 5.40): - -\begin{proposition}\label{prop:non_isotropic_isometry} -Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are independent -sub-gaussian random vectors in $\mathbb{R}^n$ with second moment matrix -$\Sigma$. Then for every $t \geq 0$, the following inequality holds with -probability at least: $1- 2 \exp(-ct^2)$: \begin{equation} \|\frac{1}{N} A^T A -- \Sigma \| \leq \max(\delta, \delta^2) \ \text{where}\ \delta = -C\sqrt{\frac{n}{N}} + \frac{t}{\sqrt{N}} \end{equation} where $C$, $c$ depend -only on the subgaussian norm of the rows $K \equiv \max_i \| A_i\|_{\psi_2}$. -\end{proposition} - -The following result is a simple corollary of -Proposition~\ref{prop:non_isotropic_isometry}: - -\begin{corollary}\label{cor:number_of_rows_needed} Let $n \in \mathbb{N}$ and - $k \in ]0,n[$. 
Assume that $A$ is an $N \times n$ matrix whose rows $A_i$ are - independent Bernoulli variable vectors in $\mathbb{R}^n$ such that - $\mathbb{P}(A_{i,j} = 1) = \frac{k}{n} = 1 - \mathbb{P}(A_{i,j} = 0)$ and let - $\sigma \equiv \frac{k}{n}(1- \frac{k}{n}) \neq 0$, then if $N > {(C+1)}^2 - n/\sigma^2$, the matrix A has full row rank with probability at least $1 - - e^{-cn}$, where $C, c$ are constant depending only on the subgaussian norm - of the rows $K \equiv \sup_{p \geq 1} - {\frac{1}{\sqrt{p}}(\mathbb{E}|A_1|^p)}^\frac{1}{p} = k$\footnote{Is this true? - And how do the constants behave?} \end{corollary} - -\begin{proof} It is easy to compare the kernels of $A$ and $A^TA$ and notice - that $rank(A) = rank(A^T A)$. Since $A^TA$ is an $n \times n$ matrix, it - follows that if $A^TA$ is invertible, then $A$ has full row rank. In other - words, if $A$ has smallest singular value $\sigma_{\min}(A) > 0$, then $A$ - has full row rank. Consider Prop.~\ref{prop:non_isotropic_isometry} with $t - \equiv \sqrt{n}$ and with $N > {(C+1)}^{2} n$, then with probability $1 - - 2\exp(-cn)$: \begin{equation} \|\frac{1}{N} A^T A - \sigma I \| \leq - (C+1)\sqrt{\frac{n}{N}} \end{equation} - -It follows that for any vector $x \in \mathbb{R}^n$, $|\frac{1}{N}\|A x\|^2_2 --\sigma | \leq (C+1)\sqrt{\frac{n}{N}} \implies \|A x\|^2_2 \geq N\left(\sigma -- (C+1)\sqrt{\frac{n}{N}}\right)$. If $N > {(C+1)}^2n/\sigma^2$, then -$\sigma_{\min}(A) > 0$ with probability at least $1 - e^{-cn}$ \end{proof} - -\subsection{More Direct Approach} - -Here we work on $\mathbb{F}_2$. An $n\times n$ binary matrix can be seen as -a matrix over $\mathbb{F}_2$. Let us assume that each row of $A$ is chosen -uniformly at random among all binary rows of length $n$. A standard counting -arguments shows that the number of non-singular matrices over $\mathbb{F}_2$ -is: -\begin{displaymath} - N_n = (2^n-1)(2^n - 2)\dots (2^n- 2^{n-1}) - = (2^n)^n\prod_{i=1}^n\left(1-\frac{1}{2^i}\right) -\end{displaymath} - -Hence the probability that our random matrix $A$ is invertible is: -\begin{displaymath} - P_n = \prod_{i=1}^n\left(1-\frac{1}{2^i}\right) -\end{displaymath} -It is easy to see that $P_n$ is a decreasing sequence. Its limit is -$\phi(\frac{1}{2})$ where $\phi$ is the Euler function. We have -$\phi(\frac{1}{2})\approx -0.289$ and $P_n$ converges -exponentially fast to this constant (one way to see this is to use the -Pentagonal number theorem). - -Hence, if we observe $\ell\cdot n$ uniformly random binary rows, the -probability that they will have full column rank is at least: -\begin{displaymath} - P_{\ell\cdot n}\geq 1 - \left(1-\phi\left(\frac{1}{2}\right)\right)^{\ell} -\end{displaymath} - -Note that this is the probability of having full column rank over -$\mathbb{F}_2$. A standard linear algebra argument shows that this implies full -column rank over $\mathbb{R}$. - -\paragraph{TODO:} Study the case where we only observe sets of size exactly -$k$, or at most $k$. This amounts to replacing $2^n$ in the computation above -by: -\begin{displaymath} -{n\choose k}\quad\text{or}\quad\sum_{j=0}^k{n\choose j} -\end{displaymath} - -Thinking about it, why do we assume that the sample sets are of size exactly -$k$. I think it would make more sense from the learning perspective to consider -uniformly random sets. In this case, the above approach allows us to conclude -directly. - -More generally, I think the “right” way to think about this is to look at $A$ -as the adjacency matrix of a random $k$-regular graph. 
There are tons of -results about the probability of the adjacency matrix to be non singular. - -\section{Examples} - -\subsection{Generic Functions} - -\paragraph{TODO:} Add citations! - -These are generic examples and serve as building blocks for the applications -below: -\begin{itemize} - \item $f(S) = g\left(\sum_{i\in S} w_i\right)$ for a concave - $g:\reals\to\reals$ and weights $(w_i)_{i\in N}\in \reals_+^{|N|}$. Note - that $f$ is monotone iff $g$ is monotone. In this case, $g$ does not - matter for the purpose of optimization: the sets are in the same order - with or without $g$, the only things which matter are the weights which - serve as natural parameters for this class of functions. This class of - functions contains: - \begin{itemize} - \item additive functions (when $g$ is the identity function). - \item $f(S) = |S\cap X|$ for some set $X\subseteq N$. This is the - case where the weights are $0/1$ and $g$ is the identity - function. - \item symmetric submodular functions: when the weights are all one. - \item budget-additive functions, when $g:x\mapsto \min(B, x)$ for - some $B$. - \end{itemize} - \item $f(S) = \max_{i\in S} w_i$ for weights $(w_i)_{i\in N}\in - \reals_+^{|N|}$. This class of functions is also naturally parametrized - by the weights. - \item (weighted) matroid rank functions. Given a matroid $M$ over a ground - set $N$, we define its rank function to be: - \begin{displaymath} - \forall S\subseteq N,\; - r(S) = \max_{\substack{I\subseteq S\\I\in M}} |I| - \end{displaymath} - more generally, given a weight function $w:N\to\reals_+$, we define the - weighted matroid rank function: - \begin{displaymath} - \forall S\subseteq N,\; - r(S) = \max_{\substack{I\subseteq S\\I\in M}} \sum_{i\in I} w_i - \end{displaymath} -\end{itemize} - -\paragraph{Remark} The function $f(S)= \max_{i\in S}w_i$ is a weighted matroid -rank function for the $1$-uniform matroid (the matroid where the independent -sets are the sets of size at most one). - -\subsection{Applications} - -\begin{itemize} - \item \emph{Coverage functions:} they can be written as a positive linear - combination of matroid rank functions: - \begin{displaymath} - f(S) = \sum_{u\in\mathcal{U}} w_u c_u(S) - \end{displaymath} - where $c_u$ is the rank function of the matroid $M = \big\{ \emptyset, - \{u\}\big\}$. - \item \emph{Facility location:} (cite Bilmes) there is a universe - $\mathcal{U}$ of locations and a proximity score $s_{i,j}$ for each - pair of locations. We pick a subset of locations $S$ and each point in - the universe is allocated to its closest location (the one with highest - proximity): - \begin{displaymath} - f(S) = \sum_{u\in\mathcal{U}} \max_{v\in S} s_{u,v} - \end{displaymath} - This can be seen as a sum of weighted matroid rank functions: one for - each location in the universe associated with a $1$-uniform matroid - (other applications: job scheduling). - - \item \emph{Image segmentation:} (cite Jegelka) can be (in some cases) - written as a graph cut function. For image segmentation the goal is to - minimize the cut. - \begin{displaymath} - f(S) = \sum_{e\in E} w_e c_e(S) - \end{displaymath} - where $c_e(S)$ is one iff $e\in E(S,\bar{S})$. \textbf{TODO:} I think - this can be written as a matroid rank function. - \item \emph{Learning} (cite Krause) there is - a hypothesis $A$ (a random variable) which is “refined” when more - observations are made. Imagine that there is a finite set $X_1,\dots, - X_n$ of possible observations (random variables). 
Then, assuming that - the observations are independent conditioned on $A$, the information - gain: - \begin{displaymath} - f(S) = H(A) - H\big(A\,|\,(X_i)_{i\in S}\big) - \end{displaymath} - is submodular. The $\log\det$ is the specific case of a linear - hypothesis observed with additional independent Gaussian noise. - \item \emph{Entropy:} Closely related to the previous one. If $(X_1,\dots, - X_n)$ are random variables, then: - $ f(S) = H(X_S) $ is submodular. In particular, if $(X_1,\dots, - X_n)$ are jointly multivariate gaussian, then: - \begin{displaymath} - f(S) = c|S| + \frac{1}{2}\log\det X_SX_S^T - \end{displaymath} - for $c= 2\pi e...$ and we fall back to the usual $\log\det$ function. - \item \emph{data subset selection/summarization:} in statistical machine - translation, Bilmes used sum of concave over modular: - \begin{displaymath} - f(S) = \sum_{f} \lambda_f \phi\left(\sum_{e\in S}w_f(e)\right) - \end{displaymath} - where each $f$ represents a feature, $w_f(e)$ represents how much of - $f$ element $e$ has, and $\phi$ captures decreasing marginal gain when - we have a lot of a given feature. - Facility location functions are also commonly used for subset selection. - \item \emph{concave spectral functions} One would be tempted to say that - any multivariate concave function of a modular function is submodular. - This would be the natural generalization of concave over modular. - However \textbf{this is not true in general}. However, a possible nice - generalization is the following. Let $M$ be a symmetric $n\times - n$ matrix, and $g$ is a ``matrix concave'' function. Then: - \begin{displaymath} - f(S) = \mathrm{Tr}\big(g(M_S)\big) - \end{displaymath} - is submodular. This contains the $\log\det$ (when $g$ is the matrix - $\log$) but tons of other functions (like quantum entropy). -\end{itemize} -In summary, the two most general classes of submodular functions (which capture -all the examples known to man) are: sums of matrix concave functions and sums -of matroid rank functions. Sums of concave over modular are also nice if we -want to start with a simpler example. - -\section{Passive Optimization} - -Let $\Omega$ be the universe of elements and $f$ a function defined on subsets -of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a -collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let -$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by -$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often -(but not always) equal to $|\Omega|$. - -In general, we say we can efficiently optimize a function $f$ under constraint -$K$ when we have a polynomial-time algorithm making adaptive value queries to -$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$ -with high probability and $\alpha$ an absolute constant. - -Here, we consider the scenario where we cannot make adaptive value queries, and -in fact, where we cannot make queries at all! Instead, we suppose that we -observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken -from a known distribution $D$. We say we can efficiently \emph{passively -optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$ -when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is -an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S) -\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant. 
- -In the case of \emph{passive} observations of set-value pairs under a -distribution $D$ for a function $f$, recent research has focused on whether we -can efficiently and approximately \emph{learn} $f$. This was formalized in the -PMAC model from \cite{balcan2011learning}. When thinking about passive -optimization, it is necessary to understand the link between being able to - $D-PMAC$ learn $f$ and being able to $D-$optimize $f$. - -\subsection{Additive function} - -We consider here the simple case of additive functions. A function $f$ is -additive if there exists a weight vector $(w_s)_{s \in \Omega}$ such that -$\forall S \subseteq \Omega, \ f(S) = \sum_{s \in S} w_s$. We will sometimes -adopt the notation $w \equiv f(\Omega)$. - -\subsubsection{Inverting the system}\label{subsec:invert_the_system} - -Suppose we have observed $n^c$ set-value pairs ${(S_j, f(S_j))}_{j \in N}$ with -$S_j \sim D$ where $n \equiv |\Omega|$. Consider the $n^c \times n$ matrix $A$ -where for all $i$, the row $A_i = \chi_{S_i}$, the indicator vector of set -$S_i$. Let $B$ be the $n^c \times 1$ vector such that $\forall i, b_j \equiv -f(S_j)$. It is easy to see that if $w$ is the $n \times 1$ corresponding weight -vector for $f$ then: -\begin{displaymath} - A w = B -\end{displaymath} - -Note that if $A$ has full column rank, then we can solve for $w$ exactly and -also optimize $f$ under any cardinality constraint. We can therefore cast the -question of $D-$learning and $D-$optimizing $f$ as a random matrix theory -problem: what is the probability that after $n^c$ for $c > 0$ independent -samples from $D$, the matrix $A$ will have full rank? See -section~\ref{sec:matrix_theory} - -\paragraph{Extension} -Note that the previous reasoning can easily be extended to any \emph{almost} -additive function. Consider a function $g$ such that there exists $\alpha > 0$ -and $\beta > 0$ and an additive function $f$ such that $\forall S, \alpha f(S) -\leq g(S) \leq \beta f(S)$, then by solving for $\max_{S\in K} f(S)$ we have a -$\alpha/\beta$-approximation to the optimum of $g$ since: - -\begin{displaymath} -g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq -\frac{\alpha}{\beta} g(OPT) -\end{displaymath} - -where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by -considering a function $g$ such that there exists $\alpha, \beta >0$, an -additive function $f$ and a bijective univariate function $\phi$, such that -$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we -solve the system $A w = \phi^{-1}(B)$ and obtain once again an -$\alpha/\beta$-approximation to the optimum of $g$. - -\begin{remark} -Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We -would like to understand whether being able to $D-$learn $f$ is really -necessary to $D-$optimizing it. In fact, many results for PMAC-learning more -complex functions, such as general submodular functions, are negative. Can we -hope to find positive results in cases where PMAC-learning is impossible? -\end{remark} - -\subsubsection{Average weight method} - -We consider here another method to $D-$optimizing $f$, which only requires -$D-$learning $f$ approximately. 
Note that for every element $i \in \Omega$, and -for a \emph{product} distribution $D$: -\begin{displaymath} - \mathbb{E}_{S \sim D}(f(S)|i \in S) - \mathbb{E}_{S \sim D}(f(S) | i \notin - S) = \sum_{j \neq i \in \Omega} w_s \left(\mathbb{P}(j\in S|i \in S) - - \mathbb{P}(j \in S | i \notin S)\right) + w_i(1 - 0) = w_i -\end{displaymath} - -Let $O$ be the collection of all sets $S$ for which we have observed $f(S)$ and -let $O_i \equiv \{S \in O : i \in S\}$ and $O^c_i \equiv \{ S\in O: i \notin S -\}$. If $O_i$ and $O^c_i$ are non-empty, define the following weight estimator: -\begin{displaymath} - \hat W_i \equiv \frac{1}{|O_i|} \sum_{S \in O_i} f(S) - \frac{1}{|O_i^c|} - \sum_{S \in O_i^c} f(S) -\end{displaymath} - -If $D$ is a product distribution such that $ \exists c > 0, \forall i, -\mathbb{P}(i) \geq c$, it is -easy to show that $\forall i \in -\Omega,\ \hat W_i \rightarrow_{|O| \rightarrow +\infty} w_i$ -By using standard concentration bounds, we can hope to obtain within $poly(n, -\frac{1}{\epsilon})$ observations: - - -%For every node $i$, we compute the \emph{average weight of every set containing -%element $i$}. Let $w_i$ be -%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and -%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim -%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$ - -%Note that the average weight of every set containing element $i$ preserves the -%ranking of the weights of the elements. For observations ${(S_j, f(S_j))}_{j \in -%N}$ and for $N_i \equiv |\{S : i \in S\}|$, we define the following estimator: - -%\begin{equation} \forall i \in \Omega, w_i^{N_i} \equiv \frac{1}{N_i}\sum_{S | -%i \in S} \frac{f(S) - pw}{1-p} \end{equation} - -%As shown above, $w_i^{N_i} \rightarrow w_i$ as $N_i \rightarrow +\infty$. We -%can obtain a concentration bound of $w_i^{N_i}$ around $w_i$, using Hoeffding's -%lemma: - -%\begin{equation} \mathbb{P}\left(\middle|w_i^{N_i} - w_i \middle| \geq -%\epsilon w_i \right) \leq 2e^{-2(1-p)N_i\frac{\epsilon^2 w_i^2}{w^2}} -%\end{equation} - -%\emph{TODO:multiplicative boudns are very bad for zero weights\dots Need to look -%at additive bounds for these zeros.} - -%For $N_i$ sufficiently large for each element $i$, we have $\forall i \in -%\Omega, (1-\epsilon) w_i \leq w_i^{N_i} \leq (1 + \epsilon) w_i$. Under this -%assumption, if we choose the $k$ elements with largest estimated weight -%$W_i^{N_i}$, we obtain a $\frac{1-\epsilon}{1+\epsilon}$-approximation to OPT, -%where OPT is the value of the maximum weight set of $k$ elements for the -%function $f$. To ensure that $N_i$ is sufficiently large for each element, we -%note that $\mathbb{E}(N_i) = pN$ and use a Chernoff bound coupled with a -%classic union bound: - -%\begin{equation} \mathbb{P}\left(\bigcup_{i \in \Omega} \left[N_i \leq -%\frac{pN}{2}\right]\right) \leq \sum_{i\in \Omega} \mathbb{P}\left(N_i \leq -%\frac{pN}{2}\right) \leq n e^{-\frac{pN}{8}} \end{equation} - -%As such, for $C > 0$ and $N \geq (C+1)\frac{8}{p}\log n$, we have $\forall i -%\in \Omega, N_i \geq \frac{pN}{2}$ with probability at least $1-\frac{1}{n^C}$ - -%\begin{proposition} Assume that $N$ pairs of set-value ${(S_j, f(S_j))}_{j \in - %N}$ are observed, where $S_j \sim D_p$ and $p \equiv \frac{k}{n}$. If $N > - %XXX$, then we can $\epsilon$-learn $f$ and optimize it to a - %$(1+\epsilon)/(1-\epsilon)$ factor for any cardinality constraint with - %probability $XXX$. 
\end{proposition} - -\subsection{General submodular functions} - -\subsection{Parametric submodular functions} - -\textbf{Note:} all known examples of submodular functions are parametric. The -question is not parametric vs non-parametric but whether or not the number of -parameters is polynomial or exponential in $n$ (the size of the ground set). -For all examples I know except martroid rank functions, it seems to be -polynomial. - -\subsubsection{Influence Functions} - -Let $G = (V, E)$ be a weighted directed graph. Without loss of generality, we -can assign a weight $p_{u,v} \in [0,1]$ to every possible edge $(u,v) \in V^2$. -Let $m$ be the number of non-zero edges of $G$. Let $\sigma_G(S, p)$ be the -influence of the set $S \subseteq V$ in $G$ under the IC model with parameters -$p$. - -Recall from \cite{Kempe:03} that: -\begin{equation} - \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] -\end{equation} -where $G_p$ is a random graph where each edge $(u,v)\in E$ appears with -probability $p_{u,v}$ and $r_{G_p}(S\leadsto v)$ is the indicator variable of -\emph{there exists a path from $S$ to $v$ in $G_p$}. - -\begin{claim} -\label{cla:oracle} -If for all $(u,v) \in V^2$, $p_{u,v} \geq p'_{u,v} \geq p_{u,v} -- \frac{1}{\alpha m}$, then: -\begin{displaymath} - \forall S \subseteq V,\, \sigma_{G}(S, p) \geq \sigma_{G}(S,p') \geq (1 - - 1/\alpha) \sigma_{G}(S,p) -\end{displaymath} -\end{claim} - -\begin{proof} - We define two coupled random graphs $G_p$ and $G_p'$ as follows: for each - edge $(u,v)\in E$, draw a uniform random variable $\mathcal{U}_{u,v}\in - [0,1]$. Include $(u,v)$ in $G_p$ (resp. $G_{p'}$) iff - $\mathcal{U}_{u,v}\leq p_{u,v}$ (resp. $\mathcal{U}_{u,v}\leq p'_{u,v}$). - It is clear from the construction that each edge $(u,v)$ will be present in - $G_p$ (resp. $G_p'$) with probability $p_{u,v}$ (resp. $p'_{u,v}$). Hence: - \begin{displaymath} - \sigma_G(S, p) = \sum_{v\in V} \P_{G_p}\big[r_{G_p}(S\leadsto v)\big] - \text{ and } - \sigma_G(S, p') = \sum_{v\in V} \P_{G_p'}\big[r_{G_p'}(S\leadsto v)\big] - \end{displaymath} - - By construction $G_{p'}$ is always a subgraph of $G_p$, i.e - $r_{G_p'}(S\leadsto v)\leq r_{G_p}(S\leadsto v)$. This proves the left-hand - side of the Claim's inequality. - - The probability that an edge is present in $G_p$ but not in $G_p'$ is at - most $\frac{1}{\alpha m}$, so with probability $\left(1-\frac{1}{\alpha - m}\right)^m$, $G_p$ and $G_p'$ have the same set of edges. This implies that: - \begin{displaymath} \P_{G_{p'}}\big[r_{G_p'}(S\leadsto v)\big]\geq - \left(1-\frac{1}{\alpha m}\right)^m\P_{G_{p}}\big[r_{G_p}(S\leadsto v)\big] - \end{displaymath} - which proves the right-hand side of the claim after observing that - $\left(1-\frac{1}{\alpha m}\right)^m\geq 1-\frac{1}{\alpha}$ with equality - iff $m=1$. -\end{proof} - - We can use Claim~\ref{cla:oracle} to find a constant factor approximation - algorithm to maximising influence on $G$ by maximising influence on $G'$. For - $k \in \mathbb{N}^*$, let $O_k \in \arg\max_{|S| \leq k} \sigma_G(S)$ and - $\sigma_{G}^* = \sigma_G(O_k)$ where we omit the $k$ when it is unambiguous. - - -\begin{proposition} -\label{prop:approx_optim} -Suppose we have an unknown graph $G$ and a known graph $G'$ such that $V = V'$ -and for all $(u,v) \in V^2, |p_{u,v} - p_{u,v}'| \leq \frac{1}{\alpha m}$. 
-Then for all $k \in \mathbb{N}^*$, we can find a set $\hat O_k$ such that -$\sigma_{G}(\hat O_k) \geq (1 - e^{\frac{2}{\alpha} - 1}) \sigma^*_{G}$ -\end{proposition} - -\begin{proof} For every edge $(u,v) \in V^2$, let $\hat p = p'_{u,v} - - \frac{1}{\alpha m}$. We are now in the conditions of Claim~\ref{cla:oracle} - with $\alpha \leftarrow \alpha/2$. We return the set $\hat O_k$ obtained by - greedy maximisation on $\hat G$. It is a classic exercise to show then that - $\sigma_G(\hat O_k) \geq 1 - e^{\frac{2}{\alpha} - 1}$ (see Pset 1, CS284r). -\end{proof} - -A small note on the approximation factor: it is only $>0$ for $\alpha > 2$. -Note that $\alpha \geq 7 \implies 1 - e^{\frac{2}{\alpha} - 1} \geq -\frac{1}{2}$ and that it converges to $(1 - 1/e)$ which is the best possible -polynomial-time approximation ratio to influence maximisation unless $P = NP$. -Also note that in the limit of large $m$, $(1 -\frac{1}{\alpha m})^m -\rightarrow \exp(-1/\alpha)$ and the approximation ratio goes to $1 - -\exp(-\exp(-2/\alpha))$. - -\subsubsection{Active set selection of stationary Gaussian Processes} - -\section{Passive Optimization vs. Passive Learning} - -\subsection{Failed Attempt: returning max of observations} - -This doesn't work. Give examples as to why! Remember that there are strong -concentration results for submodular functions -> look at expected value of -observed sets - -\subsection{Example where optimization possible, learning impossible} - -Recall the matroid construction from~\cite{balcan2011learning}: -\begin{theorem} - For any $k$ with $k = 2^{o(n^{1/3})}$, there exists a family of sets ${\cal A} - \subseteq2^{[n]}$ and a family of matroids $\cal{M} = \{M_{\cal{B}} : - \cal{B} \subseteq\cal{A} \}$ such that: - \begin{itemize} - \item $|{\cal A}| = k$ and $|A| = n^{1/3}$ for every $A \in \cal{A}$ - \item For every $\cal{B} \subseteq\cal{A}$ and every $A \in \cal{A}$, we - have: - \begin{align*} - \text{rank}_{M_{\cal{B}}}(A) & = 8 \log k \ if A\in{\cal B} \\ - & = |A| \ if A\in {\cal A}\backslash{\cal B} - \end{align*} - \end{itemize} -\end{theorem} - -Consider the following subset of the above family of matroids: ${\cal -M}^{\epsilon} = \{M_{\cal B} : {\cal B} \subseteq{\cal A} \wedge |{\cal -A}\backslash{\cal B}| \geq \epsilon|{\cal A}|\}$ for $k = 2^{n^{1/6}}$. -Consider an \emph{unknown} function $f$, corresponding to the rank function of -one of the matroids $M_{\cal B}$ from ${\cal M}^{\epsilon}$. Note that as long -as we observe \emph{one} set from ${\cal A} \backslash {\cal B}$, we can -optimize $f$ exactly! Indeed, the largest possible value for $f$ under -cardinality constraint $n^{1/3}$ is $\max_{A\in 2^{[n]}} |A| = n^{1/3}$. - -One example of a distribution under which this occurs with probability at least -a constant is $D_u$, the uniform distribution over all sets of ${\cal A}$. For -$c>0$, after $n^c$ observations $O \equiv (S_i, f(S_i))_i$ for $S_i \sim D_u$, -we will observe at least one element from $\cal{A}\backslash\cal{B}$ with -constant probability: - -\begin{equation} \nonumber \mathbb{P}(\bigcup_{S_i} S_i \in {\cal -A}\backslash{\cal B}) \geq 1 - (1 - \epsilon)^{n^c} \approx \epsilon n^c -\end{equation} - -However, it follows from the analysis of~\cite{balcan2011learning} that we -cannot learn $f$ under any distribution, even with active value queries! 
-Indeed, for any polynomially-sized set of observations, there exists a -super-polynomially number of functions in ${\cal M^1}$ which coincide on this -set of observations, but which take very different values outside of this set -of observations: $8n^{1/6}$ for $A \in {\cal B}$ and $n^{1/3}$ for $A \in {\cal -A}\backslash {\cal B}$. - -{\bf TODO:} A cleaner simpler example would be nice. - -\section{Meeting notes: 04.03.2015} - -Consider the following function: -\begin{displaymath} - g(S) = \max\left\{\sqrt{n}|X\cap S|, |S|\right\} -\end{displaymath} -where $|X|=\sqrt{n}$. Assume that you are given polynomially many samples where -each element is included with probability $1/2$. Then with high probability all -the samples will have size roughly $n/2$, so you will observe $g(S)=|S|$ with -high probability. - -\paragraph{Claim 1:} $g$ is PAC-learnable because if you output $|S|$ then you -will be correct with high probability is $S$ is drawn from the same -distribution as above. - -\paragraph{Claim 2:} $g$ is not optimizable under budget $\sqrt{n}$ because you -never learn anything about $X$. - -\paragraph{Open Question:} Cook a similar example where $g$ is submodular and -where you are observing sets of the same size as your budget. - -\paragraph{Positive results:} Try to obtain guarantees about learning -parameters of parametric submodular functions and show whether or not these -guarantees are sufficient for optimization. First look at learning weights in -a cover function. Maybe facility location? Sums of concave over modular are -probably too hard because of the connection to neural networks. - -\section{Which learning guarantees imply optimization?} - -Here, we consider the following question: which learning guarantees imply -optimization? For example, \cite{TODO} provides the following guarantee for -cover functions: - -\begin{theorem} -There exists an algorithm such that for any $\epsilon>0$ given random and -uniform examples of a coverage function $c$ outputs a hypothesis, which is also -a coverage function $h$, such that with probability $2/3$: $\mathbb{E}_{\cal -U}[|h(x) - c(x)|] \leq \epsilon$. The algorithm runs in time $\tilde {\cal O}(n) -\cdot \text{poly}(s/\epsilon)$ and uses $\log(n)\cdot \text{poly}(s/epsilon)$ -and examples, where $s = \min\{size(c), (1/\epsilon)^{\log(1/\epsilon)}\}$. -\end{theorem} - -We would like to understand to what extent this $\ell1$-bound allows us to -optimize the coverage function $c$ under cardinality constraints using the -hypothesis $h$. Let us consider the simpler case of an additive function, and -suppose we have a similar guarantee: $\mathbb{E}_{x \sim {\cal U}}[|h(x) - -c(x)|]$ where $\forall x, c(x) \equiv \sum_i w_i x_i$ and $h(x) \equiv \sum_i -\hat w_i x_i$. Can we find a bound on $\|w - \hat w\|_{\infty}$? - -As it turns out, yes. Let $\forall i, w_i - \hat w_i \equiv \alpha_i$ and let -$V(x) \equiv |h(x) - c(x)| = |\sum_{i} \alpha_i x_i |$. Consider the collection -of good sets $\cal G \equiv \{S : v(S) < 4\epsilon\}$. We claim that $|{\cal G}| -\geq \frac{3}{4}c\dot2^n$. Suppose the contrary, there is at least a quarter of -the sets which have value $v(S) > 4\epsilon$ such that $\mathbb{E}_{x \sim {\cal -U}}[|v(x)|] \geq \frac{1}{2^n}\sum_{S \in {\cal G}^c} |v(S)| > -\frac{1}{4}\cdot4\epsilon = \epsilon$ which is a contradiction. Consider element -$i$ such that $|\alpha_i| \equiv \|\alpha\|_{\infty}$. Consider the collection -of sets which contain $i$: ${\cal S}_i \equiv \{S : i \in S\}$. 
Notice that -$|{\cal S}_i| = |{\cal S}_i^c| = 2^{n-1}$. Therefore, $|{\cal G} \cap {\cal -S}_i^c| \geq \frac{1}{4}\cdot 2^n$. For all sets $S$ in ${\cal G} \cap {\cal -S}_i^c$, $v(S\cup\{j\}) \geq \alpha_i - 4\epsilon$. It follows that -$\mathbb{E}_{x \sim {\cal U}}[|v(x)|] \geq \frac{1}{4}(\alpha_j - 4\epsilon)$ -and therefore we have $\|w - \hat w\|_{\infty} \leq 8 \epsilon$. - -\bibliography{optimize} \bibliographystyle{apalike} - - -\end{document} -- cgit v1.2.3-70-g09d2 From 90211ba1847251c552361a7d0b2eef1c540ef72a Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Tue, 14 Apr 2015 15:27:24 -0400 Subject: Nips format and minor tweaks. We are already close to the page limit… MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paper/main.tex | 10 +- paper/nips15submit_e.sty | 236 ++++++++++++++++++++++++++++++++++++++++++++ paper/sections/negative.tex | 2 +- 3 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 paper/nips15submit_e.sty diff --git a/paper/main.tex b/paper/main.tex index 58b6b22..0b33bdd 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -1,6 +1,11 @@ \documentclass[10pt]{article} -\usepackage{fullpage, amsmath, amssymb, amsthm, bbm} +\usepackage{nips15submit_e} +\usepackage{amsmath, amssymb, amsthm, bbm, microtype} \usepackage[utf8x]{inputenc} +\usepackage[capitalize, noabbrev]{cleveref} +\usepackage[pagebackref=false,breaklinks=true,colorlinks=true]{hyperref} +\usepackage{natbib} +\usepackage{url} \DeclareMathOperator{\E}{\mathbb{E}} \let\P\relax @@ -11,7 +16,6 @@ \newcommand{\ints}{\mathbb{N}} \renewcommand{\O}{\mathcal{O}} - \newtheorem{proposition}{Proposition} \newtheorem{corollary}{Corollary} \newtheorem{problem}{Problem} @@ -35,7 +39,7 @@ \section{Positive results} \input{sections/positive.tex} -\bibliographystyle{apalike} +\bibliographystyle{abbrv} \bibliography{optimize} diff --git a/paper/nips15submit_e.sty b/paper/nips15submit_e.sty new file mode 100644 index 0000000..75dc8a8 --- /dev/null +++ b/paper/nips15submit_e.sty @@ -0,0 +1,236 @@ +%%%% NIPS Macros (LaTex) +%%%% Style File +%%%% Dec 12, 1990 Rev Aug 14, 1991; Sept, 1995; April, 1997; April, 1999 + +% This file can be used with Latex2e whether running in main mode, or +% 2.09 compatibility mode. +% +% If using main mode, you need to include the commands +% \documentclass{article} +% \usepackage{nips10submit_e,times} +% as the first lines in your document. Or, if you do not have Times +% Roman font available, you can just use +% \documentclass{article} +% \usepackage{nips10submit_e} +% instead. +% +% If using 2.09 compatibility mode, you need to include the command +% \documentstyle[nips10submit_09,times]{article} +% as the first line in your document. Or, if you do not have Times +% Roman font available, you can include the command +% \documentstyle[nips10submit_09]{article} +% instead. + +% Change the overall width of the page. If these parameters are +% changed, they will require corresponding changes in the +% maketitle section. 
+% +\usepackage{eso-pic} % used by \AddToShipoutPicture + +\renewcommand{\topfraction}{0.95} % let figure take up nearly whole page +\renewcommand{\textfraction}{0.05} % let figure take up nearly whole page + +% Define nipsfinal, set to true if nipsfinalcopy is defined +\newif\ifnipsfinal +\nipsfinalfalse +\def\nipsfinalcopy{\nipsfinaltrue} +\font\nipstenhv = phvb at 8pt % *** IF THIS FAILS, SEE nips10submit_e.sty *** + +% Specify the dimensions of each page + +\setlength{\paperheight}{11in} +\setlength{\paperwidth}{8.5in} + +\oddsidemargin .5in % Note \oddsidemargin = \evensidemargin +\evensidemargin .5in +\marginparwidth 0.07 true in +%\marginparwidth 0.75 true in +%\topmargin 0 true pt % Nominal distance from top of page to top of +%\topmargin 0.125in +\topmargin -0.625in +\addtolength{\headsep}{0.25in} +\textheight 9.0 true in % Height of text (including footnotes & figures) +\textwidth 5.5 true in % Width of text line. +\widowpenalty=10000 +\clubpenalty=10000 + +% \thispagestyle{empty} \pagestyle{empty} +\flushbottom \sloppy + +% We're never going to need a table of contents, so just flush it to +% save space --- suggested by drstrip@sandia-2 +\def\addcontentsline#1#2#3{} + +% Title stuff, taken from deproc. +\def\maketitle{\par +\begingroup + \def\thefootnote{\fnsymbol{footnote}} + \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author + % name centering +% The footnote-mark was overlapping the footnote-text, +% added the following to fix this problem (MK) + \long\def\@makefntext##1{\parindent 1em\noindent + \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} + \@maketitle \@thanks +\endgroup +\setcounter{footnote}{0} +\let\maketitle\relax \let\@maketitle\relax +\gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax} + +% The toptitlebar has been raised to top-justify the first page + +% Title (includes both anonimized and non-anonimized versions) +\def\@maketitle{\vbox{\hsize\textwidth +\linewidth\hsize \vskip 0.1in \toptitlebar \centering +{\LARGE\bf \@title\par} \bottomtitlebar % \vskip 0.1in % minus +\ifnipsfinal + \def\And{\end{tabular}\hfil\linebreak[0]\hfil + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% + \def\AND{\end{tabular}\hfil\linebreak[4]\hfil + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\ignorespaces}% + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt}\@author\end{tabular}% +\else + \begin{tabular}[t]{c}\bf\rule{\z@}{24pt} +Anonymous Author(s) \\ +Affiliation \\ +Address \\ +\texttt{email} \\ +\end{tabular}% +\fi +\vskip 0.3in minus 0.1in}} + +\renewenvironment{abstract}{\vskip.075in\centerline{\large\bf +Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} + +% sections with less space +\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus + -0.5ex minus -.2ex}{1.5ex plus 0.3ex +minus0.2ex}{\large\bf\raggedright}} + +\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus +-0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}} +\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex +plus -0.5ex minus -.2ex}{0.5ex plus +.2ex}{\normalsize\bf\raggedright}} +\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus +0.5ex minus .2ex}{-1em}{\normalsize\bf}} +\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus + 0.5ex minus .2ex}{-1em}{\normalsize\bf}} +\def\subsubsubsection{\vskip +5pt{\noindent\normalsize\rm\raggedright}} + + +% Footnotes +\footnotesep 6.65pt % +\skip\footins 9pt plus 4pt minus 2pt +\def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt } +\setcounter{footnote}{0} + +% 
Lists and paragraphs +\parindent 0pt +\topsep 4pt plus 1pt minus 2pt +\partopsep 1pt plus 0.5pt minus 0.5pt +\itemsep 2pt plus 1pt minus 0.5pt +\parsep 2pt plus 1pt minus 0.5pt +\parskip .5pc + + +%\leftmargin2em +\leftmargin3pc +\leftmargini\leftmargin \leftmarginii 2em +\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em + +%\labelsep \labelsep 5pt + +\def\@listi{\leftmargin\leftmargini} +\def\@listii{\leftmargin\leftmarginii + \labelwidth\leftmarginii\advance\labelwidth-\labelsep + \topsep 2pt plus 1pt minus 0.5pt + \parsep 1pt plus 0.5pt minus 0.5pt + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii\advance\labelwidth-\labelsep + \topsep 1pt plus 0.5pt minus 0.5pt + \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt + \itemsep \topsep} +\def\@listiv{\leftmargin\leftmarginiv + \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} +\def\@listv{\leftmargin\leftmarginv + \labelwidth\leftmarginv\advance\labelwidth-\labelsep} +\def\@listvi{\leftmargin\leftmarginvi + \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} + +\abovedisplayskip 7pt plus2pt minus5pt% +\belowdisplayskip \abovedisplayskip +\abovedisplayshortskip 0pt plus3pt% +\belowdisplayshortskip 4pt plus3pt minus3pt% + +% Less leading in most fonts (due to the narrow columns) +% The choices were between 1-pt and 1.5-pt leading +%\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} % got rid of @ (MK) +\def\normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} +\def\small{\@setsize\small{10pt}\ixpt\@ixpt} +\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} +\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} +\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} +\def\large{\@setsize\large{14pt}\xiipt\@xiipt} +\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} +\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} +\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} +\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} + +\def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip} + +\def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip +.09in} % +%Reduced second vskip to compensate for adding the strut in \@author + +% Vertical Ruler +% This code is, largely, from the CVPR 2010 conference style file +% ----- define vruler +\makeatletter +\newbox\nipsrulerbox +\newcount\nipsrulercount +\newdimen\nipsruleroffset +\newdimen\cv@lineheight +\newdimen\cv@boxheight +\newbox\cv@tmpbox +\newcount\cv@refno +\newcount\cv@tot +% NUMBER with left flushed zeros \fillzeros[] +\newcount\cv@tmpc@ \newcount\cv@tmpc +\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi +\cv@tmpc=1 % +\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi + \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat +\ifnum#2<0\advance\cv@tmpc1\relax-\fi +\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat +\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% +% \makevruler[][][][][] +\def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip +\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% +\global\setbox\nipsrulerbox=\vbox to \textheight{% +{\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight +\cv@lineheight=#1\global\nipsrulercount=#2% +\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% +\cv@refno1\vskip-\cv@lineheight\vskip1ex% +\loop\setbox\cv@tmpbox=\hbox to0cm{{\nipstenhv\hfil\fillzeros[#4]\nipsrulercount}}% +\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 
+\advance\cv@refno1\global\advance\nipsrulercount#3\relax +\ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% +\makeatother +% ----- end of vruler + +% \makevruler[][][][][] +\def\nipsruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\nipsrulerbox}} +\AddToShipoutPicture{% +\ifnipsfinal\else +\nipsruleroffset=\textheight +\advance\nipsruleroffset by -3.7pt + \color[rgb]{.7,.7,.7} + \AtTextUpperLeft{% + \put(\LenToUnit{-35pt},\LenToUnit{-\nipsruleroffset}){%left ruler + \nipsruler{\nipsrulercount}} + } +\fi +} diff --git a/paper/sections/negative.tex b/paper/sections/negative.tex index f88117a..2eaeb47 100644 --- a/paper/sections/negative.tex +++ b/paper/sections/negative.tex @@ -25,7 +25,7 @@ guarantees are sufficient for optimization. First look at learning weights in a cover function. Maybe facility location? Sums of concave over modular are probably too hard because of the connection to neural networks. -\subsection{Optimizable $n\Rightarrow$ Learnable} +\subsection{Optimizable $\nRightarrow$ Learnable} Recall the matroid construction from~\cite{balcan2011learning}: \begin{theorem} -- cgit v1.2.3-70-g09d2 From ead8ef85b50848f078a360b040f59ea962821cb4 Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Tue, 14 Apr 2015 16:46:34 -0400 Subject: Sketch of the introduction --- paper/sections/introduction.tex | 81 +++++++++++++++++++++++++++-------------- paper/sections/negative.tex | 30 +++++++++++++++ 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/paper/sections/introduction.tex b/paper/sections/introduction.tex index bcb406f..6eda59d 100644 --- a/paper/sections/introduction.tex +++ b/paper/sections/introduction.tex @@ -1,27 +1,54 @@ -Let $\Omega$ be the universe of elements and $f$ a function defined on subsets -of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a -collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let -$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by -$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often -(but not always) equal to $|\Omega|$. - -In general, we say we can efficiently optimize a function $f$ under constraint -$K$ when we have a polynomial-time algorithm making adaptive value queries to -$f$,which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$ -with high probability and $\alpha$ an absolute constant. - -Here, we consider the scenario where we cannot make adaptive value queries, and -in fact, where we cannot make queries at all! Instead, we suppose that we -observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is taken -from a known distribution $D$. We say we can efficiently \emph{passively -optimize} $f$ under distribution $D$ or $D-$optimize $f$ under constraints $K$ -when, after observing ${\cal O}(L^c)$ set-value pairs from $D$ where $c > 0$ is -an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S) -\geq \alpha f(S^*)$ with high probability and $\alpha$ an absolute constant. - -In the case of \emph{passive} observations of set-value pairs under a -distribution $D$ for a function $f$, recent research has focused on whether we -can efficiently and approximately \emph{learn} $f$. This was formalized in the -PMAC model from \cite{balcan2011learning}. When thinking about passive -optimization, it is necessary to understand the link between being able to - $D-PMAC$ learn $f$ and being able to $D-$optimize $f$. 
+The natural “diminishing return” property of submodular functions makes them
+suitable to model many real-life scenarios. However, even when they are
+suitable as a model, it is not realistic to assume that they are entirely
+known. For example, most submodular functions depend on unknown parameters that
+have to be learned from observations.
+
+A recent line of work has focused on learning submodular functions. The
+PMAC learning framework formalizes what it means to learn a submodular
+function in an “overall” sense. For specific examples of submodular functions,
+results are known on how well the parameters of the function can be learned
+from observations.
+
+When the experimenter wishes to optimize the unknown submodular function, the
+well-known approximation algorithms cannot be readily applied because they rely
+on a value query oracle, which is much stronger than what the present scenario
+suggests (at best, random independent samples). The results on learning
+submodular functions also seem to be insufficient in that they are myopic to
+the ultimate use which will be made of the submodular function (optimizing).
+The experimenter only cares about learning the function to the
+extent that what he knows is sufficient to optimize it. As such, the main
+question underlying the present work is the following:
+
+\begin{center}
+    \emph{What is a good model of learning for submodular functions when the
+    ultimate goal is to be able to optimize them?}
+\end{center}
+
+We make the following contributions:
+\begin{itemize}
+    \item We show that the PMAC learning framework does not provide a satisfying
+        framework for optimizing under uncertainty. In particular, we show that
+        there are functions which are PMAC-learnable but not optimizable and
+        vice-versa.
+    \item For specific classes of submodular functions, we show how to optimize
+        them under uncertainty.
+\end{itemize}
+
+\subsection{Related Work}
+
+\paragraph{Maximizing submodular functions} There is a vast literature on
+maximizing submodular functions under many kinds of constraints. All of these
+algorithms rely on some kind of oracle (e.g.\ value queries or demand queries)
+which is stronger than what we have: access to polynomially many observations
+coming from a distribution. In particular, an algorithm cannot make arbitrary
+(and certainly not adaptive) queries to the oracle.
+
+\paragraph{PMAC-learning} An interesting line of work, but one which is
+oblivious to the ultimate goal.
+
+\paragraph{Bandits} This line of work is interesting in that it ties together
+learning and optimizing. However, it assumes that arbitrary queries can be made
+to an oracle, allowing the experimenter to explore more efficiently (even
+though the standard regret benchmark means that he will incur some cost for
+exploring vs.\ exploiting).
diff --git a/paper/sections/negative.tex b/paper/sections/negative.tex
index 2eaeb47..555aa1d 100644
--- a/paper/sections/negative.tex
+++ b/paper/sections/negative.tex
@@ -1,3 +1,33 @@
+\subsection{Model}
+
+Let $\Omega$ be the universe of elements and $f$ a function defined on subsets
+of $\Omega$: $f : S \in 2^{[\Omega]} \mapsto f(S) \in \mathbb{R}$. Let $K$ be a
+collection of sets of $2^{[\Omega]}$, which we call \emph{constraints}. Let
+$S^*_K$ be any solution to $\max_{S \in K} f(S)$, which we will also denote by
+$S^*$ when there is no ambiguity. Let $L$ be the problem size, which is often
+(but not always) equal to $|\Omega|$.
+
+In general, we say we can efficiently optimize a function $f$ under constraint
+$K$ when we have a polynomial-time algorithm making adaptive value queries to
+$f$, which returns a set $S$ such that $S \in K$ and $f(S) \geq \alpha f(S^*)$
+with high probability, where $\alpha$ is an absolute constant.
+
+Here, we consider the scenario where we cannot make adaptive value queries, and
+in fact, where we cannot make queries at all! Instead, we suppose that we
+observe a polynomial number of set-value pairs $(S, f(S))$ where $S$ is drawn
+from a known distribution $D$. We say we can efficiently \emph{passively
+optimize} $f$ under distribution $D$, or $D-$optimize $f$ under constraints $K$,
+when, after observing ${\cal O}(L^c)$ set-value pairs from $D$, where $c > 0$ is
+an absolute constant, we can return a set $S$ such that $S \in K$ and $f(S)
+\geq \alpha f(S^*)$ with high probability, for an absolute constant $\alpha$.
+
+In the case of \emph{passive} observations of set-value pairs under a
+distribution $D$ for a function $f$, recent research has focused on whether we
+can efficiently and approximately \emph{learn} $f$. This was formalized in the
+PMAC model of \cite{balcan2011learning}. When thinking about passive
+optimization, it is necessary to understand the link between being able to
+$D-$PMAC learn $f$ and being able to $D-$optimize $f$.
+
 \subsection{Learnable $\nRightarrow$ Optimizable}
 
 Consider the following function:
-- 
cgit v1.2.3-70-g09d2