-rw-r--r--	results.tex	48
1 files changed, 45 insertions, 3 deletions
diff --git a/results.tex b/results.tex
index f4cea1a..1848d35 100644
--- a/results.tex
+++ b/results.tex
@@ -186,7 +186,9 @@ g(S^*) \geq \alpha f(S^*) \geq \alpha f(OPT) \geq
where $OPT \in \arg \max_{S \in K} g(S)$. This can be taken one step further by
considering a function $g$ such that there exists $\alpha, \beta >0$, an
additive function $f$ and a bijective univariate function $\phi$, such that
-$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case, we solve the system $A w = \phi^{-1}(B)$ and obtain once again an $\alpha/\beta$-approximation to the optimum of $g$.
+$\forall S, \alpha \phi(f(S)) \leq g(S) \leq \beta \phi(f(S))$. In this case,
+we solve the system $A w = \phi^{-1}(B)$ and obtain once again an
+$\alpha/\beta$-approximation to the optimum of $g$.
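+
+For instance (a purely illustrative choice of $\phi$), suppose $\alpha
+\sqrt{f(S)} \leq g(S) \leq \beta \sqrt{f(S)}$ for some additive $f$, i.e.\
+$\phi(x) = \sqrt{x}$. Squaring the observed values gives $\phi^{-1}(B) = B^2$;
+solving $A w = B^2$ recovers the weights of an additive surrogate $f$, and
+since $\phi$ is increasing, the maximizer $S^*$ of $f$ satisfies
+$g(S^*) \geq \alpha \sqrt{f(S^*)} \geq \alpha \sqrt{f(OPT)} \geq
+\frac{\alpha}{\beta} g(OPT)$.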
\begin{remark}
Note that here $D-$optimizing $f$ is easy because $D-$learning $f$ is easy. We
@@ -219,11 +221,51 @@ If $D$ is a product distribution such that $ \exists c > 0, \forall i,
\mathbb{P}(i) \geq c$, it is
easy to show that $\forall i \in
\Omega,\ \hat W_i \rightarrow_{|O| \rightarrow +\infty} w_i$.
-By using standard concentration bounds, we can hope to obtain within $poly(n, \frac{1}{\epsilon})$ observations:
+By standard concentration bounds, we can hope that within $poly(n,
+\frac{1}{\epsilon})$ observations the following hold with high probability:
+\begin{align}
+ &\forall i \in \Omega : w_i \neq 0 \implies (1-\epsilon) w_i \leq \hat W_i
+ \leq (1 + \epsilon) w_i \label{eq:first_condition}\\
+ & \forall i \in \Omega : w_i = 0 \implies 0 \leq \hat W_i \leq \epsilon
+ \label{eq:second_condition}
+\end{align}
+
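+Such concentration is plausible because, under a product distribution,
+conditioning on membership shifts the mean of an additive $f$ by exactly
+$w_i$; the following identity is offered only as intuition and need not
+coincide with the exact definition of $\hat W_i$ used above:
+\begin{align*}
+\mathbb{E}_{S \sim D}\left[f(S) \mid i \in S\right] - \mathbb{E}_{S \sim
+D}\left[f(S) \mid i \notin S\right] = w_i + \sum_{j \neq i} \mathbb{P}(j) w_j
+- \sum_{j \neq i} \mathbb{P}(j) w_j = w_i.
+\end{align*}
+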
+Note that if Eq.~\ref{eq:first_condition} and Eq.~\ref{eq:second_condition}
+both hold, then we can approximately $D-$optimize $f$ under a cardinality
+constraint. Let $K \in \mathbb{N}$, let $J$ be the set of indices of the $K$
+largest estimated weights $\hat W_i$, let $I$ be the set of indices of the $K$
+largest true weights $w_i$, and let $OPT = \sum_{i \in I} w_i$ denote the
+optimal value. We have:
+
+\begin{align*}
+ \sum_{j \in J \cap \{j : w_j \neq 0\}} w_j & \geq \frac{1}{1+\epsilon}
+ \sum_{j \in J \cap \{j : w_j \neq 0\}} \hat W_j \\
+ & \geq \frac{1}{1+\epsilon}
+ \left( \sum_{j \in J} \hat W_j - |J \cap \{j: w_j = 0\}| \epsilon \right) \\
+ & \geq \frac{1}{1+\epsilon} \left( \sum_{j \in I} \hat W_j - K\epsilon \right) \\
+ & \geq \frac{1-\epsilon}{1+\epsilon} OPT - \frac{\epsilon}{1+\epsilon} K
+\end{align*}
+The first two inequalities use Eq.~\ref{eq:first_condition} and
+Eq.~\ref{eq:second_condition} respectively; the third uses that $J$ maximizes
+$\sum_j \hat W_j$ over index sets of size $K$, together with
+$|J \cap \{j : w_j = 0\}| \leq K$; the last applies
+Eq.~\ref{eq:first_condition} (and the non-negativity in
+Eq.~\ref{eq:second_condition}) to the indices in $I$.
+
+If $\epsilon \leq 1/(1+K/(\delta OPT)) \approx \delta OPT/K$ (the
+approximation holding when $K \gg \delta OPT$), then $\epsilon K \leq
+\delta (1-\epsilon) OPT$, and it follows that choosing the $K$ largest
+estimated weights $\hat W_i$ yields a
+$(1-\delta)(1-\epsilon)/(1+\epsilon)$-approximation to the optimum.
+
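+The following sketch (Python; not part of the paper's pipeline) illustrates
+the procedure end to end on a toy instance: estimate the weights from
+observed pairs $(S, f(S))$, then keep the $K$ indices with the largest
+estimates. The difference-of-conditional-means estimator used here is only an
+illustrative stand-in for $\hat W_i$, and the weights, sample size and $p$
+are arbitrary.
+\begin{verbatim}
+import numpy as np
+
+def estimate_weights(observations, n):
+    # Illustrative estimator: mean of f(S) over sets containing i minus
+    # mean of f(S) over sets not containing i; for an additive f under a
+    # product distribution this difference has expectation w_i.
+    W_hat = np.zeros(n)
+    for i in range(n):
+        with_i = [v for S, v in observations if i in S]
+        without_i = [v for S, v in observations if i not in S]
+        W_hat[i] = np.mean(with_i) - np.mean(without_i)
+    return W_hat
+
+def top_k(W_hat, K):
+    # Indices of the K largest estimated weights (the set J above).
+    return np.argsort(W_hat)[-K:]
+
+rng = np.random.default_rng(0)
+n, K, p = 6, 2, 0.5
+w = np.array([3.0, 0.0, 1.0, 0.0, 2.0, 0.5])        # true additive weights
+observations = []
+for _ in range(2000):
+    S = {i for i in range(n) if rng.random() < p}    # one draw from D_p
+    observations.append((S, float(w[list(S)].sum())))  # observe (S, f(S))
+W_hat = estimate_weights(observations, n)
+print(np.sort(top_k(W_hat, K)))  # expected: [0 4], the two largest weights
+\end{verbatim}
+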
+\paragraph{Interpretation} Notice that the expression of our estimator is a
+finite-sample approximation of the partial derivative of the multilinear
+extension of $f$ with respect to its $i$-th coordinate. Because $f$ is
+additive, the gradient of its multilinear extension (with respect to the
+coordinates) is constant. Recalling the continuous-greedy procedure, it is
+therefore intuitive that we can $D$-optimize an additive function by
+estimating the gradient of its multilinear extension at the unit vectors.
+
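+For concreteness, writing $F(x) = \mathbb{E}_{S \sim x}\left[f(S)\right]$ for
+the multilinear extension, where $S \sim x$ includes each element $i$
+independently with probability $x_i$, additivity of $f$ gives
+$$F(x) = \sum_{i \in \Omega} x_i w_i \quad \text{and} \quad
+\frac{\partial F}{\partial x_i}(x) = w_i \quad \text{for all }
+x \in [0,1]^{\Omega},$$
+so estimating the gradient anywhere, in particular at the unit vectors,
+recovers the weights $w_i$.
+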
+\begin{remark}
+Note that we have not managed to do without a learning phase: we are still
+learning $f$ approximately in order to optimize it.
+\end{remark}
%For every node
-%$i$, we compute the \emph{average weight of every set containing element $i$}. Let $w_i$ be
+%$i$, we compute the \emph{average weight of every set containing element $i$}.
+%Let $w_i$ be
%the weight of element $i$, $w \equiv f(\Omega) = \sum_{i \in \Omega} w_i$ and
%$p \equiv \frac{k}{n}$, then $$\forall i \in \Omega, \mathbb{E}_{S \sim
%D_p}\left[f(S)| i \in S\right] = pw + (1 -p)w_i$$