From ed0a93f509073e9c4159a21a07a24d022d931099 Mon Sep 17 00:00:00 2001
From: Thibaut Horel
Date: Fri, 6 Nov 2015 17:42:01 -0500
Subject: Pass over identifiability section

---
 finale/mid_report.tex | 93 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 30 deletions(-)

diff --git a/finale/mid_report.tex b/finale/mid_report.tex
index 75f5f4a..b7d9608 100644
--- a/finale/mid_report.tex
+++ b/finale/mid_report.tex
@@ -195,15 +195,19 @@ specific instances of the GLC model.
 
 \section{Identifiability}
 
-A given source distribution $p_s$ and \cref{eq:markov} completely specifies the
-distribution $p$ of a cascade $\mathbf{x} = (x_t)_{t\geq 0}$:
-\begin{displaymath}
+It follows from Section 2 that a source distribution $p_s$ and
+\cref{eq:markov} together completely specify the distribution $p$ of a cascade
+$\mathbf{x} = (x_t)_{t\geq 0}$:
+\begin{equation}
+  \label{eq:dist}
   p(\bx;\Theta) = p_s(x^0)\prod_{t\geq 1}\prod_{i\in S_t}
   f(\theta_i\cdot x^{t-1})^{x_i^t}\big(1-f(\theta_i\cdot x^{t-1})\big)^{1-x_i^t}
-\end{displaymath}
+\end{equation}
 
-We see this model as parametrized by $\Theta$. $f$ and $p_s$ are considered
-fixed.
+In what follows, we consider $p_s$ and $f$ to be fixed. Hence the only
+parameter of the model is the matrix $\Theta$, which we wish to learn. We note
+that the problem of learning $\Theta$ might be ill-posed in the case of
+unidentifiability. We recall the definition of identifiability below.
 
 \begin{definition}
   We say that the model $\mathcal{P}
@@ -215,9 +219,10 @@ fixed.
   \end{displaymath}
 \end{definition}
 
-In our case, identifiability is not guaranteed. In fact there are two factors
-which can lead to unidentifiability: lack of information or ambiguity. We
-illustrate both factors in the subsequent two examples.
+For the GLC model, identifiability is not guaranteed.
To develop an intuition
+about why this might be the case, we present below examples of
+unidentifiability arising from two obstructions: lack of information and
+parent ambiguity.
 
 \begin{figure}
 \begin{center}
@@ -234,11 +239,12 @@ adversarially.}
 \begin{example}[Lack of information]
   Let us consider the graph in \cref{fig:ident} and the source distribution
   $p_s$ defined by $p_s\big((0, 1, 0)\big) = 1$. In other words, node $2$ is
-  always the only infected node at $t=0$. Then it is clear that all
+  always the only infected node at $t=0$. Then, it is clear that all
   cascades die at $t=1$, at which point node $3$ becomes infected with
   probability $f(\theta_{2,3})$. This probability does not depend on
   $\theta_{1,3}$.
-  Hence all matrices $\Theta$ compatible with the graph and for which
-  $\theta_{2,3}$ is fixed yield the same cascade distribution.
+  Hence all matrices $\Theta$ compatible with the graph (defining the same
+  edge set) and for which $\theta_{2,3}$ is constant yield the same cascade
+  distribution according to \cref{eq:dist}.
 \end{example}
 
 \vspace{.5em}
@@ -256,17 +262,20 @@ for some $c\in \R_+$ define the same cascade probability distribution.
 
 \vspace{.5em}
 
-The examples show that for a model to be identifiable, we need conditions on
-the source distribution, in particular in relation to how well it plays with
-the reachability structure of the graph.
+These examples show that the question of identifiability is closely related to
+the reachability of nodes from the source. The following proposition shows
+that, under the right assumptions on the source distribution, we can guarantee
+identifiability of the model.
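To make \cref{eq:dist} concrete, here is a small simulation sketch of the cascade process. It assumes a specific link function $f(z) = 1 - e^{-z}$ (so that $f(0) = 0$ and nonnegative weights give valid probabilities) and reads $x^{t-1}$ as the set of nodes newly infected at step $t-1$, consistent with cascades dying once no new node is infected; both are illustrative assumptions, since the report keeps $f$ and these conventions abstract.

```python
import numpy as np

def f(z):
    # illustrative link function with f(0) = 0; the report keeps f abstract
    return 1.0 - np.exp(-np.maximum(z, 0.0))

def sample_cascade(theta, x0, rng, max_steps=50):
    """Sample one cascade: at step t, each still-susceptible node i becomes
    infected independently with probability f(theta[:, i] . x^{t-1}), where
    x^{t-1} marks the nodes newly infected at the previous step. The cascade
    dies as soon as no new node is infected."""
    infected = x0.astype(bool)
    active = x0.astype(bool)  # newly infected at the previous step
    history = [infected.copy()]
    for _ in range(max_steps):
        p_inf = f(theta.T @ active)  # (theta.T @ active)[i] = theta_i . x^{t-1}
        new = ~infected & (rng.random(len(x0)) < p_inf)
        if not new.any():
            break
        infected |= new
        active = new
        history.append(infected.copy())
    return history
```

With the three-node graph of the lack-of-information example (edges $1\to 3$ and $2\to 3$, node $2$ always the only source), sampled cascades are unaffected by the value of $\theta_{1,3}$, which is exactly the unidentifiability described in the example.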
-We will abuse the notation $p_s$ and for $u\in V$ write:
+We will slightly abuse the notation $p_s$ and for $u\in V$ write:
 \begin{displaymath}
   p_s(w) \eqdef \sum_{\bx: \bx_w = 1} p_s(\bx)
 \end{displaymath}
-Furthermore we write $u\leadsto v$ if there is a directed path from $u$ to $v$
-in $G$ and $u\leadsto v\leadsto w$ if there is a directed path from $u$ to $w$
-passing through $v$ in $G$.
+and similarly for a set of vertices $S$, $p_s(S)$ is defined by replacing the
+condition $\bx_w = 1$ by $\bx_S = 1$ in the summation above. Furthermore, we
+write $u\leadsto v$ if there is a directed path from $u$ to $v$ in $G$ and
+$u\leadsto v\leadsto w$ if there is a directed path from $u$ to $w$ passing
+through $v$ in $G$.
 
 \begin{proposition}
 Under the assumptions:
@@ -276,22 +285,46 @@ Under the assumptions:
   \item $\forall (u,v)\in V^2$, there exists $w$ such that $p_s(w)\neq 0$ and
     $w\leadsto u\leadsto v$.
 \end{enumerate}
-then the model is identifiable.
+the GLC model is identifiable.
 \end{proposition}
 
-2. prevents parent ambiguity at the source. 1. prevents parent ambiguity from
- occurring later on in the cascade even when there is no ambiguity at the
- source (could be relaxed to ``absence of forks''). 3. prevents lack of
- information.
+\begin{proof}[Proof sketch] We give only a quick proof sketch here and defer
+  the details to the final version of this report. Let us consider $\Theta
+  \neq \Theta'$. The goal is to prove that $p(\cdot\,;\,\Theta)\neq
+  p(\cdot\,;\,\Theta')$. Since $\Theta \neq \Theta'$, there exists a pair
+  $(u,v)$ such that $\Theta_{u,v} \neq \Theta'_{u,v}$. By assumption 3, we
+  are guaranteed that with non-zero probability there will exist a time-step
+  $t$ such that $x^t_u = 1$. We then distinguish two cases:
+  \begin{itemize}
+  \item $t=0$: in this case assumption 2 guarantees that with non-zero
+      probability $u$ will be the only infected node at $t=0$.
+  \item $t\geq 1$: in this case assumption 1
guarantees that with
+      non-zero probability $u$ will be the only infected node at
+      time-step $t$ (because the transitions in \cref{eq:markov} occur
+      independently).
+  \end{itemize}
+  In both cases, it then follows from the form of \cref{eq:dist}, after
+  conditioning on $x^{t'}$ for $t' < t$, that the probability that $v$ is
+  infected at $t+1$ differs under $\Theta$ and under $\Theta'$.
+\end{proof}
 
-\begin{proof}
-\end{proof}
+\begin{remark}
+Intuitively, the assumptions of the proposition can be interpreted as follows:
+assumption 2 prevents parent ambiguity at the source; assumption 1 prevents
+parent ambiguity from occurring at later time-steps in the cascade even when
+there is no ambiguity at the source; finally, assumption 3 prevents lack of
+information.
+
+When the graph $G$ is strongly connected, simple source distributions
+satisfying the assumptions of the proposition include a single source chosen
+uniformly at random among the vertices of the graph, or a source in which each
+node of $G$ is included independently with some probability $p>0$.
 
-\begin{remark} The conditions are the weakest under which there is
- identifiability (we can show that they are necessary and sufficient).
- Reasonable source distributions which imply the conditions are uniform
- random, independent Bernouilli.
+Finally, assumption 1 can be relaxed to a weaker assumption under which
+Proposition 2 becomes an exact characterization of the identifiability of the
+model. Again, we defer the details to the final version of this report.
 \end{remark}
 
 \section{Inference}
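Returning to the assumptions of the proposition above, assumption 3 is a pure reachability condition and can be checked mechanically. The following sketch (hypothetical helper names; it adopts the convention that every node trivially reaches itself, which the report does not pin down) tests the condition given an adjacency list for $G$ and the support of $p_s$.

```python
from collections import deque

def reachable_from(adj, s):
    # BFS over the directed graph; by convention s reaches itself
    seen = {s}
    queue = deque([s])
    while queue:
        u = queue.popleft()
        for v in adj.get(u, []):
            if v not in seen:
                seen.add(v)
                queue.append(v)
    return seen

def satisfies_assumption_3(adj, nodes, source_support):
    """Check: for every pair (u, v), there is some w with p_s(w) > 0
    such that w ~> u and u ~> v (i.e. w ~> u ~> v)."""
    reach = {w: reachable_from(adj, w)
             for w in set(source_support) | set(nodes)}
    return all(
        any(u in reach[w] and v in reach[u] for w in source_support)
        for u in nodes
        for v in nodes
    )
```

On a strongly connected graph with any nonempty source support the check passes, while on the two-parent graph from the lack-of-information example, with node $2$ as the only possible source, it fails, matching the discussion above.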