4 files changed, 121 insertions, 98 deletions
diff --git a/paper/sections/model.tex b/paper/sections/model.tex
index e35aa4a..44236da 100644
--- a/paper/sections/model.tex
+++ b/paper/sections/model.tex
@@ -1,65 +1,55 @@
-We consider a graph ${\cal G}= (V, E, \Theta)$, where $\Theta$ is a $|V|\times
-|V|$ matrix of paremeters describing the edge weights of $\mathcal{G}$. We
-define $m\defeq |V|$. A \emph{cascade model} is a Markov process over the
-finite state space $\{0, 1, \dots, K\}^V$. An \emph{influence cascade} is
-a realisation of the random process described by a cascade model.
+We consider a graph ${\cal G}= (V, E, \Theta)$, where $\Theta$ is a $|V|\times |V|$ matrix of parameters describing the edge weights of $\mathcal{G}$. Let $m\defeq |V|$. For each node $j$, let $\Theta_{j}$ be the $j^{th}$ column vector of $\Theta$. Intuitively, $\Theta_{i,j}$ captures the ``influence'' of node $i$ on node $j$. A \emph{Cascade model} is a Markov process over a finite state space $\{0, 1, \dots, K-1\}^V$ with the following properties:
+\begin{enumerate}
+\item Conditioned on the previous time step, the transition probabilities for each node $i \in V$ are mutually independent.
+\item Of the $K$ possible states, there exists a \emph{contagious state} such that all transition probabilities of the Markov process can be expressed as a function of the graph parameters $\Theta$ and the set of ``contagious nodes'' at the previous time step.
+\item The initial probability over $\{0, 1, \dots, K-1\}^V$ of the Cascade Model is such that all nodes can eventually reach a \emph{contagious state} with non-zero probability. The ``contagious'' nodes at $t=0$ are called \emph{source nodes}.
+\end{enumerate}
 
-In practice, we restrict ourselves to discrete-time homogeneous cascade models.
-At $t=0$, each node in the graph has a constant probability $p_{init}$ of being
-in the ``active state''. The active nodes at $t=0$ are called the source nodes.
-Our probabilistic model for the source nodes is more realistic and less
-restrictive than the ``single source'' assumption made in \cite{Daneshmand:2014}
-and \cite{Abrahao:13}.
+In other words, a cascade model describes a diffusion process, whereby a set of contagious nodes ``influence'' other nodes in the graph to adopt or not their behavior. An \emph{influence cascade} is a realisation of this random process: the successive states of the nodes in graph ${\cal G}$. Note that both the ``single source'' assumption made in \cite{Daneshmand:2014} and \cite{Abrahao:13} as well as the ``uniformly chosen source set'' assumption made in \cite{Netrapalli:2012} verify condition 3.
 
+In the context of Graph Inference, \cite{Netrapalli:2012} focuses specifically on the well-known discrete-time independent cascade model recalled below, which \cite{Abrahao:13} and \cite{Daneshmand:2014} generalize to continuous time. Whilst we restrict ourselves to discrete-time processes, we consider both the independent cascade model and the linear voter model, and show they can be solved by a very similar maximum-likelihood program. The linear threshold model is a special case and is discussed in Section~\ref{sec:linear_threshold}.
 
-\subsection{Generalized Linear Cascade Models}
-
-Denoting by $X^t$ the state of the cascade at time step $t$, we interpret
-$X^t_j = 1$ for node $j\in V$ as ``node $j$ is active'', \emph{i.e}, ``$j$
-exhibits the source nodes' behavior at time step $t$''.  We draw inspiration
-from \emph{generalized linear models} (GLM) to define a generalized linear
-cascade.
+% Denoting by $X^t$ the state of the cascade at time step $t$, we interpret
+% $X^t_j = 1$ for node $j\in V$ as ``node $j$ is contagious'', \emph{i.e}, ``$j$
+% exhibits the source nodes' behavior at time step $t$''.  We draw inspiration
+% from \emph{generalized linear models} (GLM) to define a generalized linear
+% cascade.
 
-\begin{definition}
-    \label{def:glcm} 
-    Let us denote by $\{\mathcal{F}_t, t\in\ints\}$ the natural
-    filtration induced by $\{X_t, t\in\ints\}$. A \emph{generalized linear
-    cascade} is characterized by the following equation:
-    \begin{displaymath}
-        \P[X^{t+1}=x\,|\, \mathcal{F}_t] =
-        \prod_{i=1}^m f(\inprod{\theta_i}{X^{t}})^{x_i}
-        \big(1-f(\inprod{\theta_i}{X^{t}}\big)^{1-x_i}
-    \end{displaymath}
-where $f:\mathbb{R}\to[0,1]$ and $\theta_i$ is the $i$-th column of $\Theta$.
-\end{definition}
+% \begin{definition}
+%     \label{def:glcm} 
+%     Let us denote by $\{\mathcal{F}_t, t\in\ints\}$ the natural
+%     filtration induced by $\{X_t, t\in\ints\}$. A \emph{generalized linear
+%     cascade} is characterized by the following equation:
+%     \begin{displaymath}
+%         \P[X^{t+1}=x\,|\, \mathcal{F}_t] =
+%         \prod_{i=1}^m f(\inprod{\theta_i}{X^{t}})^{x_i}
+%         \big(1-f(\inprod{\theta_i}{X^{t}}\big)^{1-x_i}
+%     \end{displaymath}
+% where $f:\mathbb{R}\to[0,1]$ and $\theta_i$ is the $i$-th column of $\Theta$.
+% \end{definition}
 
-It follows immediately from this definition that a generalized linear cascade
-satisfies the Markov property:
-\begin{displaymath}
-    \P[X^{t+1}=x\,|\mathcal{F}_t] = \P[X^{t+1}=x\,|\, X^t]
-\end{displaymath}
-Note also that $\E[X^{t+1}_i\,|\,X^t] = f(\inprod{\theta_i}{X^t})$. As such,
-$f$ can be interpreted as the inverse link function of our generalized linear
-cascade model.
+% It follows immediately from this definition that a generalized linear cascade
+% satisfies the Markov property:
+% \begin{displaymath}
+%     \P[X^{t+1}=x\,|\mathcal{F}_t] = \P[X^{t+1}=x\,|\, X^t]
+% \end{displaymath}
+% Note also that $\E[X^{t+1}_i\,|\,X^t] = f(\inprod{\theta_i}{X^t})$. As such,
+% $f$ can be interpreted as the inverse link function of our generalized linear
+% cascade model.
 
-\subsection{Examples}
-\label{subsec:examples}
+\subsection{Independent Cascade Model}
 
-In this section, we show that the well-known Independent Cascade Model and the Voter model are Generalized Linear Cascades. The Linear Threshold model will be discussed in Section~\ref{sec:linear_threshold}.
-
-\subsubsection{Independent Cascade Model}
-
-In the independent cascade model, nodes can be either susceptible, active or
-inactive. At $t=0$, all source nodes are ``active'' and all remaining nodes are
+In the independent cascade model, nodes can be either susceptible, contagious or
+immune. At $t=0$, all source nodes are ``contagious'' and all remaining nodes are
 ``susceptible''. At each time step $t$, for each edge $(i,j)$ where $j$ is
-susceptible and $i$ is active, $i$ attempts to infect $j$ with probability
+susceptible and $i$ is contagious, $i$ attempts to infect $j$ with probability
 $p_{i,j}\in[0,1]$, the infection attempts are independent of each other. If $i$
-succeeds, $j$ will become active at time step $t+1$. Regardless of $i$'s
-success, node $i$ will be inactive at time $t+1$. In other words, nodes stay
-active for only one time step. The cascade process terminates when no active
+succeeds, $j$ will become contagious at time step $t+1$. Regardless of $i$'s
+success, node $i$ will be immune at time $t+1$. In other words, nodes stay
+contagious for only one time step. The cascade process terminates when no contagious
 nodes remain.
 
-If we denote by $X^t$ the indicator variable of the set of active nodes at time
+If we denote by $X^t$ the indicator variable of the set of contagious nodes at time
 step $t$, then if $j$ is susceptible at time step $t+1$, we have:
 \begin{displaymath}
     \P\big[X^{t+1}_j = 1\,|\, X^{t}\big]
@@ -72,24 +62,57 @@ Defining $\Theta_{i,j} \defeq \log(1-p_{i,j})$, this can be rewritten as:
     = 1 - \prod_{i = 1}^m e^{\Theta_{i,j}X^t_i}
     = 1 - e^{\inprod{\theta_j}{X^t}}
 \end{equation}
-which is a Generalized Linear Cascade model with inverse link function $f(z)
-= 1 - e^z$.
 
-\subsubsection{The Voter Model}
+\subsection{The Linear Voter Model}
 
-In the Voter Model, nodes can be either red or blue, where ``blue'' is the
-source state. The parameters of the graph are normalized such that $\forall i,
+In the Linear Voter Model, nodes can be either \emph{red} or \emph{blue}. Both states verify the properties of a contagious state, but for ease of presentation, we will suppose that the contagious nodes are the \emph{blue}. The parameters of the graph are normalized such that $\forall i,
 \ \sum_j \Theta_{i,j} = 1$. Each round, every node $j$ independently chooses
-one of its neighbors with probability $\Theta_ij$ and adopts their color. The
+one of its neighbors with probability $\Theta_{ij}$ and adopts their color. The
 cascades stops at a fixed horizon time T or if all nodes are of the same color.
 If we denote by $X^t$ the indicator variable of the set of blue nodes at time
 step $t$, then we have:
 \begin{equation}
-\mathbb{P}\left[X^{t+1}_j = 1 | X^t \right] = \sum_{i=1}^m \Theta_{i,j} X_i^t = \inprod{\theta_j}{X^t}
+\mathbb{P}\left[X^{t+1}_j = 1 | X^t \right] = \sum_{i=1}^m \Theta_{i,j} X_i^t = \inprod{\Theta_j}{X^t}
 \tag{V}
 \end{equation}
-which is a Generalized Linear Cascade model with inverse link function
-$f(z) = z$.
+
+% \subsection{The Linear Threshold Model}
+
+% In the Linear Threshold Model, each node $j\in V$ has a threshold $t_j$ from the interval $[0,1]$ and for each node, the sum of incoming weights is less than $1$: $\forall j\in V$, $\sum_{i=1}^m \Theta_{i,j} \leq 1$. 
+
+% Nodes have two states: susceptible or contagious. At each time step, each susceptible
+% node $j$ becomes contagious if the sum of the incoming weights from contagious parents is greater than $j$'s threshold $t_j$. Nodes remain contagious until the end of the cascade, which is reached when no new susceptible nodes become contagious.
+
+% As such, the source nodes are chosen, the process is entirely deterministic. Let $X^t$ be the indicator variable of the set of contagious nodes at time step $t-1$, then:
+% \begin{equation}
+% \nonumber
+%     X^{t+1}_j = \mathbbm{1}_{\sum_{i=1}^m \Theta_{i,j}X^t_i > t_j}
+%     = \mathbbm{1}_{\inprod{\theta_j}{X^t} > t_j}
+% \end{equation}
+% where we defined again $\theta_j\defeq (\Theta_{1,j},\ldots,\Theta_{m,j})$. In other words, for every step in the linear threshold model, we observe the following signal:
+
+% \begin{equation}
+%     \label{eq:lt}
+%     \tag{LT}
+%     \mathbb{P} \left[X^{t+1}_j = 1 | X^t\right] = \text{sign} \left(\inprod{\theta_j}{X^t} - t_j \right)
+% \end{equation}
+% where ``sign'' is the function $\mathbbm{1}_{\cdot > 0}$.
+
+\subsection{Generalized Linear Cascade Models}
+\label{sec:GLC}
+
+Despite their obvious differences, both models share a common similarity: conditioned on node $j$ being susceptible at time step $t$, the probability that node $j$ becomes infected at time step $t+1$ is a bernoulli of parameter $f(\Theta_j \cdot X^t)$, where $f:\mathbb{R} \rightarrow [0,1]$. In the case of the voter model, $f_{\text{V}}$ is the identify function, and in the case of the independent cascade model $f_{\text{ICC}} : z \rightarrow 1 - e^z$. We draw inspiration from generalized linear models to introduce \emph{Generalized Linear Cascades}:
+
+\begin{definition}
+\label{def:glcm}
+Let us denote by $\{\mathcal{F}_t, t\in\ints\}$ the natural filtration induced by the state of ${\cal G}$ up to time step t. A \emph{generalized linear cascade} is a cascade model characterized by the following equation:
+\begin{displaymath}
+    \P[X^{t+1}=x\,|\, \mathcal{F}_t] =
+    \prod_{i=1}^m f_{X^t_i,x_i}(\Theta_i \cdot X^t)
+\end{displaymath}
+where $f_{s,p}:\mathbb{R}\to[0,1]$ for each state pair $(s,p) \in \{0, \dots, K-1 \}$
+\end{definition}
+
 
 \subsection{Maximum Likelihood Estimation}
 
@@ -124,7 +147,7 @@ $\Theta$ can be estimated by a separate optimization program:
     \hat{\theta}_i \in \argmax_{\theta}\frac{1}{n_i}\mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n)
     + \lambda\|\theta_i\|_1
 \end{equation}
-where we denote by $n_i$ the first step at which node $i$ becomes active and
+where we denote by $n_i$ the first step at which node $i$ becomes contagious and
 where:
 \begin{multline}
     \mathcal{L}_i(\theta_i\,|\,x^1,\ldots,x^n) = \frac{1}{n_i}
diff --git a/src/algorithms.py b/src/algorithms.py
index 926d9bc..9ebccf1 100644
--- a/src/algorithms.py
+++ b/src/algorithms.py
@@ -31,7 +31,7 @@ def greedy_prediction(G, cascades):
     return G_hat
 
 
-def recovery_l1obj_l2constraint(G, cascades, floor_cstt, passed_function,
+def recovery_passed_function(G, cascades, floor_cstt, passed_function,
                                 *args, **kwargs):
     """
     Returns estimated graph from convex program specified by passed_function
@@ -40,11 +40,13 @@ def recovery_l1obj_l2constraint(G, cascades, floor_cstt, passed_function,
     G_hat = cascade_creation.InfluenceGraph(max_proba=None)
     G_hat.add_nodes_from(G.nodes()) 
 
+    f_x, f_xz = passed_function(*args, **kwargs)
+
     for node in G_hat.nodes():
         print(node)
         try:
             M, w = cascade_creation.icc_matrixvector_for_node(cascades, node)
-            p_node, __ = passed_function(M,w, *args, **kwargs)
+            p_node, _ = convex_optimization.diff_and_opt(M, w, f_x, f_xz)
             G_hat = cascade_creation.add_edges_from_proba_vector(G=G_hat,
                         p_node=p_node, node=node, floor_cstt=floor_cstt)
         except timeout.TimeoutError:
@@ -86,15 +88,15 @@ def test():
     unit test
     """
     G = cascade_creation.InfluenceGraph(max_proba = .8)
-    G.erdos_init(n = 10, p = .2)
+    G.erdos_init(n = 100, p = .2)
     import time
     t0 = time.time()
     A = cascade_creation.generate_cascades(G, .2, 1000)
-    if 1:
-        G_hat = greedy_prediction(G, A)
     if 0:
-        G_hat = recovery_l1obj_l2constraint(G, A,
-                passed_function=convex_optimization.l1obj_l2penalization,
+        G_hat = greedy_prediction(G, A)
+    if 1:
+        G_hat = recovery_passed_function(G, A,
+                passed_function=convex_optimization.type_lasso,
                 floor_cstt=.1, lbda=10)
     correctness_measure(G, G_hat, print_values=True)
 
diff --git a/src/convex_optimization.py b/src/convex_optimization.py
index 1f34508..163b6d5 100644
--- a/src/convex_optimization.py
+++ b/src/convex_optimization.py
@@ -44,55 +44,52 @@ def sparse_recovery(M_val, w_val, lbda):
     return diff_and_opt(theta, theta_, M, M_val, w, lbda, y)
 
 
-@timeout.timeout(20)
-def type_lasso(M_val, w_val, lbda):
+def type_lasso(lbda):
     """
     Solves:
         min - sum_j theta_j + lbda*|e^{M*theta} - (1 - w)|_2
         s.t theta_j <= 0
     """
-    assert len(M_val) == len(w_val)
-
-    if M_val.dtype == bool:
-        M_val = M_val.astype('float32')
-
     if type(lbda) == int or type(lbda) == float:
         lbda = np.array(lbda)
 
+    lbda = theano.shared(lbda.astype(theano.config.floatX))
+
     theta = tensor.row().T
     theta_ = theta.flatten()
 
-    M = theano.shared(M_val.astype(theano.config.floatX))
-    w = theano.shared(w_val.astype(theano.config.floatX))
-    lbda = theano.shared(lbda.astype(theano.config.floatX))
+    M = tensor.matrix(name='M', dtype=theano.config.floatX)
+    w = tensor.vector(name='w', dtype=theano.config.floatX)
 
     y = lbda * (theta_).norm(1) + (
         tensor.exp(M.dot(theta_)) - (1 - w)).norm(2)
 
-    return diff_and_opt(theta, theta_, M, M_val, w, lbda, y)
-
-
-def diff_and_opt(theta, theta_, M, M_val, w, lbda, y):
     z = tensor.row().T
     z_ = z.flatten()
 
-    m, n = M_val.shape
-
     y_diff = tensor.grad(y, theta_)
     y_hess = z[0] * theano.gradient.hessian(y, theta_)
-    f_x = theano.function([theta], [y, y_diff], allow_input_downcast=True)
-    f_xz = theano.function([theta, z], [y, y_diff, y_hess],
-                           allow_input_downcast=True)
+    f_x = theano.function([theta, M, w], [y, y_diff],
+                        allow_input_downcast=True)
+    f_xz = theano.function([theta, z, M, w], [y, y_diff, y_hess],
+                        allow_input_downcast=True)
+
+    return f_x, f_xz
+
+@timeout.timeout(8)
+def diff_and_opt(M_val, w_val, f_x, f_xz):
+
+    m, n = M_val.shape
 
     def F(x=None, z=None):
         if x is None:
             return 0, cvxopt.matrix(-.001, (n,1))
         elif z is None:
-            y, y_diff = f_x(x)
+            y, y_diff = f_x(x, M_val, w_val)
             return cvxopt.matrix(float(y), (1, 1)),\
                    cvxopt.matrix(y_diff.astype("float64")).T
         else:
-            y, y_diff, y_hess = f_xz(x, z)
+            y, y_diff, y_hess = f_xz(x, z, M_val, w_val)
             return cvxopt.matrix(float(y), (1, 1)), \
                     cvxopt.matrix(y_diff.astype("float64")).T, \
                     cvxopt.matrix(y_hess.astype("float64"))
@@ -104,7 +101,7 @@ def diff_and_opt(theta, theta_, M, M_val, w, lbda, y):
     #cvxopt.solvers.options['feastol'] = 2e-5
     #cvxopt.solvers.options['abstol'] = 2e-5
     #cvxopt.solvers.options['maxiters'] = 100
-    cvxopt.solvers.options['show_progress'] = False
+    cvxopt.solvers.options['show_progress'] = True
     try:
         theta = cvxopt.solvers.cp(F, G, h)['x']
     except ArithmeticError:
@@ -128,13 +125,14 @@ def test():
     M_val, w_val = cascade_creation.icc_matrixvector_for_node(A, 0)
     
     #Type lasso
-    if 0:
-        p_vec, theta = type_lasso(M_val, w_val, lbda)
+    if 1:
+        f_x, f_xz = type_lasso(lbda)
+        p_vec, _ = diff_and_opt(M_val, w_val, f_x, f_xz)
         print(p_vec)
         print(G.mat)
 
     #Sparse recovery
-    if 1:
+    if 0:
         p_vec, theta = sparse_recovery(M_val, w_val, lbda)
         print(p_vec)
         print(G.mat)
diff --git a/src/make_plots.py b/src/make_plots.py
index 3ac8362..c1cafe7 100644
--- a/src/make_plots.py
+++ b/src/make_plots.py
@@ -23,7 +23,7 @@ def compare_greedy_and_lagrange_cs284r():
     #Lagrange Objective
     G_hat = algorithms.recovery_l1obj_l2constraint(G, A,
             passed_function=convex_optimization.type_lasso,
-            floor_cstt=.1, lbda=10)
+            floor_cstt=.05, lbda=10)
     algorithms.correctness_measure(G, G_hat, print_values=True)
 
 
@@ -32,11 +32,11 @@ def test():
     unit test
     """
     G = cascade_creation.InfluenceGraph(max_proba=.8)
-    G.erdos_init(n=20, p=.2)
+    G.erdos_init(n=30, p=.2)
     A = cascade_creation.generate_cascades(G, p_init=.1, n_cascades=1000)
-    G_hat = algorithms.recovery_l1obj_l2constraint(G, A,
-                passed_function=convex_optimization.sparse_recovery,
-                floor_cstt=.1, lbda=1)
+    G_hat = algorithms.recovery_passed_function(G, A,
+                passed_function=convex_optimization.type_lasso,
+                floor_cstt=.001, lbda=.0001)
     algorithms.correctness_measure(G, G_hat, print_values=True)
 
 if __name__=="__main__":