From 4d4fa73eb0a549ee487e012acf02b97deb0f33a0 Mon Sep 17 00:00:00 2001
From: Thibaut Horel <thibaut.horel@gmail.com>
Date: Tue, 17 Feb 2015 15:18:08 -0500
Subject: Add notes on message passing

---
 message-passing/main.tex | 179 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 message-passing/main.tex

(limited to 'message-passing/main.tex')

diff --git a/message-passing/main.tex b/message-passing/main.tex
new file mode 100644
index 0000000..a86c83f
--- /dev/null
+++ b/message-passing/main.tex
@@ -0,0 +1,179 @@
+\documentclass{article}
+\title{Message Passing on Trees: Analysis of the Sum-Product Algorithm}
+\author{Thibaut Horel}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath, amsfonts, amsthm}
+\newcommand{\set}[1]{\mathbf{#1}}
+\newcommand{\gx}{\mathcal{X}}
+\newcommand{\bx}{\boldsymbol{x}}
+\newcommand{\fa}{\phi}
+\newcommand*{\defeq}{\equiv}
+\newtheorem*{lemma}{Lemma}
+\newtheorem*{theorem}{Theorem}
+\usepackage[pagebackref=true,breaklinks=true,colorlinks=true]{hyperref}
+\begin{document}
+\maketitle
+
+\section{Introduction}
+
+Let $\gx_1,\ldots,\gx_n$ be $n$ finite sets, $n\in\set{N}$. We consider,
+a discrete probability distribution $p$ over
+$\boldsymbol{\gx}\defeq\gx_1\times\ldots\times\gx_n$ which \emph{factorizes}
+according to the undirected tree $T = (V, E)$ with $V = \{1,\ldots. n\}$. By
+definition, this means that there exist functions
+$\fa_i:\gx_i\rightarrow\set{R}^+$, $i\in V$, and functions
+$\fa_{i,j}:\gx_i\times\gx_j\rightarrow\set{R}^+$, $(i,j)\in E$, such that:
+\begin{equation}\label{eq:factor}
+    p(x_1,\ldots,x_n) = \kappa\prod_{i\in V}\fa_i(x_i)
+    \prod_{(i,j)\in E} \fa_{i,j}(x_i, x_j),
+    \quad
+    (x_1,\ldots,x_n)\in\boldsymbol\gx
+\end{equation}
+where $\kappa$ is a normalization constant which ensures that $p$ sums to one
+over $\boldsymbol\gx$.
+
+From now on, vectors will be denoted by bold letters (\emph{e.g.}
+$\bx\in\boldsymbol\gx$), while their coordinates will be denoted by regular
+letters (\emph{e.g.} $(x_1\ldots, x_n)\in\boldsymbol\gx$). If $S$ is a subset
+of $V$, and $\bx$ is a vector indexed by $V$, $\bx_S$ denotes the sub-vector
+obtained by keeping from $\bx$ only its coordinates in $S$ (when the ordering
+of the coordinates does not matter and there is no ambiguity).  Finally, for
+a node $i\in V$, $N(i)$ denotes the set of its neighbors: $N(i)\defeq \{j\in
+V\,|\, (i,j)\in E\}$.
+
+Our goal is to understand the behavior of the \emph{sum-product message passing
+algorithm} when applied to computing marginals of the probability distribution
+$p$. Formally, we want to compute for all $i\in V$, the marginal $p_i$:
+\begin{equation}\label{eq:marginal}
+    p_i(x) \defeq \sum_{\substack{\bx\in\gx\\ x_i = x}}p(\bx),
+    \quad
+    x\in\gx_i
+\end{equation}
+
+We recall that a tree can be \emph{rooted} at any of its nodes. By definition,
+this amounts to designate a node as the root of the tree, hence defining
+a partial ordering on the nodes of the tree. Once a tree is rooted, the notions
+of \emph{subtree}, \emph{descendant}, \emph{ascendant} and \emph{leaves} are
+defined with respect to the corresponding partial ordering.
+
+Let us now choose $i\in V$ and root $T$ on $i$. Let $j\in N(i)$, we denote by
+$T_{j,i} = (V_{j,i}, E_{j,i})$ the subtree rooted at $j$. Note that $i$ and
+$T_{j,i}$ for $j\in N(i)$ form a subdivision of $T$ into disjoint subtrees.
+Using this subdivision, and distributing the sum in \eqref{eq:marginal} over
+the products in \eqref{eq:factor} accordingly:
+\begin{equation}\label{eq:finish}
+    p_i(x) = \kappa \fa_i(x)\prod_{j\in N(i)} M_{j,i}(x)
+\end{equation}
+where:
+\begin{equation}\label{eq:limit}
+    M_{j,i}(x) = \sum_{\bx\in\boldsymbol\gx_{V_{i,j}}}\fa_{i,j}(x,x_j)
+        \prod_{k\in V_{j,i}}\fa_k(x_k)
+        \prod_{(k,l)\in E_{j,i}}\fa_{k,l}(x_k, x_l)
+\end{equation}
+Note that by keeping upfront the summation over $\gx_j$ in \eqref{eq:limit}
+while distributing the remaining summations over the products, we get:
+\begin{equation}\label{eq:recurse}
+    M_{j,i}(x) = \sum_{x_j\in\gx_j}\fa_{i,j}(x, x_j)\fa_j(x_j)\prod_{k\in
+    N(j)\setminus\{i\}}M_{k,j}(x_j)
+\end{equation}
+
+The form obtained in \eqref{eq:recurse} suggests a recursive algorithm to
+compute $p_i$: by starting from the leaves of $T$ rooted at $i$, the quantities
+$M_{l,k}$, $(k,l)\in E$ can be computed recursively, ultimately leading to the
+computation of $p_i$ as given by \eqref{eq:finish}.
+
+\section{The sum-product algorithm}
+
+The recursion above is at the core of the sum-product algorithm. The notable
+difference being that all the marginals are computed at the same time, thus
+reestablishing the symmetry which was lost when rooting $T$ on $i$. As
+a consequence, a node $j$ cannot rely on its neighbors having properly computed
+the quantities $M_{k,j}$, $k\in N(j)\setminus\{i\}$ \emph{prior to} computing
+$M_{j,i}$ and \eqref{eq:recurse} cannot be treated as an exact recursion.
+Instead, the sum-product algorithm is described in terms of time steps: at
+each time step, each node sends messages to its neighbors, the messages
+being computed from the messages received at the previous time step in a way
+paralleling \eqref{eq:recurse}.
+
+Formally, for all $(i,j)$ in $E$, we define $|\gx_i|$ messages that are sent
+from $j$ to $i$ at step $t\in\set{N}^*$:
+\begin{align}
+    \mu^0_{j,i}(x_i)&=\sum_{x_j\in\gx_j}\fa_{i,j}(x_i,x_j)\fa_j(x_j),
+    \quad x_i\in\gx_i\\
+    \label{eq:message-t}
+    \mu_{j,i}^t(x_i)&= \sum_{x_j\in\gx_j}\fa_{i,j}(x_i,x_j)\fa_j(x_j)
+    \prod_{k\in N(j)\setminus\{i\}}\mu_{k,j}^{t-1}(x_j),
+    \quad x_i\in\gx_i
+\end{align}
+
+Let us denote by $h_{j,i}$ the height of the subtree rooted at $j$ in $T$
+rooted at $i$:
+\begin{displaymath}
+    h_{j,i} \defeq
+    \begin{cases}
+        0 & \text{if $j$ is a leaf in $T$ rooted at $i$}\\
+        1 + \max_{k\in N(j)\setminus\{i\}} h_{k,j} & \text{otherwise}
+    \end{cases}
+\end{displaymath}
+Then, the following lemma holds.
+
+\begin{lemma}
+    Let $(i,j)\in E$ be any edge of $T$, then:
+    \begin{displaymath}
+        \forall t\geq h_{j,i},\;\forall x_i\in\gx_i,\quad
+        \mu_{j,i}^t(x_i) = M_{j,i}(x_i)
+    \end{displaymath}
+\end{lemma}
+
+\begin{proof}
+    We will prove this lemma by induction on $h_{j,i}$. The initialization is
+    straightforward: if $h_{j,i} = 0$, then the products in \eqref{eq:message-t}
+    as well as in \eqref{eq:recurse} are empty, and for all $t\geq 0$,
+    $\mu_{j,i}^t(x_i) = \mu_{j,i}^0(x_i) = M_{j,i}(x_i)$.
+
+    For the inductive step, let us consider an edge $(i,j)\in E$, with
+    associated height $h_{j,i}$ and a time $t\geq h_{j,i}$. By definition of
+    $h_{j,i}$, for all $k\in N(j)\setminus\{i\}$, $h_{k,j}<h_{j,i}$.
+    Furthermore, since $t\geq h_{j,i}$, we have that $t-1\geq h_{j,k}$. Hence,
+    applying the lemma:
+    \begin{displaymath}
+        \forall k\in N(j)\setminus\{i\},\;\forall x_j\in\gx_j,\quad
+        \mu_{k,j}^{t-1}(x_j) = M_{k,j}(x_j)
+    \end{displaymath}
+    Using this in \eqref{eq:message-t}, we obtain the exact same right-hand
+    term as in \eqref{eq:recurse} and:
+    \begin{displaymath}
+        \forall x_i\in\gx_i,\quad \mu_{j,i}^t(x_i) = M_{j,i}(x_i)
+    \end{displaymath}
+    which concludes the proof of the lemma.
+\end{proof}
+
+Let us denote by $D$ the diameter of $T$, that is, $D\defeq \max_{(i,j)\in
+V^2} d(i,j)$, where $d(i,j)$ denotes the length of the shortest path from $i$
+to $j$ in $T$.
+
+The previous lemma has the simple following consequence.
+
+\begin{theorem}
+    Let $(i,j)$ be any edge of $T$, then:
+    \begin{displaymath}
+        \forall t \geq D-1,\;\forall x_i\in\gx_i,\quad
+        \mu_{j,i}^t(x_i) = M_{j,i}(x_i)
+    \end{displaymath}
+\end{theorem}
+
+\begin{proof}
+    It suffices to observe that $D = 1 + max_{(i,j)\in E}\, h_{j,i}$ and to
+    apply the lemma.
+\end{proof}
+
+This implies the following simple mean of computing the marginals of $p$:
+compute the message updates in \eqref{eq:message-t} $D-1$ times. Then, all the
+messages will have \emph{converged} to the quantities $M_{j,i}$, $(i,j)\in E$.
+Finally, the marginals are given by formula \eqref{eq:marginal}.
+
+\vspace{\baselineskip}
+
+\textbf{TODO:} remarks, time/space complexity. It's time optimal in some sense.
+Practical considerations to implement the algorithm.
+\end{document}
-- 
cgit v1.2.3-70-g09d2