\documentclass{article}
\title{Message Passing on Trees: Analysis of the Sum-Product Algorithm}
\author{Thibaut Horel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amsfonts, amsthm}
\newcommand{\set}[1]{\mathbf{#1}}
\newcommand{\gx}{\mathcal{X}}
\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\fa}{\phi}
\newcommand*{\defeq}{\equiv}
\newtheorem*{lemma}{Lemma}
\newtheorem*{theorem}{Theorem}
\usepackage[pagebackref=true,breaklinks=true,colorlinks=true]{hyperref}
\begin{document}
\maketitle

\section{Introduction}

Let $\gx_1,\ldots,\gx_n$ be $n$ finite sets, $n\in\set{N}$. We consider,
a discrete probability distribution $p$ over
$\boldsymbol{\gx}\defeq\gx_1\times\ldots\times\gx_n$ which \emph{factorizes}
according to the undirected tree $T = (V, E)$ with $V = \{1,\ldots. n\}$. By
definition, this means that there exist functions
$\fa_i:\gx_i\rightarrow\set{R}^+$, $i\in V$, and functions
$\fa_{i,j}:\gx_i\times\gx_j\rightarrow\set{R}^+$, $(i,j)\in E$, such that:
\begin{equation}\label{eq:factor}
    p(x_1,\ldots,x_n) = \kappa\prod_{i\in V}\fa_i(x_i)
    \prod_{(i,j)\in E} \fa_{i,j}(x_i, x_j),
    \quad
    (x_1,\ldots,x_n)\in\boldsymbol\gx
\end{equation}
where $\kappa$ is a normalization constant which ensures that $p$ sums to one
over $\boldsymbol\gx$.

From now on, vectors will be denoted by bold letters (\emph{e.g.}
$\bx\in\boldsymbol\gx$), while their coordinates will be denoted by regular
letters (\emph{e.g.} $(x_1\ldots, x_n)\in\boldsymbol\gx$). If $S$ is a subset
of $V$, and $\bx$ is a vector indexed by $V$, $\bx_S$ denotes the sub-vector
obtained by keeping from $\bx$ only its coordinates in $S$ (when the ordering
of the coordinates does not matter and there is no ambiguity).  Finally, for
a node $i\in V$, $N(i)$ denotes the set of its neighbors: $N(i)\defeq \{j\in
V\,|\, (i,j)\in E\}$.

Our goal is to understand the behavior of the \emph{sum-product message passing
algorithm} when applied to computing marginals of the probability distribution
$p$. Formally, we want to compute for all $i\in V$, the marginal $p_i$:
\begin{equation}\label{eq:marginal}
    p_i(x) \defeq \sum_{\substack{\bx\in\gx\\ x_i = x}}p(\bx),
    \quad
    x\in\gx_i
\end{equation}

We recall that a tree can be \emph{rooted} at any of its nodes. By definition,
this amounts to designate a node as the root of the tree, hence defining
a partial ordering on the nodes of the tree. Once a tree is rooted, the notions
of \emph{subtree}, \emph{descendant}, \emph{ascendant} and \emph{leaves} are
defined with respect to the corresponding partial ordering.

Let us now choose $i\in V$ and root $T$ on $i$. Let $j\in N(i)$, we denote by
$T_{j,i} = (V_{j,i}, E_{j,i})$ the subtree rooted at $j$. Note that $i$ and
$T_{j,i}$ for $j\in N(i)$ form a subdivision of $T$ into disjoint subtrees.
Using this subdivision, and distributing the sum in \eqref{eq:marginal} over
the products in \eqref{eq:factor} accordingly:
\begin{equation}\label{eq:finish}
    p_i(x) = \kappa \fa_i(x)\prod_{j\in N(i)} M_{j,i}(x)
\end{equation}
where:
\begin{equation}\label{eq:limit}
    M_{j,i}(x) = \sum_{\bx\in\boldsymbol\gx_{V_{i,j}}}\fa_{i,j}(x,x_j)
        \prod_{k\in V_{j,i}}\fa_k(x_k)
        \prod_{(k,l)\in E_{j,i}}\fa_{k,l}(x_k, x_l)
\end{equation}
Note that by keeping upfront the summation over $\gx_j$ in \eqref{eq:limit}
while distributing the remaining summations over the products, we get:
\begin{equation}\label{eq:recurse}
    M_{j,i}(x) = \sum_{x_j\in\gx_j}\fa_{i,j}(x, x_j)\fa_j(x_j)\prod_{k\in
    N(j)\setminus\{i\}}M_{k,j}(x_j)
\end{equation}

The form obtained in \eqref{eq:recurse} suggests a recursive algorithm to
compute $p_i$: by starting from the leaves of $T$ rooted at $i$, the quantities
$M_{l,k}$, $(k,l)\in E$ can be computed recursively, ultimately leading to the
computation of $p_i$ as given by \eqref{eq:finish}.

\section{The sum-product algorithm}

The recursion above is at the core of the sum-product algorithm. The notable
difference being that all the marginals are computed at the same time, thus
reestablishing the symmetry which was lost when rooting $T$ on $i$. As
a consequence, a node $j$ cannot rely on its neighbors having properly computed
the quantities $M_{k,j}$, $k\in N(j)\setminus\{i\}$ \emph{prior to} computing
$M_{j,i}$ and \eqref{eq:recurse} cannot be treated as an exact recursion.
Instead, the sum-product algorithm is described in terms of time steps: at
each time step, each node sends messages to its neighbors, the messages
being computed from the messages received at the previous time step in a way
paralleling \eqref{eq:recurse}.

Formally, for all $(i,j)$ in $E$, we define $|\gx_i|$ messages that are sent
from $j$ to $i$ at step $t\in\set{N}^*$:
\begin{align}
    \mu^0_{j,i}(x_i)&=\sum_{x_j\in\gx_j}\fa_{i,j}(x_i,x_j)\fa_j(x_j),
    \quad x_i\in\gx_i\\
    \label{eq:message-t}
    \mu_{j,i}^t(x_i)&= \sum_{x_j\in\gx_j}\fa_{i,j}(x_i,x_j)\fa_j(x_j)
    \prod_{k\in N(j)\setminus\{i\}}\mu_{k,j}^{t-1}(x_j),
    \quad x_i\in\gx_i
\end{align}

Let us denote by $h_{j,i}$ the height of the subtree rooted at $j$ in $T$
rooted at $i$:
\begin{displaymath}
    h_{j,i} \defeq
    \begin{cases}
        0 & \text{if $j$ is a leaf in $T$ rooted at $i$}\\
        1 + \max_{k\in N(j)\setminus\{i\}} h_{k,j} & \text{otherwise}
    \end{cases}
\end{displaymath}
Then, the following lemma holds.

\begin{lemma}
    Let $(i,j)\in E$ be any edge of $T$, then:
    \begin{displaymath}
        \forall t\geq h_{j,i},\;\forall x_i\in\gx_i,\quad
        \mu_{j,i}^t(x_i) = M_{j,i}(x_i)
    \end{displaymath}
\end{lemma}

\begin{proof}
    We will prove this lemma by induction on $h_{j,i}$. The initialization is
    straightforward: if $h_{j,i} = 0$, then the products in \eqref{eq:message-t}
    as well as in \eqref{eq:recurse} are empty, and for all $t\geq 0$,
    $\mu_{j,i}^t(x_i) = \mu_{j,i}^0(x_i) = M_{j,i}(x_i)$.

    For the inductive step, let us consider an edge $(i,j)\in E$, with
    associated height $h_{j,i}$ and a time $t\geq h_{j,i}$. By definition of
    $h_{j,i}$, for all $k\in N(j)\setminus\{i\}$, $h_{k,j}<h_{j,i}$.
    Furthermore, since $t\geq h_{j,i}$, we have that $t-1\geq h_{j,k}$. Hence,
    applying the lemma:
    \begin{displaymath}
        \forall k\in N(j)\setminus\{i\},\;\forall x_j\in\gx_j,\quad
        \mu_{k,j}^{t-1}(x_j) = M_{k,j}(x_j)
    \end{displaymath}
    Using this in \eqref{eq:message-t}, we obtain the exact same right-hand
    term as in \eqref{eq:recurse} and:
    \begin{displaymath}
        \forall x_i\in\gx_i,\quad \mu_{j,i}^t(x_i) = M_{j,i}(x_i)
    \end{displaymath}
    which concludes the proof of the lemma.
\end{proof}

Let us denote by $D$ the diameter of $T$, that is, $D\defeq \max_{(i,j)\in
V^2} d(i,j)$, where $d(i,j)$ denotes the length of the shortest path from $i$
to $j$ in $T$.

The previous lemma has the simple following consequence.

\begin{theorem}
    Let $(i,j)$ be any edge of $T$, then:
    \begin{displaymath}
        \forall t \geq D-1,\;\forall x_i\in\gx_i,\quad
        \mu_{j,i}^t(x_i) = M_{j,i}(x_i)
    \end{displaymath}
\end{theorem}

\begin{proof}
    It suffices to observe that $D = 1 + max_{(i,j)\in E}\, h_{j,i}$ and to
    apply the lemma.
\end{proof}

This implies the following simple mean of computing the marginals of $p$:
compute the message updates in \eqref{eq:message-t} $D-1$ times. Then, all the
messages will have \emph{converged} to the quantities $M_{j,i}$, $(i,j)\in E$.
Finally, the marginals are given by formula \eqref{eq:marginal}.

\vspace{\baselineskip}

\textbf{TODO:} remarks, time/space complexity. It's time optimal in some sense.
Practical considerations to implement the algorithm.
\end{document}