From 036659c719079a9782cd3c6443aabc0ab802e5c3 Mon Sep 17 00:00:00 2001
From: Thibaut Horel
Date: Fri, 11 Nov 2016 12:33:08 -0500
Subject: Sorting lower-bound via information theory

---
 sorting/main.tex | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 sorting/main.tex

diff --git a/sorting/main.tex b/sorting/main.tex
new file mode 100644
index 0000000..7325e67
--- /dev/null
+++ b/sorting/main.tex
@@ -0,0 +1,155 @@
+\documentclass{article}
+\title{Information-Theoretic Analysis of the Complexity of Comparison-Based Sorting}
+\author{Thibaut Horel}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath, amsfonts, amsthm}
+\newcommand{\set}[1]{\mathbf{#1}}
+\newcommand{\gx}{\mathcal{X}}
+\newcommand{\bx}{\boldsymbol{x}}
+\newcommand{\fa}{\phi}
+\newcommand*{\defeq}{\equiv}
+\newtheorem*{lemma}{Lemma}
+\newtheorem*{theorem}{Theorem}
+\newtheorem*{proposition}{Proposition}
+\theoremstyle{remark}
+\newtheorem*{remark}{Remark}
+\let\P\relax
+\DeclareMathOperator{\P}{\mathbb{P}}
+\DeclareMathOperator{\E}{\mathbb{E}}
+\usepackage[pagebackref=true,breaklinks=true,colorlinks=true]{hyperref}
+\begin{document}
+\maketitle
+
+We give an information-theoretic analysis of the complexity of comparison-based
+sorting algorithms. We first prove the standard $\Omega(n\log n)$ lower bound
+on the worst-case complexity of sorting. While this proof is conceptually
+equivalent to the ``standard proof'', it shows how the information-theoretic
+framework provides a simple and unified approach which naturally generalizes
+to average-case complexity and to randomized algorithms.
+
+\paragraph{Model.} Let $\mathfrak{S}(E)$ be the symmetric group of
+permutations of a set $E$ of $n$ elements. We assume that $E$ is totally
+ordered and write $e_1,\dots,e_n$ for its elements in sorted order.
+
+A comparison-based algorithm is given as input a permuted array
+$[\sigma(e_1),\dots,\sigma(e_n)]$ for some $\sigma\in\mathfrak{S}(E)$ and is
+allowed to compare any two cells of this array. Formally, the algorithm has
+access to $\sigma$ through \emph{comparison queries}: for
+$(i,j)\in\{1,\dots,n\}^2$, the output of query $q(i,j)$ is given by
+\begin{displaymath}
+    q(i, j) = \begin{cases}
+        1&\text{if } \sigma(e_i)\leq \sigma(e_j)\\
+        0&\text{otherwise.}
+    \end{cases}
+\end{displaymath}
+The output of the algorithm is the sorted array $[e_1,\dots,e_n]$.
+
+Note that the input of the algorithm is not by itself a complete
+representation of $\sigma$, but the pair (input, output) is. In other words,
+the algorithm discovers the permutation $\sigma$ through comparison queries.
+This point of view is important to understand the information-theoretic
+approach.
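+
+For example, take $E=\{1,2,3\}$, so that $e_1=1$, $e_2=2$, $e_3=3$, and let
+$\sigma$ be the permutation exchanging $e_1$ and $e_3$. The input array is
+$[\sigma(e_1),\sigma(e_2),\sigma(e_3)]=[3,2,1]$, and for instance $q(1,3)=0$
+since $\sigma(e_1)=3>1=\sigma(e_3)$. The answers to the three queries
+$q(1,2)$, $q(1,3)$ and $q(2,3)$ give the relative order of the cells of the
+array and thus jointly identify $\sigma$ among the $3!=6$ permutations of
+$E$.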
+
+\paragraph{Worst-case complexity.} For a comparison-based sorting algorithm
+$A$ and $\sigma\in\mathfrak{S}(E)$, we denote by $c(A, \sigma)$ the number of
+comparison queries performed by $A$ on input $\sigma$\footnote{Note that the
+number of comparisons can depend on the input: for example, bubble sort on a
+sorted input only requires $n-1$ comparisons, but requires $\Omega(n^2)$
+comparisons in the worst case.}. The worst-case complexity $c_A$ of $A$ is
+defined by $c_A\defeq\max_{\sigma\in\mathfrak{S}(E)} c(A,\sigma)$.
+
+\begin{proposition}
+    For any deterministic comparison-based sorting algorithm $A$, we have
+    $c_A = \Omega(n\log n)$.
+\end{proposition}
+
+\begin{proof}
+    Let us consider a random variable $X$ uniform over $\mathfrak{S}(E)$ and
+    let us denote by $Y=(Y_1,\dots,Y_{c(A,X)})$ the outputs of the queries
+    performed by $A$. Since $A$ is deterministic, the sequence of query
+    answers determines the entire execution of $A$, and correctness forces
+    distinct permutations to produce distinct answer sequences; hence the
+    permutation is completely determined at the end of the execution of $A$
+    and the conditional entropy $H(X|Y)$ is $0$. We can thus write:
+    \begin{align}
+        0 = H(X|Y) &= H(X, Y) - H(Y) \\
+        &= H(X) - H(Y)\label{eq:2}\\
+        &\geq H(X) - c_A\label{eq:3}\\
+        &= \log(n!) - c_A \label{eq:4}
+    \end{align}
+    where \eqref{eq:2} follows from the fact that $Y$ is a deterministic
+    function of $X$ and \eqref{eq:4} follows from the fact that $X$ is
+    uniform over a set of size $n!$ (here and below, $\log$ and entropies
+    are in base $2$). \eqref{eq:3} follows from the bound $H(Y)\leq c_A$:
+    each query only reveals one bit of information and there are at most
+    $c_A$ queries. This last argument can be made formal. For $1\leq i\leq
+    c_A$, define $\hat{Y}_i = Y_i$ if $i\leq c(A,X)$ and $\hat{Y}_i = 0$
+    otherwise. Since $A$ is deterministic, whether an $i$-th query is
+    performed is determined by the answers to the first $i-1$ queries; in
+    particular, $Y$ can be recovered from
+    $\hat{Y}=(\hat{Y}_1,\dots,\hat{Y}_{c_A})$ by running $A$ and using
+    $\hat{Y}$ as an answer oracle. Hence:
+    \begin{align}
+        H(Y) &\leq H(\hat{Y}_1,\dots,\hat{Y}_{c_A})
+        = \sum_{i=1}^{c_A} H(\hat{Y}_i\,|\,\hat{Y}_1,\dots,\hat{Y}_{i-1})\label{eq:7}\\
+        &\leq \sum_{i=1}^{c_A} \P[c(A,X)\geq i] = \E[c(A,X)]\label{eq:8}\\
+        &\leq c_A\label{eq:9}
+    \end{align}
+    where \eqref{eq:7} combines the fact that $Y$ is a function of $\hat{Y}$
+    with the chain rule for entropy, and \eqref{eq:8} holds because the
+    event $\{c(A,X)\geq i\}$ is determined by
+    $\hat{Y}_1,\dots,\hat{Y}_{i-1}$: conditioned on previous answers for
+    which $c(A,X)<i$, the variable $\hat{Y}_i$ is identically $0$, while
+    otherwise it is a single bit, of entropy at most $1$; the last equality
+    in \eqref{eq:8} is the identity
+    $\E[c(A,X)]=\sum_{i\geq 1}\P[c(A,X)\geq i]$, where the sum can be
+    truncated at $c_A$ since $c(A,X)\leq c_A$. Finally, \eqref{eq:9} simply
+    follows from the definition of worst-case complexity.
+
+    We then conclude the proof by rewriting \eqref{eq:4} as $c_A\geq
+    \log(n!)$ and applying a standard Stirling approximation of the
+    factorial.
+\end{proof}
+
+\begin{remark}
+    The above proof is written in a verbose way to avoid any confusion and
+    to facilitate further generalizations, but the intuition is very simple:
+    we gain $\log(n!)$ bits of information in at most $c_A$ queries and each
+    query gives at most $1$ bit of information, hence $c_A\geq \log(n!)$.
+\end{remark}
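+
+The Stirling step can be made explicit by comparing the sum
+$\log(n!)=\sum_{k=1}^{n}\log k$ with an integral: since $\log$ is
+non-decreasing,
+\begin{displaymath}
+    \log(n!) = \sum_{k=1}^{n}\log k \geq \int_{1}^{n}\log x\,\mathrm{d}x
+    = n\log n - (n-1)\log e,
+\end{displaymath}
+so the bound $c_A\geq\log(n!)$ gives the explicit estimate
+$c_A\geq n\log n - O(n) = \Omega(n\log n)$.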
+
+\paragraph{Average-case complexity.} The average-case complexity of
+a comparison-based sorting algorithm $A$ is defined by $C_A\defeq \E[c(A, X)]$
+where $X$ is a uniform random variable over $\mathfrak{S}(E)$. We have the
+trivial relationship $c_A\geq C_A$ between worst-case and average-case
+complexity, since the expectation of a random variable is at most its
+supremum. Perhaps surprisingly though, sorting is asymptotically no easier
+on average: the average-case complexity obeys the same $\Omega(n\log n)$
+lower bound as the worst-case complexity.
+
+\begin{proposition}
+    For any deterministic comparison-based sorting algorithm $A$, we have
+    $C_A = \Omega(n\log n)$.
+\end{proposition}
+
+\begin{proof}
+The proof is exactly the same as the previous proof, except that we observe
+    that \eqref{eq:8} is exactly providing the stronger bound
+    $H(Y)\leq\E[c(A,X)] = C_A$. In other words, the previous proof was
+    already a proof of the average-case lower bound, implicitly combined
+    with the fact that $c_A\geq C_A$.
+\end{proof}
+
+\paragraph{Randomized algorithms.} We now consider the case where $A$ can be
+a randomized algorithm, \emph{i.e.} a distribution over deterministic
+algorithms. This implies that the queries $Y_1, \dots, Y_{c(A, X)}$ made by
+$A$ on input $X$ are no longer a deterministic function of $X$. The
+complexity of $A$ is then similarly defined by $C_A \defeq \E[c(A, X)]$
+where the expectation is taken over both the randomness of $A$ and the
+randomness of $X$ (uniform over $\mathfrak{S}(E)$).
+
+\begin{remark}
+    One could also define the \emph{worst-case expected complexity}
+    $C_A'=\max_{\sigma}\E[c(A, \sigma)]$, but as before we have $C_A'\geq
+    C_A$. As a consequence, a lower bound on $C_A$ (randomized algorithm and
+    random input) is, in some sense, the strongest result we can achieve.
+\end{remark}
+
+\begin{proposition}
+    For a (possibly randomized) comparison-based sorting algorithm $A$, we
+    have $C_A=\Omega(n\log n)$.
+\end{proposition}
+
+\begin{proof}
+Again, the proof is essentially the one we already saw, with every entropy
+    conditioned on the random string $R$ of the algorithm, drawn
+    independently of $X$. Note that $Y$ alone need no longer determine $X$
+    (the same answer sequence may arise from different permutations under
+    different random strings), but conditioned on $R=r$ the algorithm is
+    deterministic, so $H(X|Y,R) = 0$. Moreover, $H(X|R) = H(X) = \log(n!)$
+    by independence of $X$ and $R$, and applying the previous argument to
+    each fixed $r$ yields $H(Y|R)\leq\E[c(A,X)] = C_A$. Hence
+    \begin{displaymath}
+        0 = H(X|Y,R) = H(X|R) - I(X;Y|R) \geq \log(n!) - H(Y|R)
+        \geq \log(n!) - C_A,
+    \end{displaymath}
+    and we conclude as before.
+\end{proof}
+
+\paragraph{TODO.} Talk about the case where $E$ is a multiset
+(\emph{i.e.} some keys can be repeated): entropy-optimal sorting, 3-way
+quicksort, etc.
+
+\end{document}
-- 
cgit v1.2.3-70-g09d2