\documentclass{article}
\title{Information-Theoretic Analysis of the Complexity of Comparison-Based Sorting}
\author{Thibaut Horel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amsfonts, amsthm}
\newcommand{\set}[1]{\mathbf{#1}}
\newcommand{\gx}{\mathcal{X}}
\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\fa}{\phi}
\newcommand*{\defeq}{\equiv}
\newtheorem*{lemma}{Lemma}
\newtheorem*{theorem}{Theorem}
\newtheorem*{proposition}{Proposition}
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\let\P\relax
\DeclareMathOperator{\P}{\mathbb{P}}
\DeclareMathOperator{\E}{\mathbb{E}}
\usepackage[pagebackref=true,breaklinks=true,colorlinks=true]{hyperref}
\begin{document}
\maketitle

We give an information-theoretic analysis of the complexity of comparison-based
sorting algorithms. We first give a proof of the standard $\Omega(n\log n)$
lower bound on the worst-case complexity of sorting. While conceptually
equivalent to the ``standard proof'', our presentation shows how the
information-theoretic framework provides a simple and unified approach which
naturally generalizes to average-case complexity and randomized algorithms.

\paragraph{Model.} Let $\mathfrak{S}(E)$ denote the symmetric group of
permutations of a set $E$ of $n$ elements. We assume that $E$ is totally
ordered and write $e_1,\dots,e_n$ for its elements in sorted order.

A comparison-based algorithm is given as input a permuted array $[\sigma(e_1),
\dots, \sigma(e_n)]$ for some $\sigma\in\mathfrak{S}(E)$ and is allowed to
compare any two cells of this array. Formally, the algorithm has access to
$\sigma$ through \emph{comparison queries}; for $(i,
j)\in\{1,\dots,n\}^2$, the output of query $q(i, j)$ is given by:
\begin{displaymath}
    q(i, j) = \begin{cases}
        1&\text{if } \sigma(e_i)\leq \sigma(e_j)\\
        0&\text{otherwise}
    \end{cases}
\end{displaymath}
The output of the algorithm is the sorted array $[e_1,\dots,e_n]$.
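For concreteness, consider $n = 3$ and the permutation $\sigma$ defined by
$\sigma(e_1) = e_2$, $\sigma(e_2) = e_3$ and $\sigma(e_3) = e_1$: the input
array is $[e_2, e_3, e_1]$, and the queries answer, for instance, $q(1, 2) = 1$
(since $e_2\leq e_3$) and $q(1, 3) = 0$ (since $e_2 > e_1$).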

Note that the input array by itself does not reveal $\sigma$ to the algorithm,
since the sorted order of $E$ is not known a priori; the pair (input, output),
however, completely determines $\sigma$. In other words, the algorithm
discovers the permutation $\sigma$ through comparison queries. This point of
view is important for understanding the information-theoretic approach.

\paragraph{Worst-case complexity.} For a comparison-based sorting algorithm $A$
and $\sigma\in\mathfrak{S}(E)$ we denote by $c(A, \sigma)$ the number of
comparison queries performed by $A$ on input $\sigma$\footnote{Note that the
number of comparisons can depend on the input: for example, bubble sort with
early termination requires only $n-1$ comparisons on sorted input, but
$\Omega(n^2)$ comparisons in the worst case.}. The worst-case complexity $c_A$ of $A$ is
defined by $c_A\defeq\max_{\sigma\in\mathfrak{S}(E)} c(A,\sigma)$.

\begin{proposition}
    For any deterministic comparison-based sorting algorithm $A$, we have $c_A
    = \Omega(n\log n)$.
\end{proposition}

\begin{proof}
    Let $X$ be a random variable uniform over $\mathfrak{S}(E)$ and let
    $Y=(Y_1,\dots,Y_{c(A,X)})$ denote the sequence of answers to the queries
    performed by $A$ on input $X$. Since $A$ is deterministic, the answers
    determine the queries asked and the entire execution of $A$, in particular
    which input cell ends up in which output position, and therefore the
    permutation $X$ itself; the conditional entropy $H(X|Y)$ is thus $0$.
    Hence we can write:
    \begin{align}
        0 = H(X|Y) &= H(X, Y) - H(Y) \\
        &=H(X) - H(Y)\label{eq:2}\\
        &\geq H(X) - c_A\label{eq:3}\\
        &=\log(n!) - c_A \label{eq:4}
    \end{align}
    where \eqref{eq:2} follows from the fact that $Y$ is a deterministic
    function of $X$ and \eqref{eq:4} follows from the fact that $X$ is uniform
    over a set of size $n!$. \eqref{eq:3} follows from the bound $H(Y)\leq
    c_A$: each query only reveals one bit of information and there are at most
    $c_A$ queries. This last argument can be made formal. Since $A$ is
    deterministic, whether it stops after a given sequence of answers depends
    only on that sequence, so no possible value of $Y$ is a proper prefix of
    another possible value: the support of $Y$ is prefix-free and Kraft's
    inequality gives $\sum_y 2^{-|y|}\leq 1$, where $y$ ranges over the
    support of $Y$ and $|y|$ denotes the length of $y$. Hence:
    \begin{align}
        H(Y) &= \E[c(A, X)]
        + \sum_{y}\P[Y = y]\log\frac{2^{-|y|}}{\P[Y = y]}\label{eq:7}\\
        &\leq \E[c(A, X)] + \log\Bigl(\sum_{y} 2^{-|y|}\Bigr)
        \leq \E[c(A, X)]\label{eq:8}\\
        &\leq c_A\label{eq:9}
    \end{align}
    where all logarithms are in base $2$, \eqref{eq:7} is the definition of
    $H(Y)$ rewritten using $|Y| = c(A, X)$, the first inequality in
    \eqref{eq:8} is Jensen's inequality (concavity of the logarithm), the
    second follows from Kraft's inequality, and \eqref{eq:9} simply follows
    from the definition of worst-case complexity.

    We then conclude the proof by rewriting \eqref{eq:4} as $c_A\geq \log(n!)$
    and applying a standard Stirling approximation of the factorial.
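    Explicitly, with logarithms in base $2$, Stirling's formula gives:
    \begin{displaymath}
        \log(n!) = n\log n - n\log e + O(\log n) = \Omega(n\log n).
    \end{displaymath}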
\end{proof}

\begin{remark}
The above proof is written in a verbose way to make sure there is no confusion
    and to facilitate further generalizations, but the intuition is very
    simple: we gain $\log(n!)$ bits of information in $c_A$ queries and each
    query gives at most $1$ bit of information, hence $c_A\geq \log(n!)$.
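    For example, for $n = 3$ we have $\lceil\log_2(3!)\rceil
    = \lceil\log_2 6\rceil = 3$: two comparisons can distinguish at most four
    outcomes, so no algorithm can always sort three elements with two
    comparisons, while three comparisons suffice (e.g. by insertion sort).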
\end{remark}

\paragraph{Average-case complexity.} The average complexity of
a comparison-based sorting algorithm $A$ is defined by $C_A\defeq \E[c(A, X)]$
where $X$ is a uniform random variable over $\mathfrak{S}(E)$. We have the
trivial relationship between average-case and worst-case complexity: $c_A\geq
C_A$ since the expectation of a random variable is at most its supremum.
Perhaps surprisingly though, the average-case complexity of sorting is
asymptotically no smaller than its worst-case complexity: it is also
$\Omega(n\log n)$.

\begin{proposition}
    For any deterministic comparison-based sorting algorithm $A$, we have $C_A
    = \Omega(n\log n)$.
\end{proposition}

\begin{proof}
    The proof is the same as the previous one: \eqref{eq:8} already provides
    the bound $H(Y)\leq\E[c(A, X)] = C_A$, and using it in place of $H(Y)\leq
    c_A$ in \eqref{eq:3} yields $C_A\geq\log(n!)$. In other words, the
    previous proof was already a proof of the average-case lower bound,
    implicitly using the fact that $c_A\geq C_A$.
\end{proof}
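
\begin{remark}
    These lower bounds are essentially tight (standard facts which we do not
    prove here): merge sort sorts any input with $O(n\log n)$ comparisons, and
    randomized quicksort uses $2n\ln n + O(n)\approx 1.39\, n\log_2 n$
    comparisons in expectation.
\end{remark}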

\paragraph{Randomized algorithms.} We now consider the case where $A$ can be
a randomized algorithm, \emph{i.e.} a distribution over deterministic
algorithms. This implies that the query answers $Y_1, \dots, Y_{c(A, X)}$
obtained by $A$ on input $X$ are no longer a deterministic function of $X$. The
complexity of $A$ is then similarly defined by $C_A \defeq \E[c(A, X)]$ where the expectation is
taken over both the randomness of $A$ and the randomness of $X$ (uniform over
$\mathfrak{S}(E)$).
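Equivalently, writing $R$ for the random bits used by $A$ and $A_r$ for the
deterministic algorithm obtained by fixing $R = r$, this definition unrolls as:
\begin{displaymath}
    C_A = \E_R\bigl[\E_X[c(A_R, X)]\bigr] = \sum_r \P[R = r]\, C_{A_r},
\end{displaymath}
i.e. $C_A$ is the average, over the randomness of $A$, of the average-case
complexities of the deterministic algorithms that $A$ mixes over.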

\begin{remark}
    One could also define the \emph{worst-case expected complexity}
    $C_A'=\linebreak\max_{\sigma}\E[c(A, \sigma)]$, but as before we have
    $C_A'\geq C_A$. As a consequence, a lower-bound on $C_A$ (randomized
    algorithm and random input) is, in some sense, the strongest result we can
    achieve.
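    Indeed, $C_A\leq C_A'$ simply because an average over $\sigma$ is at most
    the corresponding maximum:
    \begin{displaymath}
        C_A = \E\bigl[\E[c(A, X)\mid X]\bigr]
        \leq \max_{\sigma\in\mathfrak{S}(E)}\E[c(A, \sigma)] = C_A'.
    \end{displaymath}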
\end{remark}

\begin{proposition}
    For a (possibly randomized) comparison-based sorting algorithm $A$, we have
    $C_A=\Omega(n\log n)$.
\end{proposition}

\begin{proof}
    Fix the random bits of $A$: for every possible value $r$ of the random
    bits, the resulting algorithm $A_r$ is deterministic, so the previous
    proof gives $\E[c(A_r, X)]\geq\log(n!)$. Averaging over the random bits
    then yields $C_A\geq\log(n!)$. Note that this conditioning is needed:
    without knowing the random bits, the answers $Y$ alone no longer determine
    which queries were asked, so the identity $H(X|Y) = 0$ used in the
    deterministic case may fail.
\end{proof}

\paragraph{TODO.} Talk about the case where $E$ is a multiset, i.e. some keys
can be repeated, entropy-optimal sorting, 3-way quicksort, etc.

\end{document}