author:    Thibaut Horel <thibaut.horel@gmail.com>  2014-09-11 11:39:46 -0400
committer: Thibaut Horel <thibaut.horel@gmail.com>  2014-09-11 11:39:46 -0400
commit:    2240f5cc3708235143be6efb48e3fb93b2bf1f8a (patch)
tree:      892eb44114dc2ab7c034f1e925991f01e06fa310 /lecture/main.tex
parent:    b61804836ac2e35f636540d5a170486d9f845973 (diff)
download:  cs224-2240f5cc3708235143be6efb48e3fb93b2bf1f8a.tar.gz
Some more details and Jelani's additions
Diffstat (limited to 'lecture/main.tex')
 -rw-r--r--  lecture/main.tex | 171
 1 file changed, 118 insertions(+), 53 deletions(-)
diff --git a/lecture/main.tex b/lecture/main.tex
index 3edd05e..bef50ba 100644
--- a/lecture/main.tex
+++ b/lecture/main.tex
@@ -56,18 +56,16 @@
\begin{document}
\lecture{Lecture 3 --- Tuesday Sep 09, 2014}{Fall 2014}{Prof.\ Jelani Nelson}{Thibaut Horel}
-}
+
\section{Overview}
-Today, hashing
-\begin{itemize}
- \item load balancing
- \item k-wise independence
- \item dictionary (perfect hashing, linear probing)
- \item retrieval problem (cuckoo hashing, bloomier filters)
- \item membership, bloom filters
- \item simple tabulation
-\end{itemize}
+In the previous lecture we finished covering data structures for the
+predecessor problem by studying Fusion Trees.
+
+In this lecture, we cover several notions related to hashing: using hashing
+for load balancing, $k$-wise independent families of hash functions, and the
+dictionary problem, for which we show two solutions: chaining with linked
+lists and linear probing.
\section{Hashing}
@@ -126,7 +124,7 @@ The Chernoff bound can be proved using Markov's inequality.
Using the Chernoff bound, we can prove the following proposition.
\begin{proposition}
- For the balls and bins random process, with $m=n$, then:
+ For the balls and bins random process, with $m=n$, for any $c > 1$:
\begin{displaymath}
\Pr\left(\exists \text{ machine with more than $\frac{c\log n}{\log\log
n}$ jobs}\right)
@@ -147,16 +145,18 @@ Using the Chernoff bound, we can prove the following proposition.
\begin{displaymath}
P\left(X\geq \frac{c\log n}{\log \log n}\right) \leq
\left(\frac{e}{c\log n/\log\log n}\right)^{\frac{c\log
- n}{\log\log n}} = \left(\frac{e}{k}\right)^k\leq\frac{1}{n^c}
+ n}{\log\log n}} = \left(\frac{e}{k}\right)^k
\end{displaymath}
- where $k=\frac{c\log n}{\log \log n}$. The last inequality comes from the
- fact that the solution to $k^k = n^c$ is $k=O(\log n/\log\log n)$. This
+ where $k=\frac{c\log n}{\log \log n}$. It is easy to see that for any
+ $\eps>0$ and for $n$ large enough, $\left(\frac{e}{k}\right)^k \leq
+ \frac{1}{n^{c-\eps}}$. This
implies, by using a union bound:
\begin{displaymath}
\Pr(\exists\text{ overloaded machine})
\leq \sum_{t=1}^n \Pr (t\text{ is overloaded})
- \leq \frac{1}{n^{c-1}}\qedhere
+ \leq \frac{1}{n^{c-\eps-1}}
\end{displaymath}
+ Choosing $\eps < c-1$ is enough to conclude the proof.
\end{proof}
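+
+To see where this bound on $\left(\frac{e}{k}\right)^k$ comes from (a quick
+sanity check, not part of the original note), take logarithms: with
+$k=\frac{c\log n}{\log\log n}$,
+\begin{displaymath}
+ \log\left(\frac{e}{k}\right)^k = k(1-\log k)
+ = -\frac{c\log n}{\log\log n}\big(\log\log n - \log\log\log n + \log c - 1\big)
+ = -c\log n\,(1-o(1)),
+\end{displaymath}
+so that $\left(\frac{e}{k}\right)^k = n^{-c(1-o(1))}\leq n^{-(c-\eps)}$ for
+$n$ large enough.
+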
We propose an alternative proof which does not rely on the Chernoff bound.
@@ -198,14 +198,16 @@ the following definitions.
for all $k$.
\end{example}
-\begin{example}[$u=m=p$, $p$ prime] We define
- $\H = \big\{h: h(x) = \sum_{i=0}^{k-1} a_ix^i\bmod p\big\}$ where
- $a_i\in\{0,\ldots,p-1\}$. By using polynomial interpolation, we know that
- for any \mbox{$(x_1,\ldots, x_k, y_1,\ldots,y_k)\in[p]^{2k}$} there exists
- a unique polynomial $h\in\H$ such that $h(x_1) = y_1,\ldots, h(x_k) = y_k$.
- Since there are $p^k$ elements in $\H$, if $h\in\H$ is drawn uniformly at
- random, $\Pr(h(x_1) = y_1,\ldots,h(x_k) = y_k) = 1/p^k$, which shows that
- $\H$ is $k$-wise independent.
+\begin{example}[$u=m=p$, $p$ prime] We define $\H = \big\{h: h(x)
+ = \sum_{i=0}^{k-1} a_ix^i\bmod p\big\}$ where $a_i\in\{0,\ldots,p-1\}$. By
+ using polynomial interpolation, we know that for any \mbox{$(x_1,\ldots,
+ x_k, y_1,\ldots,y_k)\in[p]^{2k}$} there exists a unique polynomial $h\in\H$
+ such that $h(x_1) = y_1,\ldots, h(x_k) = y_k$. Since there are $p^k$
+ elements in $\H$, if $h\in\H$ is drawn uniformly at random, $\Pr(h(x_1)
+ = y_1,\ldots,h(x_k) = y_k) = 1/p^k$, which shows that $\H$ is $k$-wise
+ independent. It is also easy to see that this family is not
+ \mbox{$(k+1)$-wise} independent: again using polynomial interpolation, once
+ $h(x_1),\ldots,h(x_k)$ are fixed, $h(x_{k+1})$ is entirely determined.
\end{example}
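+
+As a quick illustration (our own Python sketch, not part of the lecture; the
+prime and the function names are arbitrary choices), one can sample from this
+family and evaluate $h$ with Horner's rule:
+\begin{verbatim}
+import random
+
+def sample_hash(k, p):
+    """Draw h uniformly from the k-wise independent polynomial family."""
+    coeffs = [random.randrange(p) for _ in range(k)]  # a_0, ..., a_{k-1}
+    def h(x):
+        acc = 0
+        for a in reversed(coeffs):  # Horner's rule for sum_i a_i x^i mod p
+            acc = (acc * x + a) % p
+        return acc
+    return h
+
+p = 2**31 - 1           # a Mersenne prime, arbitrary choice
+h = sample_hash(2, p)   # a pairwise (2-wise) independent hash into [p]
+print(h(42), h(43))
+\end{verbatim}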
\begin{example}[$u=p$, $p$ prime\footnote{In practice, we do not lose by
@@ -213,7 +215,9 @@ the following definitions.
exists a prime between $u$ and $2u$.}]
We define $\H = \big\{h: h(x) = \big(\sum_{i=0}^{k-1}a_i x^i \bmod p\big)
\bmod m\big\}$. We see that $\Pr(h(x_1) = y_1,\ldots, h(x_k) = y_k)\leq
- \left(\frac{2}{m}\right)^k$.
+ \left(\frac{2}{m}\right)^k$. Note that in this example,
+ $h(x_1),\ldots,h(x_k)$ do not behave exactly like $k$ independent uniformly
+ distributed variables, but this bound is sufficient in most applications.
\end{example}
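+
+Concretely (continuing the Python sketch above), this family only adds
+a final reduction modulo the table size $m$:
+\begin{verbatim}
+m = 1024                # table size, arbitrary choice
+h = sample_hash(2, p)   # polynomial hash into [p], as before
+def h_table(x):
+    return h(x) % m     # lands in {0, ..., m-1}
+\end{verbatim}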
\section{Dictionary problem}
@@ -238,65 +242,126 @@ The running time of an operation on key $k$ is linear in the length of the
linked list at $h(k)$. It is possible to show that $\E_h[\text{length of list
at $h(k)$}]= O(1)$ if $m\geq n$.
+For details of the analysis, see Lecture Notes 12 from the Spring 2014 offering of CS 124, or see \cite[Section 11.2]{CLRS}.
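+
+As a concrete illustration, here is a minimal Python sketch of chaining (the
+class and names are ours, not code from the course):
+\begin{verbatim}
+# Chaining: each of the m cells of the table holds a (possibly empty)
+# list of (key, value) pairs.
+class ChainedDict:
+    def __init__(self, m, h):
+        self.m = m                 # number of cells
+        self.h = h                 # hash function into [m]
+        self.table = [[] for _ in range(m)]
+
+    def insert(self, key, value):
+        bucket = self.table[self.h(key) % self.m]
+        for i, (k, _) in enumerate(bucket):
+            if k == key:           # key already present: overwrite
+                bucket[i] = (key, value)
+                return
+        bucket.append((key, value))
+
+    def query(self, key):
+        # Running time is linear in the length of the list at h(key).
+        for k, v in self.table[self.h(key) % self.m]:
+            if k == key:
+                return v
+        return None
+\end{verbatim}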
+
\subsection{Perfect hashing}
+In this problem there is some set $S\subset[u]$ where $|S| = n$, and we would like to find a hash function $h:[u]\rightarrow[m]$ (hashing into the $m$ cells of some array) which acts injectively on $S$. This was mentioned in class, but no details were given -- for the interested reader, see the lecture notes on perfect hashing from the Spring 2014 offering of CS 124, or read \cite[Section 11.5]{CLRS}, or see the original paper \cite{FKS84}.
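+
+One simple way to find such an $h$ when quadratic space is acceptable (a
+sketch based on the standard birthday-bound argument, not the FKS
+construction; the function names are ours): with $m = n^2$ and $h$ drawn from
+a pairwise independent family, $h$ is injective on $S$ with probability at
+least $1/2$, so resampling a few times suffices.
+\begin{verbatim}
+def find_perfect_hash(S, sample_h):
+    # sample_h() draws h from a pairwise independent family into [m]
+    # with m = len(S)**2; each attempt is injective on S with
+    # probability at least 1/2 (birthday bound).
+    while True:
+        h = sample_h()
+        if len({h(x) for x in S}) == len(S):  # injective on S?
+            return h
+\end{verbatim}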
+
\subsection{Linear probing}
In real architectures, sequential memory accesses are much faster. A solution
to the dictionary problem with better spatial locality is \emph{linear
-probing}. Array $A$ store $k$ at $h(k)$. If the spot is already taken, keep
-walking to the right until you find and empty spot and store the key there.
-
-Linear probing dates back at least to 1954 where it was used in an assembly
-program written by Samuel, Amdahl and Boehme. It was first analyzed in a note
-published by Knuth \cite{Knuth63} in the case where $h$ is totally random ($\H$
-is the set of all functions). There was a breakthrough in
-
-This was suggested by (Samuel, Amdahl, Boehme, in '54) and first analyzed by
-Knuth in '63 assuming that $h$ is totally random ($\H$ is the set of all the
-functions). ($m=(1+\eps)n$, $\E(time per op) = O(1/\eps^2)$.
+probing}. In this solution, the hash table is an array $A$, but unlike the
+solution with chaining and linked lists, the values are stored directly in
+$A$. More specifically, the value associated with key $k$ is stored at $h(k)$.
+If this cell is already taken, we start walking to the right, one cell at
+a time\footnote{We wrap back to the beginning of the array when we reach its
+right edge.}, until we find an empty cell and insert the value in it.
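+
+As an illustration, here is a minimal Python sketch of linear probing (again
+our own code, not the course's; deletions are omitted since they require
+extra care under linear probing):
+\begin{verbatim}
+# Linear probing: keys and values are stored directly in the arrays.
+class LinearProbingDict:
+    def __init__(self, m, h):
+        self.m = m
+        self.h = h                      # hash function into [m]
+        self.keys = [None] * m
+        self.values = [None] * m
+
+    def insert(self, key, value):
+        # Assumes the table never becomes completely full.
+        i = self.h(key) % self.m
+        while self.keys[i] is not None and self.keys[i] != key:
+            i = (i + 1) % self.m        # walk right, wrapping around
+        self.keys[i] = key
+        self.values[i] = value
+
+    def query(self, key):
+        i = self.h(key) % self.m
+        while self.keys[i] is not None: # an empty cell ends the probe run
+            if self.keys[i] == key:
+                return self.values[i]
+            i = (i + 1) % self.m
+        return None
+\end{verbatim}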
-(Pagh, Pagh, Ruzic '09): breakthrough. for any $\H$ $5$-wise independent implies $\E
-time = O(1)$ ($m\geq cn$).
+\paragraph{History.} Linear probing dates back at least to 1954, when it was
+used in an assembly program written by Samuel, Amdahl and Boehme. It was
+first analyzed in a note published by Knuth \cite{Knuth63} in the case where
+$h$ is totally random ($\H$ is the set of all functions). In this note, he
+showed that when $m= (1+\eps)n$, $\E[\text{time per operation}]
+= O\left(\frac{1}{\eps^2}\right)$. There was a breakthrough in a paper by
+Pagh, Pagh and Ruzic \cite{ppr} where it was proved that for any $5$-wise
+independent family, $\E[\text{time per operation}] = O(1)$ when $m\geq cn$.
+Finally, Patrascu and Thorup \cite{pt} showed that $4$-wise independent
+families do not guarantee a constant running time per operation (that is,
+they provided an example of a $4$-wise independent family for which the
+expected running time is not constant).
-Patrascu, Thorup (ICALP '10) showed that $4$-wise doesn't guarantee $O(1)$.
+Here, we will prove the following proposition.
-\begin{claim}
- If $m\geq cn$ and $\H$ is $7$-wise independent, then $\E(\text{query time})
+\begin{proposition}\label{prop:unfinished}
+ If $m\geq cn$ and $\H$ is $7$-wise independent, then $\E[\text{query time}]
= O(1)$.
-\end{claim}
+\end{proposition}
-\begin{proof}
-How many spots to you have to touch before finding an empty spot. We are going
-to show that the expected length of a run is a constant.
-\end{proof}
+To prove this proposition, we need to be able to compute the expected number of
+cells we have to probe before finding an empty cell. We are going to show
+that this expected number is constant.
\begin{definition}
- Given an interval $I$ of cells in $A$, $L(I) = |\{i: h(i)\in
- I\}|$\footnote{we allow intervals to wrap around at the end of the table}.
+ Given an interval $I$ of cells in $A$, we define $L(I)=|\{i: h(i)\in
+ I\}|$, the number of keys hashing into $I$\footnote{We allow intervals to
+ wrap around at the end of the table.}.
\end{definition}
\begin{definition}
- $I$ is full is $|L(I)| \geq |I|$.
+ We say that $I$ is \emph{full} iff $L(I) \geq |I|$.
\end{definition}
\begin{lemma}
- If \texttt{query(x)} for some $x$ makes $k$ probes, then $h(x)$ is in some
- full interval of length at least $k$.
+ If \texttt{query(x)} for some $x$ makes $k$ probes, then there exist at
+ least $k$ full intervals containing $h(x)$.
\end{lemma}
\begin{proof}
+ Let $z_1$ be the position of the first empty cell to the left of $h(x)$ and
+ $z_2$ the position of the first empty cell to the right of $h(x)$. Each of
+ the intervals $[z_1+1, h(x)], [z_1+1, h(x)+1],\ldots,[z_1+1, z_2-1]$ is
+ full: all their cells are occupied, and the occupying keys must have hashed
+ into the interval itself, since cell $z_1$ is empty and probes only move to
+ the right. These intervals all contain $h(x)$, and there are $z_2-h(x)$ of
+ them. But by the assumption of the lemma, $z_2 = h(x)+ k$, which concludes
+ the proof of the lemma.
+\end{proof}
+
- $\E(time to query(x)) \leq \E(number of full intervals containing x)\leq
- \sum_{k=1}^\infty\sum_{intervals I, |I|=k, h(x)\in I} \Pr(full)$
- $\sum_{k=1}^\infty k\max_I \Pr(I full)$
- we need to show that the $\max$ is $O(1/k^3)$ so that the sum converges.
- $(m=2n)$, $n=number of diff keys active$.
- $$\Pr(I full) \leq \Pr\big(|L(I)-\E L(I)|\geq \E L(I)\big)$$
- we would like to use Chernoff, but we don't have independence.
- we can raise both sides to the power $6$.
+\begin{proof}[Proof of Proposition~\ref{prop:unfinished}]
+By applying the previous lemma:
+\begin{align*}
+ \E[\text{time of \texttt{query(x)}}]
+ &\leq \E[\text{number of full intervals
+ containing $h(x)$}]\\
+ &\leq \sum_{k\geq 1}\sum_{\substack{I, |I|=k\\h(x)\in I}} \Pr(I\text{
+ full}) \leq \sum_{k\geq 1}k \max_{I, |I|=k, h(x)\in I}\Pr(I\text{ full})
+\end{align*}
+We need to show that the $\max$ in the previous sum is
+$O\left(\frac{1}{k^3}\right)$ so that the sum converges. We assume that $m=2n$
+where $n$ is the number of distinct keys stored. Then, since $\E L(I)
+= \frac{|I|}{2}$:
+\begin{displaymath}
+ \Pr(I\text{ full}) = \Pr\big(L(I)\geq |I|\big)\leq\Pr\big(|L(I)-\E L(I)|\geq \E L(I)\big)
+\end{displaymath}
+Now we would like to apply a Chernoff bound, but we do not have independence.
+Instead, we can raise both sides of the inequality to some power and apply
+Markov's inequality. This is called the \emph{method of moments}. We will
+carry out this computation in the next lecture.
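+Concretely (a preview of next lecture's computation, with the sixth power
+taken from the original notes), Markov's inequality applied to the sixth
+power gives
+\begin{displaymath}
+ \Pr\big(|L(I)-\E L(I)|\geq \E L(I)\big)
+ \leq \frac{\E\big[(L(I)-\E L(I))^6\big]}{(\E L(I))^6},
+\end{displaymath}
+and, roughly speaking, $7$-wise independence of $\H$ is what allows the sixth
+central moment in the numerator to be bounded as if the hash values were
+fully independent.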
\end{proof}
\bibliographystyle{alpha}
-\bibliography{main}
+\begin{thebibliography}{PPR09}
+
+\bibitem[CLRS]{CLRS}
+Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein.
+\newblock {\em Introduction to Algorithms}, 3$^{\text{rd}}$ edition, MIT
+ Press, 2009.
+
+\bibitem[FKS84]{FKS84}
+Michael L. Fredman, J\'{a}nos Koml\'{o}s, and Endre Szemer\'{e}di.
+\newblock Storing a sparse table with $O(1)$ worst case access time.
+\newblock {\em J. ACM}, 31(3):538--544, 1984.
+
+\bibitem[Knu63]{Knuth63}
+Don Knuth.
+\newblock Notes On ``Open'' Addressing, 1963.
+
+\bibitem[PPR09]{ppr}
+Anna Pagh, Rasmus Pagh, and Milan Ruzic.
+\newblock Linear probing with constant independence.
+\newblock {\em {SIAM} J. Comput.}, 39(3):1107--1120, 2009.
+
+\bibitem[PT10]{pt}
+Mihai Patrascu and Mikkel Thorup.
+\newblock On the \emph{k}-independence required by linear probing and minwise
+ independence.
+\newblock In {\em ICALP}, pages 715--726, 2010.
+
+\end{thebibliography}
\end{document}