\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}

% these are compressed lists to help fit into a 1 page limit
\newenvironment{enumerate*}%
  {\vspace{-2ex}\begin{enumerate}%
    \setlength{\itemsep}{-1ex}\setlength{\parsep}{0pt}}%
  {\end{enumerate}}
\newenvironment{itemize*}%
  {\vspace{-2ex}\begin{itemize}%
    \setlength{\itemsep}{-1ex}\setlength{\parsep}{0pt}}%
  {\end{itemize}}
\newenvironment{description*}%
  {\vspace{-2ex}\begin{description}%
    \setlength{\itemsep}{-1ex}\setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}
\newcommand{\eps}{\varepsilon}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\author{Thibaut Horel}
\title{CS 224 Problem Set 2 -- Solutions}

\begin{document}
\maketitle

Collaborators: Eric Balkanski and Jean Pouget-Abadie

\section*{Problem 1}

\paragraph{(a)} Let us consider the function $f$ defined for $x>-1$ by:
\begin{displaymath}
\forall x>-1,\quad f(x) = \frac{(1+x)\log(1+x)-x}{x^2}
\end{displaymath}
Using the fact that $\log(1+x) = x -\frac{x^2}{2} + o(x^2)$ as $x\rightarrow 0$, we have:
\begin{displaymath}
f(x) = \frac{\frac{x^2}{2} + o(x^2)}{x^2}
\end{displaymath}
Hence, $f(x)\rightarrow \frac{1}{2}$ as $x\rightarrow 0$, so $f$ can be continuously extended at $0$; it is continuous everywhere else on $(-1,\infty)$. Let $C$ denote its minimum on the compact interval $[-\frac{1}{2},\frac{1}{2}]$; since $f$ is positive on this interval ($f(x)>0$ for $x\neq 0$ and $f(0)=\frac{1}{2}$), we have $C>0$. Then, for $0< \eps < \frac{1}{2}$:
\begin{displaymath}
(1+\eps)\log(1+\eps) - \eps \geq C\eps^2
\end{displaymath}
and
\begin{displaymath}
(1-\eps)\log(1-\eps) + \eps \geq C\eps^2
\end{displaymath}
Multiplying these two inequalities by $\mu$ and applying the exponential function, they imply:
\begin{equation}\label{eq:1}
\left(\frac{e^\eps}{(1+\eps)^{1+\eps}}\right)^\mu\leq e^{-C\mu\eps^2}
\quad\textrm{and}\quad
\left(\frac{e^{-\eps}}{(1-\eps)^{1-\eps}}\right)^\mu\leq e^{-C\mu\eps^2}
\end{equation}
A union bound and the two Chernoff bounds given in the problem statement give:
\begin{displaymath}
\Pr\big[|X-\mu|\geq \eps\mu\big] \leq \left(\frac{e^\eps}{(1+\eps)^{1+\eps}}\right)^\mu + \left(\frac{e^{-\eps}}{(1-\eps)^{1-\eps}}\right)^\mu
\end{displaymath}
The right-hand side of this inequality is bounded from above by $2e^{-C\mu\eps^2}$ using \cref{eq:1}. This concludes the proof of this question.

\paragraph{(b)} Using the symmetrization trick seen in class, we can write:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right] \leq 2^{2l} \E\left[\left(\sum_i \sigma_i X_i\right)^{2l}\right] = 2^{2l}\sum_{1\leq i_1,\ldots, i_{2l}\leq n}\E\left[\sigma_{i_1}\ldots\sigma_{i_{2l}}\right] \E\left[X_{i_1}\ldots X_{i_{2l}}\right]
\end{displaymath}
As in class, the only non-zero terms in this sum are the ones where each distinct index among $i_1,\ldots, i_{2l}$ appears an even number of times. Furthermore, if there are $k$ such distinct indices, then $\E\left[X_{i_1}\ldots X_{i_{2l}}\right] = p^k$.
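For instance, when $l=2$, the surviving terms are those whose index pattern is either $i_1=i_2=i_3=i_4$ (one distinct index, contributing $\E[\sigma_{i_1}^4]\,\E[X_{i_1}^4] = p$) or a perfect pairing such as $i_1=i_2\neq i_3=i_4$ (two distinct indices, contributing $p^2$); any term in which some index appears an odd number of times vanishes, since the $\sigma_i$ are independent with $\E[\sigma_i]=0$.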
By reordering the sum according to the number $k$ of distinct indices appearing in $i_1,\ldots,i_{2l}$ (there are at most $l$ of them since each is repeated at least twice), we get:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2^{2l} \sum_{k=1}^l p^k m_k
\end{displaymath}
where $m_k$ is the number of ways to choose the indices $i_1,\ldots,i_{2l}$ in $\{1,\ldots,n\}$ such that exactly $k$ distinct values appear, each an even number of times. We can bound $m_k$ by:
\begin{displaymath}
m_k \leq \binom{n}{k} k^{2l}
\end{displaymath}
Indeed, there are $\binom{n}{k}$ ways of choosing the set of $k$ distinct values. Once this set is fixed, each of the positions $i_1,\ldots,i_{2l}$ must be assigned one of those $k$ values, and there are at most $k^{2l}$ such assignments. This implies:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2^{2l} \sum_{k=1}^l p^k\binom{n}{k}k^{2l}
\end{displaymath}
Using the hint, we can bound $\binom{n}{k} \leq \left(\frac{ne}{k}\right)^k$, leading to:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2^{2l} \sum_{k=1}^l p^k\left(\frac{ne}{k}\right)^k k^{2l} \leq (2l)^{2l} \sum_{k=1}^l\left(\frac{nep}{l}\right)^k
\end{displaymath}
the last inequality coming from $k^{2l-k}\leq l^{2l-k}$. Assuming that $\frac{nep}{l}\geq 2$ (this will be satisfied later when we choose $l$), we can bound the geometric series by twice its largest term:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2\cdot(2l)^{2l} \left(\frac{nep}{l}\right)^l = 2 \left(4\mu el\right)^l
\end{displaymath}
Hence, by Markov's inequality applied to $(X-\mu)^{2l}$:
\begin{displaymath}
\Pr\left[|X-\mu|\geq \eps\mu\right]\leq 2\cdot\left(\frac{4\mu el}{\eps^2\mu^2}\right)^l = 2\cdot\left(\frac{4el}{\eps^2\mu}\right)^l
\end{displaymath}
Choosing $l = \left\lceil\frac{\eps^2\mu}{5e}\right\rceil$, we obtain\footnote{Note that we could possibly have an issue when $\eps^2\mu\leq 5e$, but the Chernoff bound becomes trivial in this case: when $\eps^2\mu$ is bounded by a constant, the bound holds for a small enough constant; we can then take the minimum of this constant and the $C'$ that we are constructing to obtain a global Chernoff bound.}:
\begin{displaymath}
\Pr\left[|X-\mu|\geq \eps\mu\right] \leq 2\cdot\left(\frac{4}{5}\right)^{C\mu\eps^2}
\end{displaymath}
where $C=\frac{1}{5e}$. The right-hand side can be bounded from above by $2e^{-C'\mu\eps^2}$ where $C' = C\log(5/4)>0$. This concludes the proof.

\paragraph{(c)} The case where the $p_i$ are not all the same can be reduced to the previous case by considering the random variables $X_i' = X_i - p_i$. These random variables have mean zero and variance $p_i(1-p_i)\leq p_i$. The previous analysis applies almost verbatim, the only difference being that $\E[X'_{i_1}\ldots X'_{i_{2l}}]$ can now be bounded from above by $p_{j_1}\ldots p_{j_k}$, where $j_1,\ldots, j_k$ are the distinct indices: each appears an even number of times, and since $|X'_j|\leq 1$, the even-order moments of $X'_j$ are bounded by its variance, which is at most $p_j$.

\section*{Problem 2}

\paragraph{(a)} We will use the following key observation: for any fixed $a$, if $x$ is a uniformly random value, then so is $x\oplus a$. This follows immediately from the fact that $x\mapsto x\oplus a$ is a one-to-one mapping.

We first prove that $\mathcal{H}$ is two-wise independent. For two distinct keys $x$ and $y$, there is a position $i$ such that $x_i \neq y_i$; without loss of generality, we can assume that this position is $0$. Conditioned on $h(x)$ (and on the table entries $H_i(y_i)$ for $i\geq 1$), the value $H_0(y_0)$ is still uniformly random, since it is chosen independently of $H_0(x_0)$ and of every other table entry. It then follows from our key observation that $h(y) = H_0(y_0) \oplus \cdots$ is uniformly random, even conditioned on $h(x)$. Since $h(x)$ is itself uniform (it is the XOR of uniformly random table entries), $h(x)$ and $h(y)$ are two independent uniform random variables.
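To make the scheme concrete, here is a minimal Python sketch of a hash function drawn from $\mathcal{H}$ (simple tabulation hashing); the names \texttt{c}, \texttt{char\_bits} and \texttt{m\_bits} are chosen for this illustration only and are not part of the problem statement.
\begin{verbatim}
import random

def make_tabulation_hash(c, char_bits, m_bits):
    # One independent, fully random table H_i per character position,
    # mapping each possible character to a uniform m_bits-bit value.
    tables = [[random.getrandbits(m_bits) for _ in range(2 ** char_bits)]
              for _ in range(c)]
    mask = (1 << char_bits) - 1

    def h(x):
        # h(x) = H_0(x_0) XOR H_1(x_1) XOR ... XOR H_{c-1}(x_{c-1})
        out = 0
        for i in range(c):
            out ^= tables[i][(x >> (i * char_bits)) & mask]
        return out

    return h

h = make_tabulation_hash(c=2, char_bits=8, m_bits=16)
print(h(0x01AB), h(0x02AB))  # two keys differing only in character 1
\end{verbatim}
Each call to \texttt{make\_tabulation\_hash} draws fresh tables, i.e.\ a fresh $h\in\mathcal{H}$.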
We can now prove that $\mathcal{H}$ is three-wise independent. Let us consider three distinct keys $x$, $y$ and $z$. Then one of these keys has a position at which its character differs from the characters of the other two keys at that position. Indeed, assume this is not the case: then at each position, the character of each key is equal to the character of one of the other two keys. For example, we have $x_1 = y_1$ or $x_1 = z_1$; assume wlog that $x_1 = y_1$. But then, since $z_1 = y_1$ or $z_1 = x_1$, we get that $x_1 = y_1 = z_1$. Repeating this argument for each position, we obtain $x=y=z$, a contradiction. Without loss of generality, assume that $z$ has a position at which it differs from both $x$ and $y$, and that this position is $0$. Then, conditioned on $h(x)$ and $h(y)$, $H_0(z_0)$ is still uniformly random, implying that $h(z)$ is uniformly random given $h(x)$ and $h(y)$ (using our key observation again). But we already know that $h(x)$ and $h(y)$ behave like two independent uniform random variables since $\mathcal{H}$ is two-wise independent. This implies that $h(x)$, $h(y)$ and $h(z)$ are three independent uniform variables and concludes our proof.

\paragraph{(b)} We give a counterexample for $u=4$, $c=2$. In this case we write the keys in base two. Let us consider the four distinct keys $x= 01$, $y=10$, $z=11$ and $w=00$, and let us write $a_i = H_0(i)$ and $b_i = H_1(i)$ for $i\in\{0,1\}$. Then we have $h(x) = b_0\oplus a_1$, $h(y) = b_1\oplus a_0$, $h(z) = b_1\oplus a_1$ and $h(w) = b_0\oplus a_0$. But then we see that $h(w) = h(x)\oplus h(y)\oplus h(z)$ (all the terms which appear twice cancel out). Hence, no matter the random choices made by $H_0$ and $H_1$, $h(w)$ is fully determined once $h(x)$, $h(y)$ and $h(z)$ have been observed, so $\mathcal{H}$ is not four-wise independent.

\paragraph{(c)} We first prove the hint.
\begin{lemma}
Any set $S$ of size $k\geq 2$ contains a subset $S'$ of size at least $\log k$ on which $\mathcal{H}$ behaves completely randomly.
\end{lemma}
\begin{proof}
We prove this lemma by strong induction on $k\geq 2$; note also that $\mathcal{H}$ trivially behaves completely randomly on any set of size $1$. For $k=2$, $\log k = 1$ and any singleton subset works. Assume now that the lemma is true for any set of size smaller than $k$, and let us consider a set $S$ of size $k\geq 3$. There exists a position such that not all words in $S$ have the same character at this position (otherwise all words in $S$ would be equal and $S$ would be of size $1$). Wlog, let us assume that this position is $0$. Let us denote by $l\geq 2$ the number of different characters which appear at position $0$ in words of $S$: $l \eqdef |\{x_0: x\in S\}|$. We can now cluster the words of $S$ into $l$ different buckets, each bucket containing all the words sharing the same character at position $0$. Let us denote by $S^*$ the largest of these buckets; we have $|S^*|\geq \frac{k}{l}$. We can now construct a subset $S'$ as follows:
\begin{itemize*}
\item Apply the induction hypothesis to $S^*$ and obtain a subset $T\subseteq S^*$ on which $\mathcal{H}$ behaves completely randomly (if $|S^*|=1$, simply take $T=S^*$).
\item Keep exactly one word in each of the remaining $l-1$ buckets other than $S^*$ to create a second set $T'$.
\item Let $S' = T'\cup T$.
\end{itemize*}
We now prove that $S'$ satisfies the requirements of the lemma:
\begin{itemize*}
\item By the induction hypothesis, $|T| \geq \log\frac{k}{l} = \log k -\log l$. By construction, $|T'| = l-1$ and $T$ and $T'$ are disjoint. Hence $|S'| \geq \log k - \log l + l-1 \geq \log k$, since $l-1\geq \log l$. Thus, $S'$ has the required size.
\item By construction, all the elements in $T'$ have distinct characters at position $0$, which also differ from the (common) character at position $0$ of the words in $T$. Hence, conditioned on the hash values of the words in $T$, $\mathcal{H}$ behaves completely randomly on $T'$, by the same observation as the one we used in \textbf{(a)}. But by the induction hypothesis, $\mathcal{H}$ behaves completely randomly on $T$. As a consequence, $\mathcal{H}$ behaves completely randomly on $S'$.
\end{itemize*}
This shows that the lemma is true for any set of size $k$ and concludes the proof.
\end{proof}

Let us denote by $L$ the longest linked list and by $l$ its length (this is a random variable). Then:
\begin{align*}
\Pr[l\geq k] &\leq \Pr[\exists\, k\text{ elements mapping to $L$}]\\
&\leq \Pr[\exists\, S\text{ s.t. } |S|=\lceil\log k\rceil, \text{ all of $S$ maps to $L$ and $\mathcal{H}$ is completely random over $S$}]
\end{align*}
where the second inequality follows from the lemma (keeping only $\lceil\log k\rceil$ elements of the subset it provides). Now applying a union bound:
\begin{align*}
\Pr[l\geq k] &\leq \sum_{S,\, |S| = \log k}\Pr[S\text{ maps to a common bucket and $\mathcal{H}$ random over $S$}]\\
&\leq\binom{n}{\log k}m\frac{1}{m^{\log k}} \leq\frac{n^{\log k}}{(\log k)!}\frac{1}{m^{\log k -1}}
\end{align*}
Using $m> n^{1.01}$, we get:
\begin{displaymath}
\Pr[l\geq k] \leq \frac{n^{-0.01\log k + 1.01}}{(\log k)!}
\end{displaymath}
By taking $k$ large enough that $-0.01\log k + 1.01$ becomes negative, the numerator becomes smaller than $1$ (regardless of $n$). Consequently, there exists a universal constant $K$ such that:
\begin{displaymath}
\Pr[l\geq K]\leq \frac{1}{3}
\end{displaymath}
That is, the probability that the longest list has length at least $K$ is at most $\frac{1}{3}$. Hence, with probability at least $\frac{2}{3}$, all the linked lists have length less than $K = O(1)$.

\section*{Problem 3}

\paragraph{(a)} We define our estimator $Z$ as the standard unbiased estimator of $\Pr[h(A) = h(B)]$ for $h$ uniformly random: the fraction of indices $1\leq i\leq k$ for which $h_i(A)$ agrees with $h_i(B)$. Formally:
\begin{displaymath}
\boxed{Z \eqdef \frac{1}{k}\sum_{i=1}^k \mathbf{1}\{h_i(A) = h_i(B)\}}
\end{displaymath}
Let us write $X_i = \mathbf{1}\{h_i(A) = h_i(B)\}$. We have $\E[X_i] = \Pr_{h\sim\mathcal{U}}[h(A) = h(B)] = J(A, B)\eqdef p$. Hence:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps] = \Pr\left[\left|\sum_{i=1}^k (X_i - p)\right|\geq k\eps\right]
\end{displaymath}
Using Chebyshev's inequality (moment method of order 2):
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps] \leq\frac{1}{(k\eps)^2}\E\left[\left(\sum_{i=1}^k(X_i - p)\right)^2\right]
\end{displaymath}
We have:
\begin{displaymath}
\E\left[\left(\sum_{i=1}^k(X_i - p)\right)^2\right] = \sum_{1\leq i,j\leq k}\E\big[(X_i-p)(X_j-p)\big]
\end{displaymath}
By pairwise independence of $X_1,\ldots, X_k$, and since $X_i-p$ has mean zero, we have that $\E\big[(X_i-p)(X_j-p)\big] = 0$ whenever $i\neq j$. Hence:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps] \leq \frac{1}{(k\eps)^2}\sum_{i=1}^k\E[(X_i-p)^2]\leq \frac{1}{4k\eps^2}
\end{displaymath}
where the last inequality uses $\E[(X_i-p)^2] = p(1-p)\leq \frac{1}{4}$. Hence, choosing $k=\left\lceil\frac{3}{4\eps^2}\right\rceil$ is enough to obtain:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps]\leq \frac{1}{3}
\end{displaymath}

\paragraph{(b)}
\begin{itemize*}
\item If we want a failure probability of at most $\delta$, the previous analysis shows that we can choose $k=\left\lceil\frac{1}{4\eps^2\delta}\right\rceil$.
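For example, with $\eps = 0.1$ and $\delta = 0.01$, this gives $k = 2500$ hash functions, compared to $k=75$ for the constant failure probability $\frac{1}{3}$ of part \textbf{(a)}.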
\item If the hash functions $h_i$ were drawn fully independently, then we would be able to apply the Chernoff bound from Problem 1 (with $\mu = kp$ and deviation $\frac{\eps}{p}\mu = k\eps$) and obtain:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps]\leq 2e^{-C\eps^2\frac{k}{p}}\leq2e^{-C\eps^2k}
\end{displaymath}
Hence, choosing $k=\left\lceil\frac{1}{C\eps^2}\log\frac{2}{\delta}\right\rceil$ yields a failure probability of at most $\delta$.
\item We note that if the median of $Z_1,\ldots, Z_t$ (denoted by $Z$) is at distance more than $\eps$ from $J(A,B)$, then at least half of the $Z_i$'s are at distance at least $\eps$ from $J(A,B)$. By part \textbf{(a)}, each $Z_i$ is at distance at least $\eps$ from $J(A,B)$ with probability at most $\frac{1}{3}$, and these events are independent across $i$. Denoting by $N$ the number of such ``failed'' estimates, Markov's inequality applied to $2^N$ gives:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps]\leq\Pr\left[N\geq \frac{t}{2}\right]\leq \frac{\E\left[2^N\right]}{2^{t/2}}\leq \frac{(4/3)^t}{2^{t/2}} = \left(\frac{2\sqrt{2}}{3}\right)^t
\end{displaymath}
where we used the independence of the $Z_i$'s and $\E\big[2^{\mathbf{1}\{Z_i\text{ fails}\}}\big]\leq \frac{2}{3} + 2\cdot\frac{1}{3} = \frac{4}{3}$. Since $\frac{2\sqrt{2}}{3}<1$, choosing:
\begin{displaymath}
t = \left\lceil\frac{\log\frac{1}{\delta}}{\log\frac{3}{2\sqrt{2}}}\right\rceil = O\left(\log\frac{1}{\delta}\right)
\end{displaymath}
gives a probability of failure of at most $\delta$.
\end{itemize*}

\section*{Problem 4}
Time spent: Problem 1 (4 hours), Problem 2 (3 hours), Problem 3 (2 hours). \textbf{Total}: 9 hours.

\end{document}