\documentclass[10pt]{article}
\usepackage{fullpage}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amsfonts,amsthm}
\usepackage[english]{babel}
\usepackage[capitalize, noabbrev]{cleveref}

% these are compressed lists to help fit into a 1 page limit
\newenvironment{enumerate*}%
  {\vspace{-2ex}\begin{enumerate}%
    \setlength{\itemsep}{-1ex}\setlength{\parsep}{0pt}}%
  {\end{enumerate}}
\newenvironment{itemize*}%
  {\vspace{-2ex}\begin{itemize}%
    \setlength{\itemsep}{-1ex}\setlength{\parsep}{0pt}}%
  {\end{itemize}}
\newenvironment{description*}%
  {\vspace{-2ex}\begin{description}%
    \setlength{\itemsep}{-1ex}\setlength{\parsep}{0pt}}%
  {\end{description}}

\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
\newcommand{\llbracket}{[\![}
\newcommand{\eps}{\varepsilon}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\author{Thibaut Horel}
\title{CS 224 Problem Set 2 -- Solutions}

\begin{document}
\maketitle

Collaborators: Eric Balkanski and Jean Pouget-Abadie

\section*{Problem 1}

\paragraph{(a)} Let us consider the function $f$ defined for $x>-1$ by:
\begin{displaymath}
\forall x>-1,\quad f(x) = \frac{(1+x)\log(1+x)-x}{x^2}
\end{displaymath}
Using the fact that $\log(1+x) = x -\frac{x^2}{2} + o(x^2)$ as $x\rightarrow 0$, we have:
\begin{displaymath}
f(x) = \frac{\frac{x^2}{2} + o(x^2)}{x^2}
\end{displaymath}
Hence, $f(x)\rightarrow \frac{1}{2}$ as $x\rightarrow 0$, so $f$ can be continuously extended at $0$; it is continuous everywhere else on $(-1,\infty)$. Let $C$ denote its minimum on the compact interval $[-\frac{1}{2},\frac{1}{2}]$; since $f$ is positive on this interval ($f(x)>0$ for $x\neq 0$ and $f(0)=\frac{1}{2}$), we have $C>0$. Then, for $0< \eps < \frac{1}{2}$:
\begin{displaymath}
(1+\eps)\log(1+\eps) - \eps \geq C\eps^2
\end{displaymath}
and
\begin{displaymath}
(1-\eps)\log(1-\eps) + \eps \geq C\eps^2
\end{displaymath}
Multiplying these two inequalities by $\mu$ and applying the exponential function, they imply:
\begin{equation}\label{eq:1}
\left(\frac{e^\eps}{(1+\eps)^{1+\eps}}\right)^\mu\leq e^{-C\mu\eps^2}
\quad\textrm{and}\quad
\left(\frac{e^{-\eps}}{(1-\eps)^{1-\eps}}\right)^\mu\leq e^{-C\mu\eps^2}
\end{equation}
A union bound and the two Chernoff bounds given in the problem statement give:
\begin{displaymath}
\Pr\big[|X-\mu|\geq \eps\mu\big] \leq \left(\frac{e^\eps}{(1+\eps)^{1+\eps}}\right)^\mu + \left(\frac{e^{-\eps}}{(1-\eps)^{1-\eps}}\right)^\mu
\end{displaymath}
The right-hand side of this inequality is bounded from above by $2e^{-C\mu\eps^2}$ using \cref{eq:1}. This concludes the proof of this question.

\paragraph{(b)} Using the symmetrization trick seen in class, we can write:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right] \leq 2^{2l} \E\left[\left(\sum_i \sigma_i X_i\right)^{2l}\right] = 2^{2l}\sum_{1\leq i_1,\ldots, i_{2l}\leq n}\E\left[\sigma_{i_1}\ldots\sigma_{i_{2l}}\right] \E\left[X_{i_1}\ldots X_{i_{2l}}\right]
\end{displaymath}
As in class, the only non-zero terms in this sum are the ones where each distinct index among $i_1,\ldots, i_{2l}$ appears an even number of times. Furthermore, if there are $k$ such distinct indices, then $\E\left[X_{i_1}\ldots X_{i_{2l}}\right] = p^k$.
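For instance, when $l=2$, the surviving terms are those whose index pattern is either $i_1=i_2=i_3=i_4$ (one distinct index, contributing $\E[\sigma_{i_1}^4]\,\E[X_{i_1}^4] = p$) or a perfect pairing such as $i_1=i_2\neq i_3=i_4$ (two distinct indices, contributing $p^2$); any term in which some index appears an odd number of times vanishes, since the $\sigma_i$ are independent with $\E[\sigma_i]=0$.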
By reordering the sum according to the number $k$ of distinct indices appearing in $i_1,\ldots,i_{2l}$ (there are at most $l$ of them since each is repeated at least twice), we get:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2^{2l} \sum_{k=1}^l p^k m_k
\end{displaymath}
where $m_k$ is the number of ways to choose the indices $i_1,\ldots,i_{2l}$ in $\{1,\ldots,n\}$ such that exactly $k$ distinct values appear, each an even number of times. We can bound $m_k$ by:
\begin{displaymath}
m_k \leq \binom{n}{k} k^{2l}
\end{displaymath}
Indeed, there are $\binom{n}{k}$ ways of choosing the set of $k$ distinct values. Once this set is fixed, each of the positions $i_1,\ldots,i_{2l}$ must be assigned one of those $k$ values, and there are at most $k^{2l}$ such assignments. This implies:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2^{2l} \sum_{k=1}^l p^k\binom{n}{k}k^{2l}
\end{displaymath}
Using the hint, we can bound $\binom{n}{k} \leq \left(\frac{ne}{k}\right)^k$, leading to:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2^{2l} \sum_{k=1}^l p^k\left(\frac{ne}{k}\right)^k k^{2l} \leq (2l)^{2l} \sum_{k=1}^l\left(\frac{nep}{l}\right)^k
\end{displaymath}
the last inequality coming from $k^{2l-k}\leq l^{2l-k}$. Assuming that $\frac{nep}{l}\geq 2$ (this will be satisfied later when we choose $l$), we can bound the geometric series by twice its largest term:
\begin{displaymath}
\E\left[(X-\mu)^{2l}\right]\leq 2\cdot(2l)^{2l} \left(\frac{nep}{l}\right)^l = 2 \left(4\mu el\right)^l
\end{displaymath}
Hence, by Markov's inequality applied to $(X-\mu)^{2l}$:
\begin{displaymath}
\Pr\left[|X-\mu|\geq \eps\mu\right]\leq 2\cdot\left(\frac{4\mu el}{\eps^2\mu^2}\right)^l = 2\cdot\left(\frac{4el}{\eps^2\mu}\right)^l
\end{displaymath}
Choosing $l = \left\lceil\frac{\eps^2\mu}{5e}\right\rceil$, we obtain\footnote{Note that we could possibly have an issue when $\eps^2\mu\leq 5e$, but the Chernoff bound becomes trivial in this case: when $\eps^2\mu$ is bounded by a constant, the bound holds for a small enough constant; we can then take the minimum of this constant and the $C'$ that we are constructing to obtain a global Chernoff bound.}:
\begin{displaymath}
\Pr\left[|X-\mu|\geq \eps\mu\right] \leq 2\cdot\left(\frac{4}{5}\right)^{C\mu\eps^2}
\end{displaymath}
where $C=\frac{1}{5e}$. The right-hand side can be bounded from above by $2e^{-C'\mu\eps^2}$ where $C' = C\log(5/4)>0$. This concludes the proof.

\paragraph{(c)} The case where the $p_i$ are not all the same can be reduced to the previous case by considering the random variables $X_i' = X_i - p_i$. These random variables have mean zero and variance $p_i(1-p_i)\leq p_i$. The previous analysis applies almost verbatim, the only difference being that $\E[X'_{i_1}\ldots X'_{i_{2l}}]$ can now be bounded from above by $p_{j_1}\ldots p_{j_k}$, where $j_1,\ldots, j_k$ are the distinct indices: each appears an even number of times, and since $|X'_j|\leq 1$, the even-order moments of $X'_j$ are bounded by its variance, which is at most $p_j$.

\section*{Problem 2}

\paragraph{(a)} We will use the following key observation: for any fixed $a$, if $x$ is a uniformly random value, then so is $x\oplus a$. This follows immediately from the fact that $x\mapsto x\oplus a$ is a one-to-one mapping.

We first prove that $\mathcal{H}$ is two-wise independent. For two distinct keys $x$ and $y$, there is a position $i$ such that $x_i \neq y_i$; without loss of generality, we can assume that this position is $0$. Conditioned on $h(x)$ (and on the table entries $H_i(y_i)$ for $i\geq 1$), the value $H_0(y_0)$ is still uniformly random, since it is chosen independently of $H_0(x_0)$ and of every other table entry. It then follows from our key observation that $h(y) = H_0(y_0) \oplus \cdots$ is uniformly random, even conditioned on $h(x)$. Since $h(x)$ is itself uniform (it is the XOR of uniformly random table entries), $h(x)$ and $h(y)$ are two independent uniform random variables.
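To make the scheme concrete, here is a minimal Python sketch of a hash function drawn from $\mathcal{H}$ (simple tabulation hashing); the names \texttt{c}, \texttt{char\_bits} and \texttt{m\_bits} are chosen for this illustration only and are not part of the problem statement.
\begin{verbatim}
import random

def make_tabulation_hash(c, char_bits, m_bits):
    # One independent, fully random table H_i per character position,
    # mapping each possible character to a uniform m_bits-bit value.
    tables = [[random.getrandbits(m_bits) for _ in range(2 ** char_bits)]
              for _ in range(c)]
    mask = (1 << char_bits) - 1

    def h(x):
        # h(x) = H_0(x_0) XOR H_1(x_1) XOR ... XOR H_{c-1}(x_{c-1})
        out = 0
        for i in range(c):
            out ^= tables[i][(x >> (i * char_bits)) & mask]
        return out

    return h

h = make_tabulation_hash(c=2, char_bits=8, m_bits=16)
print(h(0x01AB), h(0x02AB))  # two keys differing only in character 1
\end{verbatim}
Each call to \texttt{make\_tabulation\_hash} draws fresh tables, i.e.\ a fresh $h\in\mathcal{H}$.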
We can now prove that $\mathcal{H}$ is three-wise independent. Let us consider three distinct keys $x$, $y$ and $z$. Then one of these keys has a position at which its character differs from the characters of the other two keys at that position. Indeed, assume this is not the case: then at each position, the character of each key is equal to the character of one of the other two keys. For example, we have $x_1 = y_1$ or $x_1 = z_1$; assume wlog that $x_1 = y_1$. But then, since $z_1 = y_1$ or $z_1 = x_1$, we get that $x_1 = y_1 = z_1$. Repeating this argument for each position, we obtain $x=y=z$, a contradiction. Without loss of generality, assume that $z$ has a position at which it differs from both $x$ and $y$, and that this position is $0$. Then, conditioned on $h(x)$ and $h(y)$, $H_0(z_0)$ is still uniformly random, implying that $h(z)$ is uniformly random given $h(x)$ and $h(y)$ (using our key observation again). But we already know that $h(x)$ and $h(y)$ behave like two independent uniform random variables since $\mathcal{H}$ is two-wise independent. This implies that $h(x)$, $h(y)$ and $h(z)$ are three independent uniform variables and concludes our proof.

\paragraph{(b)} We give a counterexample for $u=4$, $c=2$. In this case we write the keys in base two. Let us consider the four distinct keys $x= 01$, $y=10$, $z=11$ and $w=00$, and let us write $a_i = H_0(i)$ and $b_i = H_1(i)$ for $i\in\{0,1\}$. Then we have $h(x) = b_0\oplus a_1$, $h(y) = b_1\oplus a_0$, $h(z) = b_1\oplus a_1$ and $h(w) = b_0\oplus a_0$. But then we see that $h(w) = h(x)\oplus h(y)\oplus h(z)$ (all the terms which appear twice cancel out). Hence, no matter the random choices made by $H_0$ and $H_1$, $h(w)$ is fully determined once $h(x)$, $h(y)$ and $h(z)$ have been observed, so $\mathcal{H}$ is not four-wise independent.

\paragraph{(c)} We first prove the hint.
\begin{lemma}
Any set $S$ of size $k\geq 2$ contains a subset $S'$ of size at least $\log k$ on which $\mathcal{H}$ behaves completely randomly.
\end{lemma}
\begin{proof}
We prove this lemma by strong induction on $k\geq 2$; note also that $\mathcal{H}$ trivially behaves completely randomly on any set of size $1$. For $k=2$, $\log k = 1$ and any singleton subset works. Assume now that the lemma is true for any set of size smaller than $k$, and let us consider a set $S$ of size $k\geq 3$. There exists a position such that not all words in $S$ have the same character at this position (otherwise all words in $S$ would be equal and $S$ would be of size $1$). Wlog, let us assume that this position is $0$. Let us denote by $l\geq 2$ the number of different characters which appear at position $0$ in words of $S$: $l \eqdef |\{x_0: x\in S\}|$. We can now cluster the words of $S$ into $l$ different buckets, each bucket containing all the words sharing the same character at position $0$. Let us denote by $S^*$ the largest of these buckets; we have $|S^*|\geq \frac{k}{l}$. We can now construct a subset $S'$ as follows:
\begin{itemize*}
\item Apply the induction hypothesis to $S^*$ and obtain a subset $T\subseteq S^*$ on which $\mathcal{H}$ behaves completely randomly (if $|S^*|=1$, simply take $T=S^*$).
\item Keep exactly one word in each of the remaining $l-1$ buckets other than $S^*$ to create a second set $T'$.
\item Let $S' = T'\cup T$.
\end{itemize*}
We now prove that $S'$ satisfies the requirements of the lemma:
\begin{itemize*}
\item By the induction hypothesis, $|T| \geq \log\frac{k}{l} = \log k -\log l$. By construction, $|T'| = l-1$ and $T$ and $T'$ are disjoint. Hence $|S'| \geq \log k - \log l + l-1 \geq \log k$, since $l-1\geq \log l$. Thus, $S'$ has the required size.
\item By construction, all the elements in $T'$ have distinct characters at position $0$, which also differ from the (common) character at position $0$ of the words in $T$. Hence, conditioned on the hash values of the words in $T$, $\mathcal{H}$ behaves completely randomly on $T'$, by the same observation as the one we used in \textbf{(a)}. But by the induction hypothesis, $\mathcal{H}$ behaves completely randomly on $T$. As a consequence, $\mathcal{H}$ behaves completely randomly on $S'$.
\end{itemize*}
This shows that the lemma is true for any set of size $k$ and concludes the proof.
\end{proof}

Let us denote by $L$ the longest linked list and by $l$ its length (this is a random variable). Then:
\begin{align*}
\Pr[l\geq k] &\leq \Pr[\exists\, k\text{ elements mapping to $L$}]\\
&\leq \Pr[\exists\, S\text{ s.t. } |S|=\lceil\log k\rceil, \text{ all of $S$ maps to $L$ and $\mathcal{H}$ is completely random over $S$}]
\end{align*}
where the second inequality follows from the lemma (keeping only $\lceil\log k\rceil$ elements of the subset it provides). Now applying a union bound:
\begin{align*}
\Pr[l\geq k] &\leq \sum_{S,\, |S| = \log k}\Pr[S\text{ maps to a common bucket and $\mathcal{H}$ random over $S$}]\\
&\leq\binom{n}{\log k}m\frac{1}{m^{\log k}} \leq\frac{n^{\log k}}{(\log k)!}\frac{1}{m^{\log k -1}}
\end{align*}
Using $m> n^{1.01}$, we get:
\begin{displaymath}
\Pr[l\geq k] \leq \frac{n^{-0.01\log k + 1.01}}{(\log k)!}
\end{displaymath}
By taking $k$ large enough that $-0.01\log k + 1.01$ becomes negative, the numerator becomes smaller than $1$ (regardless of $n$). Consequently, there exists a universal constant $K$ such that:
\begin{displaymath}
\Pr[l\geq K]\leq \frac{1}{3}
\end{displaymath}
That is, the probability that the longest list has length at least $K$ is at most $\frac{1}{3}$. Hence, with probability at least $\frac{2}{3}$, all the linked lists have length less than $K = O(1)$.

\section*{Problem 3}

\paragraph{(a)} We define our estimator $Z$ as the standard unbiased estimator of $\Pr[h(A) = h(B)]$ for $h$ uniformly random: the fraction of indices $1\leq i\leq k$ for which $h_i(A)$ agrees with $h_i(B)$. Formally:
\begin{displaymath}
\boxed{Z \eqdef \frac{1}{k}\sum_{i=1}^k \mathbf{1}\{h_i(A) = h_i(B)\}}
\end{displaymath}
Let us write $X_i = \mathbf{1}\{h_i(A) = h_i(B)\}$. We have $\E[X_i] = \Pr_{h\sim\mathcal{U}}[h(A) = h(B)] = J(A, B)\eqdef p$. Hence:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps] = \Pr\left[\left|\sum_{i=1}^k (X_i - p)\right|\geq k\eps\right]
\end{displaymath}
Using Chebyshev's inequality (moment method of order 2):
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps] \leq\frac{1}{(k\eps)^2}\E\left[\left(\sum_{i=1}^k(X_i - p)\right)^2\right]
\end{displaymath}
We have:
\begin{displaymath}
\E\left[\left(\sum_{i=1}^k(X_i - p)\right)^2\right] = \sum_{1\leq i,j\leq k}\E\big[(X_i-p)(X_j-p)\big]
\end{displaymath}
By pairwise independence of $X_1,\ldots, X_k$, and since $X_i-p$ has mean zero, we have that $\E\big[(X_i-p)(X_j-p)\big] = 0$ whenever $i\neq j$. Hence:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps] \leq \frac{1}{(k\eps)^2}\sum_{i=1}^k\E[(X_i-p)^2]\leq \frac{1}{4k\eps^2}
\end{displaymath}
where the last inequality uses $\E[(X_i-p)^2] = p(1-p)\leq \frac{1}{4}$. Hence, choosing $k=\left\lceil\frac{3}{4\eps^2}\right\rceil$ is enough to obtain:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps]\leq \frac{1}{3}
\end{displaymath}

\paragraph{(b)}
\begin{itemize*}
\item If we want a failure probability of at most $\delta$, the previous analysis shows that we can choose $k=\left\lceil\frac{1}{4\eps^2\delta}\right\rceil$.
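For example, with $\eps = 0.1$ and $\delta = 0.01$, this gives $k = 2500$ hash functions, compared to $k=75$ for the constant failure probability $\frac{1}{3}$ of part \textbf{(a)}.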
\item If the hash functions $h_i$ were drawn fully independently, then we would be able to apply the Chernoff bound from Problem 1 (with $\mu = kp$ and deviation $\frac{\eps}{p}\mu = k\eps$) and obtain:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps]\leq 2e^{-C\eps^2\frac{k}{p}}\leq2e^{-C\eps^2k}
\end{displaymath}
Hence, choosing $k=\left\lceil\frac{1}{C\eps^2}\log\frac{2}{\delta}\right\rceil$ yields a failure probability of at most $\delta$.
\item We note that if the median of $Z_1,\ldots, Z_t$ (denoted by $Z$) is at distance more than $\eps$ from $J(A,B)$, then at least half of the $Z_i$'s are at distance at least $\eps$ from $J(A,B)$. By part \textbf{(a)}, each $Z_i$ is at distance at least $\eps$ from $J(A,B)$ with probability at most $\frac{1}{3}$, and these events are independent across $i$. Denoting by $N$ the number of such ``failed'' estimates, Markov's inequality applied to $2^N$ gives:
\begin{displaymath}
\Pr[|Z-J(A,B)|\geq \eps]\leq\Pr\left[N\geq \frac{t}{2}\right]\leq \frac{\E\left[2^N\right]}{2^{t/2}}\leq \frac{(4/3)^t}{2^{t/2}} = \left(\frac{2\sqrt{2}}{3}\right)^t
\end{displaymath}
where we used the independence of the $Z_i$'s and $\E\big[2^{\mathbf{1}\{Z_i\text{ fails}\}}\big]\leq \frac{2}{3} + 2\cdot\frac{1}{3} = \frac{4}{3}$. Since $\frac{2\sqrt{2}}{3}<1$, choosing:
\begin{displaymath}
t = \left\lceil\frac{\log\frac{1}{\delta}}{\log\frac{3}{2\sqrt{2}}}\right\rceil = O\left(\log\frac{1}{\delta}\right)
\end{displaymath}
gives a probability of failure of at most $\delta$.
\end{itemize*}

\section*{Problem 4}
Time spent: Problem 1 (4 hours), Problem 2 (3 hours), Problem 3 (2 hours). \textbf{Total}: 9 hours.

\end{document}