author    Thibaut Horel <thibaut.horel@gmail.com>    2014-09-24 22:49:53 -0400
committer Thibaut Horel <thibaut.horel@gmail.com>    2014-09-24 22:49:53 -0400
commit    19cb1a6d624db0b5ab903d983916a8116b3c0a9f (patch)
tree      26bea5234c4fd0ba32f374abac83e657fafe3738
parent    ad3232936c06880e6465ddbd9f1e9e75c314d9e7 (diff)
download  cs224-19cb1a6d624db0b5ab903d983916a8116b3c0a9f.tar.gz
[ps2] Initial commit
-rw-r--r--  ps2/main.tex  313
1 file changed, 313 insertions(+), 0 deletions(-)
diff --git a/ps2/main.tex b/ps2/main.tex
new file mode 100644
index 0000000..45638ab
--- /dev/null
+++ b/ps2/main.tex
@@ -0,0 +1,313 @@
+\documentclass[10pt]{article}
+\usepackage{fullpage}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amsfonts,amsthm}
+\usepackage[english]{babel}
+\usepackage[capitalize, noabbrev]{cleveref}
+
+% these are compressed lists to help fit into a 1-page limit
+\newenvironment{enumerate*}%
+ {\vspace{-2ex} \begin{enumerate} %
+ \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+ {\end{enumerate}}
+
+\newenvironment{itemize*}%
+ {\vspace{-2ex} \begin{itemize} %
+ \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+ {\end{itemize}}
+
+\newenvironment{description*}%
+ {\vspace{-2ex} \begin{description} %
+ \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+ {\end{description}}
+
+\DeclareMathOperator*{\E}{\mathbb{E}}
+\let\Pr\relax
+\DeclareMathOperator*{\Pr}{\mathbb{P}}
+
+\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
+\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
+\newcommand{\llbracket}{[\![}
+\newcommand{\eps}{\varepsilon}
+
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}{Lemma}
+
+\author{Thibaut Horel}
+\title{CS 229r Problem Set 2 -- Solutions}
+
+\begin{document}
+
+\maketitle
+\section*{Problem 1}
+
+\paragraph{(a)} Let us consider the function $f$ defined for $x>-1$, $x\neq
+0$, by:
+\begin{displaymath}
+  f(x) = \frac{(1+x)\log(1+x)-x}{x^2}
+\end{displaymath}
+Using the fact that $\log(1+x) = x -\frac{x^2}{2} + o(x^2)$ as $x\rightarrow
+0$, we have:
+\begin{displaymath}
+ f(x) = \frac{{\frac{x^2}{2} + o(x^2)}}{x^2}
+\end{displaymath}
+Hence, $f(x)\rightarrow \frac{1}{2}$ as $x\rightarrow 0$, so $f$ can be
+continuously extended at $0$ by setting $f(0)=\frac{1}{2}$. Being continuous
+on the compact interval $[-\frac{1}{2},\frac{1}{2}]$, $f$ attains a minimum
+$C$ there. Moreover $C>0$: the derivative of $g(x)\eqdef(1+x)\log(1+x)-x$ is
+$\log(1+x)$, so $g$ has a strict minimum at $0$ with $g(0)=0$, hence $f>0$ on
+$[-\frac{1}{2},\frac{1}{2}]\setminus\{0\}$, and $f(0)=\frac{1}{2}>0$.
+Applying the definition of $C$ at $x=\eps$ and $x=-\eps$, we get for
+$0< \eps < \frac{1}{2}$:
+and
+\begin{displaymath}
+ (1-\eps)\log(1-\eps) + \eps \geq C\eps^2
+\end{displaymath}
+By applying the exponential function, these two inequalities imply:
+\begin{equation}\label{eq:1}
+  \frac{e^\eps}{(1+\eps)^{1+\eps}}\leq e^{-C\eps^2}
+  \quad\textrm{and}\quad
+  \frac{e^{-\eps}}{(1-\eps)^{1-\eps}}\leq e^{-C\eps^2}
+\end{equation}
+A union bound and the two Chernoff bounds given in the problem statement give:
+\begin{displaymath}
+  \Pr\big[|X-\mu|\geq \eps\mu\big] \leq
+  \left(\frac{e^\eps}{(1+\eps)^{1+\eps}}\right)^\mu +
+  \left(\frac{e^{-\eps}}{(1-\eps)^{1-\eps}}\right)^\mu
+\end{displaymath}
+Raising both inequalities of \cref{eq:1} to the power $\mu$, the right-hand
+side is bounded from above by $2e^{-C\mu\eps^2}$. This concludes the proof of
+this question.
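+Numerically, one can check that $f$ is decreasing on
+$[-\frac{1}{2},\frac{1}{2}]$, so that $C = f(\frac{1}{2}) = 6\log\frac{3}{2}
+- 2 \approx 0.43$; only $C>0$ matters for the statement.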
+
+\paragraph{(b)} Using the symmetrization trick seen in class, we can write:
+\begin{displaymath}
+ \E\left[(X-\mu)^{2l}\right]
+ \leq 2^{2l} \E\left[\left(\sum_i \sigma_i X_i\right)^{2l}\right]
+ = 2^{2l}\sum_{1\leq i_1,\ldots, i_{2l}\leq n}\E\left[\sigma_{i_1}\ldots\sigma_{i_{2l}}\right]
+ \E\left[X_{i_1}\ldots X_{i_{2l}}\right]
+\end{displaymath}
+As in class, the only non-zero terms in this sum are those where each
+distinct index among $i_1,\ldots, i_{2l}$ appears an even number of times.
+Furthermore, if $k$ distinct indices appear, then $\E\left[X_{i_1}\ldots
+X_{i_{2l}}\right] = p^k$: the $X_i$ are independent indicator variables, so
+every power of $X_i$ has expectation $p$. Grouping the terms according to the
+number $k$ of distinct indices (at most $l$, since each index is repeated at
+least twice), we get:
+\begin{displaymath}
+ \E\left[(X-\mu)^{2l}\right]\leq 2^{2l}
+ \sum_{k=1}^l p^k m_k
+\end{displaymath}
+where $m_k$ is the number of ways to choose the indices $i_1,\ldots,i_{2l}$
+in $\{1,\ldots,n\}$ so that exactly $k$ distinct values appear, each repeated
+an even number of times. We can bound $m_k$:
+\begin{displaymath}
+ m_k \leq \binom{n}{k} k^{2l}
+\end{displaymath}
+indeed, there are $\binom{n}{k}$ ways of choosing the set of $k$ distinct
+indices. Once this set is fixed, each of $i_1,\ldots,i_{2l}$ must map to one
+of those $k$ indices, and there are at most $k^{2l}$ such mappings.
+This implies:
+\begin{displaymath}
+ \E\left[(X-\mu)^{2l}\right]\leq 2^{2l}
+ \sum_{k=1}^l p^k\binom{n}{k}k^{2l}
+\end{displaymath}
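+As a quick sanity check, take $l=1$: only the terms with $i_1 = i_2$ survive,
+so $k=1$ and the bound reads $\E[(X-\mu)^2]\leq 4np = 4\mu$, consistent (up
+to the factor $4$ lost in the symmetrization) with $\mathrm{Var}(X)
+= np(1-p)\leq\mu$.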
+Using the hint, we can bound $\binom{n}{k} \leq \left(\frac{ne}{k}\right)^k$
+leading to:
+\begin{displaymath}
+ \E\left[(X-\mu)^{2l}\right]\leq 2^{2l}
+ \sum_{k=1}^l p^k\left(\frac{ne}{k}\right)^k k^{2l}
+ \leq (2l)^{2l} \sum_{k=1}^l\left(\frac{nep}{l}\right)^k
+\end{displaymath}
+the last inequality coming from $k^{2l-k}\leq l^{2l-k}$, valid since $k\leq
+l$. Assuming that $\frac{nep}{l}\geq 2$ (this will be satisfied later when we
+choose $l$), we can bound the geometric series by twice its largest term:
+\begin{displaymath}
+ \E\left[(X-\mu)^{2l}\right]\leq 2\cdot(2l)^{2l}
+ \left(\frac{nep}{l}\right)^l =
+ 2 \left(4\mu el\right)^l
+\end{displaymath}
+Hence, applying Markov's inequality to the non-negative variable $(X-\mu)^{2l}$:
+\begin{displaymath}
+ \Pr\left[|X-\mu|\geq \eps\mu\right]\leq
+ 2\cdot\left(\frac{4\mu el}{\eps^2\mu^2}\right)^l =
+ 2\cdot\left(\frac{4el}{\eps^2\mu}\right)^l
+\end{displaymath}
+Choosing $l = \left\lfloor\frac{\eps^2\mu}{5e}\right\rfloor$, so that
+$\frac{4el}{\eps^2\mu}\leq\frac{4}{5}$, we obtain\footnote{Note that this
+  choice requires $\eps^2\mu\geq 5e$ (so that $l\geq 1$), but the Chernoff
+bound becomes trivial otherwise: if $\eps^2\mu$ is bounded by a constant,
+there exists a constant for which the Chernoff bound is true, and we can then
+take the minimum of this constant and of the $C'$ that we are constructing to
+obtain a global Chernoff bound.}:
+\begin{displaymath}
+  \Pr\left[|X-\mu|\geq \eps\mu\right]
+  \leq
+  2\cdot\left(\frac{4}{5}\right)^{C\mu\eps^2}
+\end{displaymath}
+where $C=\frac{1}{10e}$, using $\lfloor t\rfloor\geq\frac{t}{2}$ for $t\geq
+1$. The right-hand side can be bounded from above by $2e^{-C'\mu\eps^2}$
+where $C' = C\log(5/4)>0$. This concludes the proof.
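+
+As a sanity check (not part of the proof), a small Python simulation can
+compare the empirical tail with the bound $2e^{-C'\mu\eps^2}$; the parameters
+\texttt{n}, \texttt{p} and \texttt{eps} below are arbitrary illustrative
+choices:
+\begin{verbatim}
+import numpy as np
+
+n, p, eps = 100_000, 0.01, 0.5         # illustrative parameters
+mu = n * p
+C_prime = np.log(5 / 4) / (10 * np.e)  # the constant C' derived above
+
+samples = np.random.binomial(n, p, size=200_000)  # 200,000 draws of X
+empirical = np.mean(np.abs(samples - mu) >= eps * mu)
+
+print("empirical tail:", empirical)
+print("proved bound  :", 2 * np.exp(-C_prime * mu * eps ** 2))
+\end{verbatim}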
+
+
+\section*{Problem 2}
+
+\paragraph{(a)} We will use the following key observation: for any fixed $a$,
+if $x$ is a uniform random variable, then so is $x\oplus a$. This follows
+immediately from the fact that $x\mapsto x\oplus a$ is a one-to-one mapping.
+
+We first prove that $\mathcal{H}$ is two-wise independent. For two distinct
+keys $x$ and $y$, there is a position $i$ such that $x_i \neq y_i$. Without
+loss of generality, we can assume that this position is $0$. Since $y_0\neq
+x_0$, the table entry $H_0(y_0)$ is chosen independently of $H_0(x_0)$ and of
+every other table entry, so it is uniformly random given $h(x)$. It then
+follows from our key observation that $h(y)$ is uniformly random given $h(x)$
+(since $h(y) = H_0(y_0) \oplus \cdots$, where the remaining terms are
+independent of $H_0(y_0)$).
+
+We can now prove that $\mathcal{H}$ is three-wise independent. Let us
+consider three distinct keys $x$, $y$ and $z$. We claim that one of the keys
+has a position at which its character differs from the characters of the
+other two keys.
+
+Indeed, assume that this is not the case: then at each position, the
+character of each key is equal to the character of one of the other two keys.
+For example, we have $x_1 = y_1$ or $x_1 = z_1$; assume wlog that $x_1
+= y_1$. But then, since $z_1 = y_1$ or $z_1 = x_1$, we get that $x_1 = y_1
+= z_1$. We can repeat this argument at each position and obtain that
+$x=y=z$, a contradiction.
+
+Without loss of generality, assume that $z$ has a position at which it is
+different from both $x$ and $y$, and again wlog that this position is $0$.
+Then $H_0(z_0)$ is uniformly random given $h(x)$ and $h(y)$, implying that
+$h(z)$ is uniformly random given $h(x)$ and $h(y)$ (using our key observation
+again). But we already know that $h(x)$ and $h(y)$ behave like two
+independent uniform random variables since $\mathcal{H}$ is 2-wise
+independent. This implies that $h(x)$, $h(y)$ and $h(z)$ are three
+independent uniform variables and concludes our proof.
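+
+For concreteness, here is a minimal Python sketch of drawing a function from
+this family (simple tabulation: one independent fully random table per
+character position, XORed together); the parameter names are illustrative:
+\begin{verbatim}
+import random
+
+def draw_tabulation_hash(c=2, char_bits=8, out_bits=32):
+    # One fully random table H_i per character position.
+    tables = [[random.getrandbits(out_bits) for _ in range(1 << char_bits)]
+              for _ in range(c)]
+    def h(key):  # key is a tuple (x_0, ..., x_{c-1}) of characters
+        value = 0
+        for i, char in enumerate(key):
+            value ^= tables[i][char]  # h(x) = H_0(x_0) XOR ... XOR H_{c-1}(x_{c-1})
+        return value
+    return h
+\end{verbatim}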
+
+\paragraph{(b)} We give a counterexample for $u=4$, $c=2$. In this case we
+write the keys in base two. Let us consider the four different keys $x= 01$,
+$y=10$, $z=11$ and $w=00$. Let us write $a_i = H_0(i)$ and $b_i = H_1(i)$ for
+$i\in\{0,1\}$. Then we have $h(x) = b_0\oplus a_1$, $h(y) = b_1\oplus a_0$,
+$h(z) = b_1\oplus a_1$ and $h(w) = b_0\oplus a_0$. But then we see that $h(z)
+= h(x)\oplus h(y)\oplus h(w)$ (all the terms which appear twice cancel out).
+Hence, no matter the random choices made by $H_0$ and $H_1$, $h(z)$ is fully
+determined once $h(x)$, $h(y)$ and $h(w)$ have been observed.
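+Explicitly, each table entry appears exactly twice and cancels:
+\begin{displaymath}
+  h(x)\oplus h(y)\oplus h(w)
+  = (b_0\oplus a_1)\oplus(b_1\oplus a_0)\oplus(b_0\oplus a_0)
+  = a_1\oplus b_1 = h(z)
+\end{displaymath}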
+
+\paragraph{(c)} We first prove the hint.
+
+\begin{lemma}
+ Any set $S$ of size $k\geq 2$ contains a subset $S'$ of size at least $\log
+ k$ on which $\mathcal{H}$ behaves completely randomly.
+\end{lemma}
+
+\begin{proof}
+ We prove this lemma by induction on $k\geq 2$.
+
+ For $k=2$, $\log k = 1$. $\mathcal{H}$ behaves completely randomly on any
+ set of size 1.
+
+  Assume that the lemma is true for any set of size smaller than $k$, and let
+  us consider a set $S$ of size $k\geq 3$. There exists a position such that
+  not all words in $S$ have the same character at this position (otherwise
+  all words in $S$ would be equal and $S$ would be of size 1). Wlog, let us
+  assume that this position is $0$.
+
+  Let us denote by $l\geq 2$ the number of different characters which appear at
+  position $0$ in words of $S$: $l \eqdef |\{x_0: x\in S\}|$. We can now cluster
+ the words of $S$ in $l$ different buckets, each bucket containing all the
+ words sharing the same character at position $0$. Let us denote by $S^*$
+ the largest of these buckets. We have that $|S^*|\geq \frac{k}{l}$. We can
+ now construct a subset $S'$ as follows:
+  \begin{itemize*}
+      \item Apply the induction hypothesis to $S^*$ and obtain a subset
+          $T\subseteq S^*$ on which $\mathcal{H}$ behaves completely randomly
+          (if $|S^*|=1$, simply take $T = S^*$).
+      \item Keep exactly one word in each of the remaining $l-1$ buckets
+          other than $S^*$ to create a second set $T'$.
+      \item Let $S' = T'\cup T$.
+  \end{itemize*}
+ We now prove that $S'$ satisfies the requirements of the lemma:
+ \begin{itemize*}
+      \item By the induction hypothesis, $|T| \geq \log\frac{k}{l} = \log
+          k -\log l$. By construction, $|T'| = l-1$ and $T$ and $T'$ are
+          disjoint. Hence $|S'| \geq \log k - \log l + l-1 \geq \log k$, since
+          $l-1\geq \log l$. Thus, $S'$ has the required size.
+      \item By construction, all the elements in $T'$ have distinct
+          characters at position 0, which also differ from the (common)
+          character at position 0 of the words in $T$. Hence, given the words
+          in $T$, $\mathcal{H}$ behaves completely randomly on $T'$ by the
+          same observation as the one we used in \textbf{(a)}. But by the
+          induction hypothesis, $\mathcal{H}$ behaves completely randomly on
+          $T$. As a consequence, $\mathcal{H}$ behaves completely randomly on
+          $S'$.
+ \end{itemize*}
+ This shows that the lemma is true for any set of size $k$ and concludes the
+ proof.
+\end{proof}
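+
+As a small illustration of the construction, take $S = \{00, 01, 11\}$ (with
+position $0$ the rightmost character, as in \textbf{(b)}). Position $0$
+splits $S$ into the buckets $\{00\}$ and $\{01, 11\}$, so $S^* = \{01, 11\}$;
+recursing on $S^*$ yields, say, $T = \{01\}$, and keeping one word from the
+other bucket gives $T' = \{00\}$. Then $S' = \{00, 01\}$ has size $2\geq\log
+3$, and $\mathcal{H}$ behaves completely randomly on it since the two words
+differ at position $0$.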
+
+Let us denote by $L$ the longest linked list and by $l$ its length (this is
+a random variable):
+\begin{align*}
+ \Pr[l\geq k] &\leq \Pr[\exists\, k\text{ elements mapping to $L$}]\\
+ &\leq \Pr[\exists\, S\text{ s.t. } |S|\geq\log k, \text{ $S$ maps to $L$ and
+ $\mathcal{H}$ is completely random over $S$}]
+\end{align*}
+where the second inequality follows from the lemma. Now applying a union
+bound over sets $S$ of size $\log k$ (there are $m$ choices for the common
+bucket and, since $\mathcal{H}$ is completely random over $S$, each of the
+$\log k$ elements of $S$ lands in that bucket with probability $\frac{1}{m}$):
+\begin{align*}
+  \Pr[l\geq k] &\leq \sum_{S,\, |S| = \log k}\Pr[S\text{ maps to $L$ and
+  $\mathcal{H}$ random over $S$}]\\
+  &\leq\binom{n}{\log k}\, m\,\frac{1}{m^{\log k}}
+  \leq\frac{n^{\log k}}{(\log k)!}\frac{1}{m^{\log k -1}}
+\end{align*}
+Using $m> n^{1.01}$, we get:
+\begin{displaymath}
+  \Pr[l\geq k] \leq \frac{n^{-0.01\log k + 1.01}}{(\log k)!}
+\end{displaymath}
+since $n^{\log k}/m^{\log k - 1} \leq n^{\log k - 1.01(\log k - 1)}
+= n^{-0.01\log k + 1.01}$. We see that by taking $k$ large enough that
+$-0.01\log k + 1.01$ becomes negative (any constant $k > 2^{101}$ works), the
+numerator becomes smaller than 1 (regardless of $n$). Consequently, there
+exists a universal constant $K$ such that:
+\begin{displaymath}
+ \Pr[l\geq K]\leq \frac{1}{3}
+\end{displaymath}
+In other words, the probability that the largest list has length at least $K$
+is at most $\frac{1}{3}$; hence, with probability at least $\frac{2}{3}$, all
+the linked lists have length $O(1)$.
+
+\section*{Problem 3}
+
+\paragraph{(a)} We define our scheme $Z$ to be the standard unbiased estimator
+of $\Pr[h(A) = h(B)]$ for $h$ uniformly random: the fraction of indices
+$1\leq i\leq k$ for which $h_i(A)$ agrees with $h_i(B)$. Formally:
+\begin{displaymath}
+ \boxed{Z \eqdef \frac{1}{k}\sum_{i=1}^k \mathbf{1}\{h_i(A) = h_i(B)\}}
+\end{displaymath}
+Let $X_i \eqdef \mathbf{1}\{h_i(A) = h_i(B)\}$ and $p\eqdef J(A,B)$. We have
+$\E[X_i] = \Pr_{h\sim\mathcal{U}}[h(A) = h(B)] = J(A, B) = p$. Hence:
+\begin{displaymath}
+ \Pr[|Z-J(A,B)|\geq \eps]
+ = \Pr\left[\left|\sum_{i=1}^k (X_i - p)\right|\geq k\eps\right]
+\end{displaymath}
+Using Chebyshev's inequality (moment method of order 2):
+\begin{displaymath}
+ \Pr[|Z-J(A,B)|\geq \eps]
+ \leq\frac{1}{(k\eps)^2}\E\left[\left(\sum_{i=1}^k(X_i - p)\right)^2\right]
+\end{displaymath}
+We have:
+\begin{displaymath}
+ \E\left[\left(\sum_{i=1}^k(X_i - p)\right)^2\right]
+ = \sum_{1\leq i,j\leq k}\E\big[(X_i-p)(X_j-p)\big]
+\end{displaymath}
+By pairwise independence of $X_1,\ldots, X_k$, and since $X_i-p$ has mean
+zero, we have $\E\big[(X_i-p)(X_j-p)\big] = \E[X_i-p]\,\E[X_j-p] = 0$
+whenever $i\neq j$. Hence:
+\begin{displaymath}
+ \Pr[|Z-J(A,B)|\geq \eps]
+ \leq \frac{1}{(k\eps)^2}\sum_{i=1}^k\E[(X_i-p)^2]\leq \frac{1}{4k\eps^2}
+\end{displaymath}
+where the last inequality uses $\E[(X_i-p)^2] = p(1-p)\leq \frac{1}{4}$. Hence,
+choosing $k=\left\lceil\frac{3}{4\eps^2}\right\rceil$ is enough to obtain:
+\begin{displaymath}
+ \Pr[|Z-J(A,B)|\geq \eps]\leq \frac{1}{3}
+\end{displaymath}
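+
+A minimal Python sketch of this estimator, assuming the $h_i$ are min-wise
+hash functions (simulated below with fresh random priorities standing in for
+uniformly random permutations; all names are illustrative):
+\begin{verbatim}
+import random
+
+def jaccard_estimate(A, B, k):
+    # Z = fraction of the k min-hashes on which A and B agree.
+    agree = 0
+    for _ in range(k):
+        prio = {u: random.random() for u in A | B}  # random priorities
+        if min(prio[a] for a in A) == min(prio[b] for b in B):
+            agree += 1
+    return agree / k
+
+# Example: J({1,2,3}, {2,3,4}) = 2/4; the estimate concentrates around 0.5.
+print(jaccard_estimate({1, 2, 3}, {2, 3, 4}, k=10_000))
+\end{verbatim}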
+
+\paragraph{(b)}
+\begin{itemize*}
+ \item In case we want a probability of success of $\delta$,
+\end{itemize*}
+
+\section*{Problem 4}
+
+Time spent: Problem 1 (4 hours), Problem 2 (3 hours), Problem 3 (2 hours).
+\textbf{Total}: 9 hours.
+
+\end{document}