| author | Thibaut Horel <thibaut.horel@gmail.com> | 2015-11-16 12:35:05 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2015-11-16 12:35:05 -0500 |
| commit | adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1 (patch) | |
| tree | 9b0065b6215919e86fc0ea3f377ea6bf536b4bf2 | |
| parent | 61f644a6a7d36dc5c15d957c48d10675ab3627ae (diff) | |
| download | cs281-master.tar.gz | |
| -rw-r--r-- | hw4/2.pdf | bin | 0 -> 13566 bytes |
| -rw-r--r-- | hw4/2.py | 81 | |
| -rw-r--r-- | hw4/2.tex | 83 | |
| -rw-r--r-- | hw4/3.py | 89 | |
| -rw-r--r-- | hw4/def.tex | 95 | |
| -rw-r--r-- | hw4/harvardml.cls | 97 | |
| -rw-r--r-- | hw4/main.tex | 262 | |
| -rw-r--r-- | hw4/plot.py | 45 | |
| -rw-r--r-- | hw4/test.pdf | bin | 0 -> 17947 bytes |
| -rw-r--r-- | hw4/train.pdf | bin | 0 -> 17764 bytes |
10 files changed, 752 insertions, 0 deletions
diff --git a/hw4/2.pdf b/hw4/2.pdf Binary files differnew file mode 100644 index 0000000..fe7cdae --- /dev/null +++ b/hw4/2.pdf diff --git a/hw4/2.py b/hw4/2.py new file mode 100644 index 0000000..275be3e --- /dev/null +++ b/hw4/2.py @@ -0,0 +1,81 @@ +import sys +from itertools import islice +import numpy as np +from scipy.sparse import coo_matrix +from math import sqrt + + +def get_ratings(filename): + with open(filename) as fh: + for line in fh: + yield map(int, line.strip().split()) + + +def get_train_test(filename): + l = [(i, j) for (i, j, _) in get_ratings(filename)] + n = max(i for (i, _) in l) + m = max(j for (_, j) in l) + g = get_ratings(filename) + train = islice(g, 100000) + test = islice(g, 100000) + return n, m, list(train), list(test) + + +def sparse_matrix(ratings): + i, j, data = zip(*ratings) + S = coo_matrix((data, (i, j))) + return S.tocsc(), S.tocsr() + + +def get_users(Rr): + return [i for i in xrange(Rr.shape[0]) if len(Rr[i].nonzero()[1])] + + +def get_jokes(Rc): + return [j for j in xrange(Rc.shape[1]) if len(Rc[:, j].nonzero()[0])] + + +def sample_users(U, V, Rr, users): + for i in users: + r = Rr[i] + ind = r.nonzero()[1] + v = V[ind] + isigma = np.identity(k) / 5. + np.dot(v.T, v) + sigma = np.linalg.inv(isigma) + U[i] = np.random.multivariate_normal(np.dot(sigma, r.dot(V)[0]), + sigma) + + +def sample_jokes(U, V, Rc, jokes): + for j in jokes: + r = Rc[:, j] + u = U[r.nonzero()[0]] + isigma = np.identity(k) / 5. + np.dot(u.T, u) + sigma = np.linalg.inv(isigma) + V[j] = np.random.multivariate_normal(np.dot(sigma, r.T.dot(U)[0]), + sigma) + + +def sample(U, V, Rr, Rc, users, jokes): + sample_users(U, V, Rr, users) + sample_jokes(U, V, Rc, jokes) + + +def likelihood(ratings, U, V): + return sum((r - np.inner(U[i], V[j])) ** 2 for i, j, r in ratings) + + +if __name__ == "__main__": + n, m, train, test = get_train_test(sys.argv[1]) + Rc, Rr = sparse_matrix(train) + users = get_users(Rr) # users with at least one rating + jokes = get_jokes(Rc) # jokes with at least one rating + for k in xrange(1, 11): + with open("gibbs_" + str(k) + ".txt", "w") as fh: + U = np.random.normal(0, sqrt(5), size=(Rc.shape[0], k)) + V = np.random.normal(0, sqrt(5), size=(Rc.shape[1], k)) + for e in xrange(100): + fh.write("\t".join(map(str, [e, likelihood(train, U, V), + likelihood(test, U, V)])) + "\n") + fh.flush() + sample(U, V, Rr, Rc, users, jokes) diff --git a/hw4/2.tex b/hw4/2.tex new file mode 100644 index 0000000..6376208 --- /dev/null +++ b/hw4/2.tex @@ -0,0 +1,83 @@ +\begin{Verbatim}[commandchars=\\\{\}] +\PY{k+kn}{import} \PY{n+nn}{sys} +\PY{k+kn}{from} \PY{n+nn}{itertools} \PY{k+kn}{import} \PY{n}{islice} +\PY{k+kn}{import} \PY{n+nn}{numpy} \PY{k+kn}{as} \PY{n+nn}{np} +\PY{k+kn}{from} \PY{n+nn}{scipy.sparse} \PY{k+kn}{import} \PY{n}{coo\PYZus{}matrix} +\PY{k+kn}{from} \PY{n+nn}{math} \PY{k+kn}{import} \PY{n}{sqrt} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}ratings}\PY{p}{(}\PY{n}{filename}\PY{p}{)}\PY{p}{:} + \PY{k}{with} \PY{n+nb}{open}\PY{p}{(}\PY{n}{filename}\PY{p}{)} \PY{k}{as} \PY{n}{fh}\PY{p}{:} + \PY{k}{for} \PY{n}{line} \PY{o+ow}{in} \PY{n}{fh}\PY{p}{:} + \PY{k}{yield} \PY{n+nb}{map}\PY{p}{(}\PY{n+nb}{int}\PY{p}{,} \PY{n}{line}\PY{o}{.}\PY{n}{strip}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{split}\PY{p}{(}\PY{p}{)}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}train\PYZus{}test}\PY{p}{(}\PY{n}{filename}\PY{p}{)}\PY{p}{:} + \PY{n}{l} \PY{o}{=} \PY{p}{[}\PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{)} \PY{k}{for} \PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{,} \PY{n}{\PYZus{}}\PY{p}{)} 
\PY{o+ow}{in} \PY{n}{get\PYZus{}ratings}\PY{p}{(}\PY{n}{filename}\PY{p}{)}\PY{p}{]} + \PY{n}{n} \PY{o}{=} \PY{n+nb}{max}\PY{p}{(}\PY{n}{i} \PY{k}{for} \PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{\PYZus{}}\PY{p}{)} \PY{o+ow}{in} \PY{n}{l}\PY{p}{)} + \PY{n}{m} \PY{o}{=} \PY{n+nb}{max}\PY{p}{(}\PY{n}{j} \PY{k}{for} \PY{p}{(}\PY{n}{\PYZus{}}\PY{p}{,} \PY{n}{j}\PY{p}{)} \PY{o+ow}{in} \PY{n}{l}\PY{p}{)} + \PY{n}{g} \PY{o}{=} \PY{n}{get\PYZus{}ratings}\PY{p}{(}\PY{n}{filename}\PY{p}{)} + \PY{n}{train} \PY{o}{=} \PY{n}{islice}\PY{p}{(}\PY{n}{g}\PY{p}{,} \PY{l+m+mi}{100000}\PY{p}{)} + \PY{n}{test} \PY{o}{=} \PY{n}{islice}\PY{p}{(}\PY{n}{g}\PY{p}{,} \PY{l+m+mi}{100000}\PY{p}{)} + \PY{k}{return} \PY{n}{n}\PY{p}{,} \PY{n}{m}\PY{p}{,} \PY{n+nb}{list}\PY{p}{(}\PY{n}{train}\PY{p}{)}\PY{p}{,} \PY{n+nb}{list}\PY{p}{(}\PY{n}{test}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{sparse\PYZus{}matrix}\PY{p}{(}\PY{n}{ratings}\PY{p}{)}\PY{p}{:} + \PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{,} \PY{n}{data} \PY{o}{=} \PY{n+nb}{zip}\PY{p}{(}\PY{o}{*}\PY{n}{ratings}\PY{p}{)} + \PY{n}{S} \PY{o}{=} \PY{n}{coo\PYZus{}matrix}\PY{p}{(}\PY{p}{(}\PY{n}{data}\PY{p}{,} \PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{)}\PY{p}{)}\PY{p}{)} + \PY{k}{return} \PY{n}{S}\PY{o}{.}\PY{n}{tocsc}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{S}\PY{o}{.}\PY{n}{tocsr}\PY{p}{(}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}users}\PY{p}{(}\PY{n}{Rr}\PY{p}{)}\PY{p}{:} + \PY{k}{return} \PY{p}{[}\PY{n}{i} \PY{k}{for} \PY{n}{i} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{n}{Rr}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)} \PY{k}{if} \PY{n+nb}{len}\PY{p}{(}\PY{n}{Rr}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)}\PY{p}{]} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}jokes}\PY{p}{(}\PY{n}{Rc}\PY{p}{)}\PY{p}{:} + \PY{k}{return} \PY{p}{[}\PY{n}{j} \PY{k}{for} \PY{n}{j} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{n}{Rc}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)} \PY{k}{if} \PY{n+nb}{len}\PY{p}{(}\PY{n}{Rc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{n}{j}\PY{p}{]}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{p}{]} + + +\PY{k}{def} \PY{n+nf}{sample\PYZus{}users}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{users}\PY{p}{)}\PY{p}{:} + \PY{k}{for} \PY{n}{i} \PY{o+ow}{in} \PY{n}{users}\PY{p}{:} + \PY{n}{r} \PY{o}{=} \PY{n}{Rr}\PY{p}{[}\PY{n}{i}\PY{p}{]} + \PY{n}{ind} \PY{o}{=} \PY{n}{r}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]} + \PY{n}{v} \PY{o}{=} \PY{n}{V}\PY{p}{[}\PY{n}{ind}\PY{p}{]} + \PY{n}{isigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{identity}\PY{p}{(}\PY{n}{k}\PY{p}{)} \PY{o}{/} \PY{l+m+mf}{5.} \PY{o}{+} \PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{v}\PY{o}{.}\PY{n}{T}\PY{p}{,} \PY{n}{v}\PY{p}{)} + \PY{n}{sigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linalg}\PY{o}{.}\PY{n}{inv}\PY{p}{(}\PY{n}{isigma}\PY{p}{)} + \PY{n}{U}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{multivariate\PYZus{}normal}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{sigma}\PY{p}{,} \PY{n}{r}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{V}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{p}{,} + \PY{n}{sigma}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{sample\PYZus{}jokes}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{jokes}\PY{p}{)}\PY{p}{:} + \PY{k}{for} \PY{n}{j} \PY{o+ow}{in} \PY{n}{jokes}\PY{p}{:} + \PY{n}{r} \PY{o}{=} \PY{n}{Rc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{n}{j}\PY{p}{]} + 
\PY{n}{u} \PY{o}{=} \PY{n}{U}\PY{p}{[}\PY{n}{r}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{]} + \PY{n}{isigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{identity}\PY{p}{(}\PY{n}{k}\PY{p}{)} \PY{o}{/} \PY{l+m+mf}{5.} \PY{o}{+} \PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{u}\PY{o}{.}\PY{n}{T}\PY{p}{,} \PY{n}{u}\PY{p}{)} + \PY{n}{sigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linalg}\PY{o}{.}\PY{n}{inv}\PY{p}{(}\PY{n}{isigma}\PY{p}{)} + \PY{n}{V}\PY{p}{[}\PY{n}{j}\PY{p}{]} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{multivariate\PYZus{}normal}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{sigma}\PY{p}{,} \PY{n}{r}\PY{o}{.}\PY{n}{T}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{U}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{p}{,} + \PY{n}{sigma}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{sample}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{users}\PY{p}{,} \PY{n}{jokes}\PY{p}{)}\PY{p}{:} + \PY{n}{sample\PYZus{}users}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{users}\PY{p}{)} + \PY{n}{sample\PYZus{}jokes}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{jokes}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{likelihood}\PY{p}{(}\PY{n}{ratings}\PY{p}{,} \PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{)}\PY{p}{:} + \PY{k}{return} \PY{n+nb}{sum}\PY{p}{(}\PY{p}{(}\PY{n}{r} \PY{o}{\PYZhy{}} \PY{n}{np}\PY{o}{.}\PY{n}{inner}\PY{p}{(}\PY{n}{U}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{,} \PY{n}{V}\PY{p}{[}\PY{n}{j}\PY{p}{]}\PY{p}{)}\PY{p}{)} \PY{o}{*}\PY{o}{*} \PY{l+m+mi}{2} \PY{k}{for} \PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{,} \PY{n}{r} \PY{o+ow}{in} \PY{n}{ratings}\PY{p}{)} + + +\PY{k}{if} \PY{n}{\PYZus{}\PYZus{}name\PYZus{}\PYZus{}} \PY{o}{==} \PY{l+s}{\PYZdq{}}\PY{l+s}{\PYZus{}\PYZus{}main\PYZus{}\PYZus{}}\PY{l+s}{\PYZdq{}}\PY{p}{:} + \PY{n}{n}\PY{p}{,} \PY{n}{m}\PY{p}{,} \PY{n}{train}\PY{p}{,} \PY{n}{test} \PY{o}{=} \PY{n}{get\PYZus{}train\PYZus{}test}\PY{p}{(}\PY{n}{sys}\PY{o}{.}\PY{n}{argv}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)} + \PY{n}{Rc}\PY{p}{,} \PY{n}{Rr} \PY{o}{=} \PY{n}{sparse\PYZus{}matrix}\PY{p}{(}\PY{n}{train}\PY{p}{)} + \PY{n}{users} \PY{o}{=} \PY{n}{get\PYZus{}users}\PY{p}{(}\PY{n}{Rr}\PY{p}{)} \PY{c}{\PYZsh{} users with at least one rating} + \PY{n}{jokes} \PY{o}{=} \PY{n}{get\PYZus{}jokes}\PY{p}{(}\PY{n}{Rc}\PY{p}{)} \PY{c}{\PYZsh{} jokes with at least one rating} + \PY{k}{for} \PY{n}{k} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{11}\PY{p}{)}\PY{p}{:} + \PY{k}{with} \PY{n+nb}{open}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{gibbs\PYZus{}}\PY{l+s}{\PYZdq{}} \PY{o}{+} \PY{n+nb}{str}\PY{p}{(}\PY{n}{k}\PY{p}{)} \PY{o}{+} \PY{l+s}{\PYZdq{}}\PY{l+s}{.txt}\PY{l+s}{\PYZdq{}}\PY{p}{,} \PY{l+s}{\PYZdq{}}\PY{l+s}{w}\PY{l+s}{\PYZdq{}}\PY{p}{)} \PY{k}{as} \PY{n}{fh}\PY{p}{:} + \PY{n}{U} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{normal}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{sqrt}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{size}\PY{o}{=}\PY{p}{(}\PY{n}{Rc}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{,} \PY{n}{k}\PY{p}{)}\PY{p}{)} + \PY{n}{V} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{normal}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{sqrt}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{size}\PY{o}{=}\PY{p}{(}\PY{n}{Rc}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{,} \PY{n}{k}\PY{p}{)}\PY{p}{)} + \PY{k}{for} \PY{n}{e} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{:} + 
\PY{n}{fh}\PY{o}{.}\PY{n}{write}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s+se}{\PYZbs{}t}\PY{l+s}{\PYZdq{}}\PY{o}{.}\PY{n}{join}\PY{p}{(}\PY{n+nb}{map}\PY{p}{(}\PY{n+nb}{str}\PY{p}{,} \PY{p}{[}\PY{n}{e}\PY{p}{,} \PY{n}{likelihood}\PY{p}{(}\PY{n}{train}\PY{p}{,} \PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{)}\PY{p}{,} + \PY{n}{likelihood}\PY{p}{(}\PY{n}{test}\PY{p}{,} \PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{)}\PY{p}{]}\PY{p}{)}\PY{p}{)} \PY{o}{+} \PY{l+s}{\PYZdq{}}\PY{l+s+se}{\PYZbs{}n}\PY{l+s}{\PYZdq{}}\PY{p}{)} + \PY{n}{fh}\PY{o}{.}\PY{n}{flush}\PY{p}{(}\PY{p}{)} + \PY{n}{sample}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{users}\PY{p}{,} \PY{n}{jokes}\PY{p}{)} +\end{Verbatim} diff --git a/hw4/3.py b/hw4/3.py new file mode 100644 index 0000000..24c40a6 --- /dev/null +++ b/hw4/3.py @@ -0,0 +1,89 @@ +import sys +from itertools import islice + +import autograd.numpy as np +from autograd import grad + + +def get_ratings(filename): + with open(filename) as fh: + for line in fh: + yield map(int, line.strip().split()) + + +def get_train_test(filename): + l = [(i, j) for (i, j, _) in get_ratings(filename)] + n = max(i for (i, _) in l) + m = max(j for (_, j) in l) + g = get_ratings(filename) + train = islice(g, 100000) + test = islice(g, 100000) + return n, m, list(train), list(test) + + +def kl(muu, muv, sigmau, sigmav): + sigma = 5. + r = (np.log(sigma / sigmav).sum() + np.log(sigma / sigmau).sum() + + np.sum(sigmau ** 2 / (2 * sigma ** 2)) + + np.sum(sigmav ** 2 / (2 * sigma ** 2)) + + np.sum(muu ** 2 / (2 * sigma ** 2)) + + np.sum(muv ** 2 / (2 * sigma ** 2)) + ) + return r + + +def likelihood(params, r): + su, mu, sv, mv = params + s = len(mu) + ns = 50 + u = su * np.random.normal(size=(ns, s)) + mu + v = sv * np.random.normal(size=(ns, s)) + mv + p = np.sum(u * v, axis=1) + m = np.sum((r - p) ** 2 / 2.) + return m / float(ns) + +gl = grad(likelihood) + + +def sgd(ratings, muu, muv, sigmau, sigmav): + lamb = 0.05 + sigma = 5. + eps = 1e-5 + + for e in xrange(10): + print l(ratings, muu, muv, sigmau, sigmav) + # gradient update on the KL part + sigmau = sigmau + lamb * 1. / sigmau + sigmav = sigmav + lamb * 1. 
/ sigmav + sigmau = sigmau - lamb * sigmau / (sigma ** 2) + sigmav = sigmav - lamb * sigmav / (sigma ** 2) + muu = muu - lamb * muu / (sigma ** 2) + muv = muv - lamb * muv / (sigma ** 2) + + for k, (i, j, r) in enumerate(ratings): + if k % 1000 == 0: + print k + su, mu, sv, mv = sigmau[i], muu[i], sigmav[j], muv[j] + gsu, gmu, gsv, gmv = gl((su, mu, sv, mv), r) + sigmau[i] = np.maximum(sigmau[i] - lamb * gsu, eps) + muu[i] = muu[i] - lamb * gmu + sigmav[j] = np.maximum(sigmav[j] - lamb * gsv, eps) + muv[j] = muv[j] - lamb * gmv + + +def l(ratings, muu, muv, sigmau, sigmav): + res = kl(muu, muv, sigmau, sigmav) + for (i, j, r) in ratings: + su, mu, sv, mv = sigmau[i], muu[i], sigmav[j], muv[j] + res += likelihood((su, mu, sv, mv), r) + return res + + +if __name__ == "__main__": + n, m, train, test = get_train_test(sys.argv[1]) + print len(train) + k = 2 + muu, sigmau = 0.01 * np.random.normal(size=(n+1, k)), 0.01 * np.ones((n+1, k)) + muv, sigmav = 0.01 * np.random.normal(size=(m+1, k)), 0.01 * np.ones((n+1, k)) + sgd(train, muu, muv, sigmau, sigmav) + print l(train, muu, muv, sigmau, sigmav) diff --git a/hw4/def.tex b/hw4/def.tex new file mode 100644 index 0000000..c2e21ce --- /dev/null +++ b/hw4/def.tex @@ -0,0 +1,95 @@ +\usepackage{fancyvrb} +\usepackage{color} +\makeatletter +\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax% + \let\PY@ul=\relax \let\PY@tc=\relax% + \let\PY@bc=\relax \let\PY@ff=\relax} +\def\PY@tok#1{\csname PY@tok@#1\endcsname} +\def\PY@toks#1+{\ifx\relax#1\empty\else% + \PY@tok{#1}\expandafter\PY@toks\fi} +\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{% + \PY@it{\PY@bf{\PY@ff{#1}}}}}}} +\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}} + +\expandafter\def\csname PY@tok@mo\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@ne\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.82,0.25,0.23}{##1}}} +\expandafter\def\csname PY@tok@cs\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@mf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@nl\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.63,0.00}{##1}}} +\expandafter\def\csname PY@tok@kn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@il\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@sr\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}} +\expandafter\def\csname PY@tok@ss\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@vi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@se\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.13}{##1}}} +\expandafter\def\csname PY@tok@vg\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@gi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.63,0.00}{##1}}} +\expandafter\def\csname PY@tok@sb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@nf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\expandafter\def\csname PY@tok@sd\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@gh\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}} +\expandafter\def\csname PY@tok@sc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} 
+\expandafter\def\csname PY@tok@kt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}} +\expandafter\def\csname PY@tok@ni\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.60,0.60,0.60}{##1}}} +\expandafter\def\csname PY@tok@vc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@o\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@gr\endcsname{\def\PY@tc##1{\textcolor[rgb]{1.00,0.00,0.00}{##1}}} +\expandafter\def\csname PY@tok@cm\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@w\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}} +\expandafter\def\csname PY@tok@cp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.74,0.48,0.00}{##1}}} +\expandafter\def\csname PY@tok@go\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}} +\expandafter\def\csname PY@tok@gd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}} +\expandafter\def\csname PY@tok@mb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@gt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.27,0.87}{##1}}} +\expandafter\def\csname PY@tok@no\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}} +\expandafter\def\csname PY@tok@mh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@c1\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@nb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@nd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}} +\expandafter\def\csname PY@tok@mi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@kd\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@s2\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@gp\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}} +\expandafter\def\csname PY@tok@si\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}} +\expandafter\def\csname PY@tok@na\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.49,0.56,0.16}{##1}}} +\expandafter\def\csname PY@tok@nc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\expandafter\def\csname PY@tok@kp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@k\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@c\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@s\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@sx\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@s1\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@gu\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}} +\expandafter\def\csname PY@tok@bp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@kr\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@nn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} 
+\expandafter\def\csname PY@tok@err\endcsname{\def\PY@bc##1{\setlength{\fboxsep}{0pt}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}} +\expandafter\def\csname PY@tok@nv\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@ge\endcsname{\let\PY@it=\textit} +\expandafter\def\csname PY@tok@kc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@sh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@m\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@nt\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@gs\endcsname{\let\PY@bf=\textbf} +\expandafter\def\csname PY@tok@ow\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}} + +\def\PYZbs{\char`\\} +\def\PYZus{\char`\_} +\def\PYZob{\char`\{} +\def\PYZcb{\char`\}} +\def\PYZca{\char`\^} +\def\PYZam{\char`\&} +\def\PYZlt{\char`\<} +\def\PYZgt{\char`\>} +\def\PYZsh{\char`\#} +\def\PYZpc{\char`\%} +\def\PYZdl{\char`\$} +\def\PYZhy{\char`\-} +\def\PYZsq{\char`\'} +\def\PYZdq{\char`\"} +\def\PYZti{\char`\~} +% for compatibility with earlier versions +\def\PYZat{@} +\def\PYZlb{[} +\def\PYZrb{]} +\makeatother diff --git a/hw4/harvardml.cls b/hw4/harvardml.cls new file mode 100644 index 0000000..e285173 --- /dev/null +++ b/hw4/harvardml.cls @@ -0,0 +1,97 @@ +% Ryan Adams +% School of Engineering and Applied Sciences +% Harvard University +% v0.01, 31 August 2013 +% Based on HMC Math Dept. template by Eric J. Malm. +\NeedsTeXFormat{LaTeX2e}[1995/01/01] +\ProvidesClass{harvardml} +[2013/08/31 v0.01 Harvard ML Assignment Class] + +\RequirePackage{ifpdf} + +\newif\ifhmlset@submit +\DeclareOption{submit}{% + \hmlset@submittrue% +} +\DeclareOption{nosubmit}{% + \hmlset@submitfalse% +} + +\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} +\ExecuteOptions{nosubmit} +\ProcessOptions\relax + +\LoadClass[10pt,letterpaper]{article} + +\newif\ifhmlset@header + +\hmlset@headertrue + +\RequirePackage{mathpazo} +\RequirePackage{palatino} +\RequirePackage{amsmath} +\RequirePackage{amssymb} +\RequirePackage{amsthm} +\RequirePackage{fullpage} +\RequirePackage{mdframed} + +\newtheoremstyle{hmlplain} + {3pt}% Space above + {3pt}% Space below + {}% Body font + {}% Indent amount + {\bfseries}% Theorem head font + {\\*[3pt]}% Punctuation after theorem head + {.5em}% Space after theorem head + {}% Theorem head spec (can be left empty, meaning `normal') + +\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip} + +\newcommand{\headerblock}{% + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushleft} + \ifhmlset@submit + \mbox{\hmlset@name}\\ + \mbox{\tt \hmlset@email}\\ + \mbox{\hmlset@course} + \fi + \end{flushleft} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{center} + \mbox{\Large\hmlset@assignment}\protect\\ + Due: \hmlset@duedate + \end{center} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushright} + \ifhmlset@submit + Collaborators: \hmlset@collaborators + \fi + \end{flushright} + \end{minipage} + \vspace{0.1cm} + \titlebar +} + +\ifhmlset@header\AtBeginDocument{\headerblock}\fi + +\def\hmlset@name{} +\def\hmlset@email{} +\def\hmlset@course{} +\def\hmlset@assignment{} +\def\hmlset@duedate{} +\def\hmlset@collaborators{} +\def\hmlset@extraline{} + +% commands to set header block info +\newcommand{\name}[1]{\def\hmlset@name{#1}} 
+\newcommand{\email}[1]{\def\hmlset@email{#1}} +\newcommand{\course}[1]{\def\hmlset@course{#1}} +\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}} +\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}} +\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}} +\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}} + +\theoremstyle{hmlplain} +\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem} diff --git a/hw4/main.tex b/hw4/main.tex new file mode 100644 index 0000000..414f37d --- /dev/null +++ b/hw4/main.tex @@ -0,0 +1,262 @@ +% No 'submit' option for the problems by themselves. +\documentclass[submit]{harvardml} +% Use the 'submit' option when you submit your solutions. +%\documentclass[submit]{harvardml} + +\usepackage{url} + +% Put in your full name and email address. +\name{Thibaut Horel} +\email{thorel@seas.harvard.edu} + +% List any people you worked with. +\collaborators{% +} + +% You don't need to change these. +\course{CS281-F13} +\assignment{Assignment \#4} +\duedate{23:59pm November 13, 2015} + +% Useful macros +\newcommand{\bw}{\boldsymbol{w}} +\newcommand{\distNorm}{\mathcal{N}} +\newcommand{\given}{\,|\,} +\newcommand{\ident}{\mathbb{I}} +\newcommand{\btheta}{\boldsymbol{\theta}} +\newcommand{\bz}{\boldsymbol{z}} +\newcommand{\balpha}{\boldsymbol{\alpha}} +\newcommand{\bbeta}{\boldsymbol{\beta}} +\usepackage{amsmath} +\usepackage{hyperref} +\usepackage{graphicx} + +% Some useful macros. +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\var}{\text{var}} +\newcommand{\cov}{\text{cov}} +\newcommand{\N}{\mathcal{N}} +\newcommand{\ep}{\varepsilon} + +\theoremstyle{plain} +\newtheorem{lemma}{Lemma} +\input{def.tex} +\begin{document} + +\begin{problem}[10 pts] + For a target density~$p(x)$ and proposal + density~$q(x'\gets x)$, the Metropolis--Hastings transition + operator is given by + \begin{align*} + T(x'\gets x) &= q(x'\gets x)\,\min\left\{1, + \frac{\tilde p(x')\,q(x\gets x')}{\tilde p(x)\,q(x'\gets x)} + \right\} + \end{align*} + where $\tilde p(x)$ is the unnormalized target density. + + Show that this transition operator satisfies detailed balance. +\end{problem} + +\paragraph{Solution.} We need to show that for any two states $x$ and $x'$: +\begin{displaymath} + p(x)T(x'\gets x) = p(x')T(x\gets x') +\end{displaymath} +Let us expand both sides, we need to show that: +\begin{displaymath} + p(x) q(x'\gets x)\,\min\left\{1, + \frac{\tilde p(x')\,q(x\gets x')}{\tilde p(x)\,q(x'\gets x)} + \right\} + = p(x')q(x\gets x')\,\min\left\{1, + \frac{\tilde p(x)\,q(x'\gets x)}{\tilde p(x')\,q(x\gets x')} + \right\} +\end{displaymath} +We distinguish two cases: +\begin{itemize} + \item $\tilde{p}(x')q(x\gets x') \leq \tilde{p}(x)q(x'\gets x)$: in that + case the $\min$ in the left-hand side collapses to its second argument + and the left-hand side collapses to: + \begin{displaymath} + p(x')q(x\gets x') + \end{displaymath} + But in that case the $\min$ in the right-hand side collapses to $1$ + (its first argument) and the right-hand side collapses to: + \begin{displaymath} + p(x')q(x\gets x') + \end{displaymath} + and we see that the two sides are equal. + \item $\tilde{p}(x')q(x\gets x') > \tilde{p}(x)q(x'\gets x)$: this case is + treated in exactly the same way as the above case by observing that it + is equivalent after swapping the role of $x$ and $x'$. +\end{itemize} +The two cases together conclude the proof of detailed balance. 
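A minimal Python sketch of one step of the corresponding sampler may help make the transition operator concrete. The helper names (`p_tilde`, `q_sample`, `q_density`) are assumptions for illustration and do not appear in the repository:

```python
import numpy as np

def mh_step(x, p_tilde, q_sample, q_density):
    """One Metropolis-Hastings step realizing T(x' <- x).

    p_tilde(x): unnormalized target density; q_sample(x): draw x' ~ q(. <- x);
    q_density(a, b): evaluate the proposal density q(a <- b).
    """
    x_new = q_sample(x)
    # acceptance probability: min{1, p~(x') q(x <- x') / (p~(x) q(x' <- x))}
    ratio = (p_tilde(x_new) * q_density(x, x_new)) / (p_tilde(x) * q_density(x_new, x))
    if np.random.rand() < min(1.0, ratio):
        return x_new  # accept the proposal
    return x          # reject and stay at x
```

Detailed balance of this kernel with respect to $p$ is exactly what the case analysis above verifies.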
+
+\newpage
+
+\section*{Modeling users and jokes with a Bayesian latent bilinear model}
+
+The next two questions will develop Bayesian inference methods for the simplest version of the latent bilinear model you used to model joke ratings in HW3.
+
+The data set we'll use is the same as in HW3, a modified and preprocessed variant of the Jester data set.
+However, to make things easier (and to make being Bayesian more worthwhile) {\bf we'll only use the first 100,000 joke ratings} for your training set. The second 100,000 ratings will form your test set.
+
+\subsection*{The model}
+
+The model is the same as in HW3, but with Gaussian priors on the latent parameter matrices $U$ and $V$.
+
+Let~${r_{i,j}\in\{1,2,3,4,5\}}$ be the rating of user $i$ on joke $j$. A latent linear model introduces a vector ${u_i\in\R^K}$ for each user and a vector~${v_j\in\R^K}$ for each joke. Then, each rating is modeled as a noisy version of the appropriate inner product. Specifically,
+\[
+r_{i,j} \sim \mathcal{N}(u_i^T v_j, \sigma_\epsilon^2).
+\]
+Fix $\sigma_\epsilon^2$ to be 1.0, and $K = 2$.
+
+We put independent Gaussian priors on each element of $U$ and $V$:
+\[U_{i,k} \sim \mathcal{N}(0, \sigma_U^2=5)\]
+\[V_{j,k} \sim \mathcal{N}(0, \sigma_V^2=5)\]
+
+\begin{problem}[Gibbs sampling, 30pts]
+
+\begin{enumerate}
+
+\item Write down the Gibbs update equations for $U$ and $V$. That is to say, write their conditional distributions, conditioned on all the other variables as well as the training data:
+%
+$$p(U | V, R )$$
+$$p(V | U, R )$$
+
+Because the model is bi-linear, these updates should have fairly simple forms.
+
+\item Write a Gibbs sampler for $U$ and $V$ that updates $U$ and $V$ in turn.
+
+\item Run the Gibbs sampler for 100 steps (i.e. update both $U$ and $V$ 100 times).
+Plot the training and test-set log-likelihood as a function of the number of steps through your training set.
+That is, use all previous samples of $U, V$ to evaluate the predictive probability of all ratings.
+
+\item One reason to be Bayesian is that you don't have to worry about overfitting.
+Run your Gibbs sampler for $K = 1$ to $K = 10$, and plot the training and test-set log-likelihood for each value of $K$. How do the shapes of these curves differ from the curves you saw when doing maximum likelihood estimation in HW3?
+
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} 1. Using the results about systems of Gaussians from
+Murphy's chapter 4, we get:
+\begin{displaymath}
+    u_i | R, V \sim \mathcal{N}\left(\frac{\Sigma_1}{\sigma_\epsilon^2} V_i^T r_i,
+    \Sigma_1\right)
+\end{displaymath}
+where $r_i$ is the vector of ratings given by user $i$, $V_i$ is the projection
+of matrix $V$ which only keeps the jokes (rows) rated by user $i$, and $\Sigma_1$
+is given by:
+\begin{displaymath}
+    \Sigma_1^{-1} = \frac{Id}{\sigma_U^2} + \frac{V_i^T V_i}{\sigma^2_\epsilon}
+\end{displaymath}
+
+Similarly, we get that:
+\begin{displaymath}
+    v_j | R, U \sim
+    \mathcal{N}\left(\frac{\Sigma_2}{\sigma_\epsilon^2} U_j^T r_j,
+    \Sigma_2\right)
+\end{displaymath}
+where $U_j$ is the projection of matrix $U$ obtained by only keeping the rows
+(users) which have rated joke $j$, $r_j$ is the vector of ratings given to joke
+$j$, and $\Sigma_2$ is given by:
+\begin{displaymath}
+    \Sigma_2^{-1} = \frac{Id}{\sigma_V^2} + \frac{U_j^T U_j}{\sigma^2_\epsilon}
+\end{displaymath}
+
+2. Please see the code in the appendix \textsf{2.py} (a condensed sketch of the per-user update also appears below).
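For illustration only, here is a dense-NumPy sketch of the per-user conditional update derived above; the submitted 2.py implements the same update on sparse matrices with sigma_eps^2 = 1 hard-coded. The function name and signature are assumptions, not part of the repository:

```python
import numpy as np

def sample_user(r_i, V_i, sigma_u2=5.0, sigma_eps2=1.0):
    """Draw u_i | R, V ~ N(Sigma_1 V_i^T r_i / sigma_eps2, Sigma_1)."""
    # r_i: ratings of user i (length n_i); V_i: rows of V for the jokes user i rated (n_i x K)
    K = V_i.shape[1]
    prec = np.eye(K) / sigma_u2 + V_i.T.dot(V_i) / sigma_eps2  # Sigma_1^{-1}
    cov = np.linalg.inv(prec)                                  # Sigma_1
    mean = cov.dot(V_i.T.dot(r_i)) / sigma_eps2
    return np.random.multivariate_normal(mean, cov)
```

The per-joke update is symmetric, with $U_j$ and $\sigma_V^2$ in place of $V_i$ and $\sigma_U^2$.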
+I used two copies of the ratings matrix, one in CSC format and one in CSR
+format, to be able to quickly extract $r_i$, the ratings of a given user, and
+$r_j$, the ratings of a given joke, as well as the indices of these ratings,
+which are used to index the matrices $V_i$ and $U_j$ appearing in the above
+formulas.
+
+3. The plot of the log-likelihood (up to a multiplicative factor) can be found
+   below:
+\begin{center}
+    \includegraphics[width=0.8\textwidth]{2.pdf}
+\end{center}
+
+4. For clarity I decided to plot the MSE (mean squared error) in log scale
+   instead of the log-likelihood; otherwise the lines were hard to distinguish.
+   Note that the MSE is (up to a multiplicative factor) simply the negative of
+   the log-likelihood, because we have a Gaussian model.
+
+   The plot of the training MSE for different values of $K$ can be found below:
+\begin{center}
+    \includegraphics[width=0.7\textwidth]{train.pdf}
+\end{center}
+And for the test MSE:
+\begin{center}
+    \includegraphics[width=0.7\textwidth]{test.pdf}
+\end{center}
+As we can see, the MSE is smaller for the training dataset, which is to be
+expected. However, unlike in the previous assignment, the test MSE gets better
+when $K$ increases. We do not observe the overfitting phenomenon where the
+train MSE decreases with $K$ but the test MSE starts increasing again for large
+values of $K$.
+
+\begin{problem}[Stochastic Variational Inference, 30pts]
+
+Now we'll repeat the same experiments, but using stochastic variational inference instead.
+
+Recall that variational inference optimizes a lower bound on the log marginal likelihood (integrating out parameters $\theta$), like so:
+\begin{align}
+\log p(x) & = \log \int p(x, \theta) d\theta = \log \int p(x|\theta) p(\theta) d\theta \\
+& = \log \int \frac{q_\phi(\theta)}{q_\phi(\theta)} p(x|\theta) p(\theta) d\theta
+  = \log \mathbb{E}_{q_\phi} \left[ \frac{1}{q_\phi(\theta)} p(x|\theta) p(\theta) \right] \\
+& \geq \mathbb{E}_{q_\phi} \log \left[ \frac{1}{q_\phi(\theta)} p(x|\theta) p(\theta) \right]
+  = \underbrace{-\mathbb{E}_{q_\phi} \log q_\phi(\theta)}_{\textnormal{entropy}}
+  + \underbrace{\mathbb{E}_{q_\phi} \log p(\theta)}_{\textnormal{prior}}
+  + \underbrace{\mathbb{E}_{q_\phi} \log p(x|\theta)}_{\textnormal{likelihood}}
+  = \mathcal{L}(\phi)
+\end{align}
+%
+In this case, $\theta = U,V$ and $x = R$:
+%
+\begin{align}
+\mathcal{L}(\phi) = -\mathbb{E}_{q_\phi} \log q_\phi(U, V) + \mathbb{E}_{q_\phi} \log p(U, V) + \sum_{n=1}^N \mathbb{E}_{q_\phi} \log p(r_n | U, V)
+\end{align}
+%
+The nice part about writing the lower bound as a set of expectations is that if we can sample from $q_\phi(\theta)$, then we can estimate the bound (and its gradient) by simple Monte Carlo.
+That is, we can sample from $q_\phi(\theta)$, and estimate the bound or its gradients by evaluating at the samples and averaging them.
+
+This is a very general formula that works for many different priors, likelihoods and variational approximations.
+In this case, we'll keep things simple and choose $q(U,V)$ to be a Gaussian factorized over every single entry of each matrix (the same form as the prior).
+Thus our variational parameters will consist of a mean and variance for each entry in $U$ and $V$.
+% $\phi^{(\mu U)}_{ik}$, $\phi^{(\sigma^2 U)}_{ik}$, $\phi^{(\mu V)}_{jk}$, and $\phi^{(\sigma^2 V)}_{jk}$.
+
+To make our method scalable, we have to update the variational parameters without passing over the whole dataset each time.
+We can do this by taking Monte Carlo estimates over ratings.
+However, we to need scale our estimates so that their expectations are the same as if we did the sum exactly. + + +\begin{enumerate} + +\item +Write an unbiased Monte Carlo estimate of the lower bound, depending only on one rating at a time. +Write the entropy and prior terms exactly, without using Monte Carlo. Hint: You can combine them into a KL divergence between two Gaussians. + +\item Write the gradient of your estimate of the lower bound w.r.t. the variational parameters. + +Hint: To capture all the ways in which the gradient depends on the parameters, if your gradient has an expectation of the form $\mathbb{E}_{X \sim \mathcal{N}(\mu, \sigma)}[f(X)]$, re-write it in terms of $\mathbb{E}_{Z \sim \mathcal{N}(0, 1)}[f(Z \sigma + \mu)]$. + +\item For $K = 2$, optimize the variational parameters for 10 epochs over the first 100,000 datapoints. Use whichever optimization method you prefer. + +Plot the training and test-set log-likelihood as a function of the number of epochs, as well as the marginal likelihood lower bound. +That is to say: at the end of each epoch, evaluate the log of the average predictive probability of all ratings in the training and test sets using 100 samples from q(U,V). +The lower bound is the sum of entropy, prior and likelihood terms, while the training-set and test-set likelihoods only use the likelihood term. + +\item Fit your variational model for $K = 1$ to $K = 10$, and plot the training-set log-likelihood, test-set log-likelihood, and lower bound for each value of $K$. +How do the shapes of these curves differ? + +\item What are the pros and cons of Gibbs sampling versus stochastic variational inference? + +\end{enumerate} +\end{problem} + +\appendix +\section{\textsf{2.py}} +\input{2.tex} +\end{document} diff --git a/hw4/plot.py b/hw4/plot.py new file mode 100644 index 0000000..0ed704a --- /dev/null +++ b/hw4/plot.py @@ -0,0 +1,45 @@ +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sbn + +sbn.set_style("white") + + +def three(): + with open("gibbs_2.txt") as fh: + values = [map(float, line.strip().split()) for line in fh] + a, b, c = zip(*values) + plt.figure(figsize=(8, 6)) + plt.plot(a, -np.array(b), label="train") + plt.plot(a, -np.array(c), label="test") + plt.legend(loc="lower right") + plt.xlabel("Epoch") + plt.ylabel("Likelihood") + plt.savefig("2.pdf") + + +def four(): + plt.figure(figsize=(8, 6)) + for i in xrange(1, 11): + with open("gibbs_" + str(i) + ".txt") as fh: + values = [map(float, line.strip().split()) for line in fh] + a, b, c = zip(*values) + plt.plot(a, np.array(b), label="K=" + str(i)) + plt.legend(loc="upper right") + plt.xlabel("Epoch") + plt.ylabel("MSE") + plt.axes().set_yscale("log") + plt.savefig("train.pdf") + + plt.figure(figsize=(8, 6)) + for i in xrange(1, 11): + with open("gibbs_" + str(i) + ".txt") as fh: + values = [map(float, line.strip().split()) for line in fh] + a, b, c = zip(*values) + plt.plot(a, np.array(c), label="K=" + str(i)) + plt.legend(loc="upper right") + plt.xlabel("Epoch") + plt.ylabel("MSE") + plt.axes().set_yscale("log") + plt.savefig("test.pdf") +four() diff --git a/hw4/test.pdf b/hw4/test.pdf Binary files differnew file mode 100644 index 0000000..aab8706 --- /dev/null +++ b/hw4/test.pdf diff --git a/hw4/train.pdf b/hw4/train.pdf Binary files differnew file mode 100644 index 0000000..1d32341 --- /dev/null +++ b/hw4/train.pdf |
