| author | Thibaut Horel <thibaut.horel@gmail.com> | 2015-11-16 12:35:05 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2015-11-16 12:35:05 -0500 |
| commit | adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1 (patch) | |
| tree | 9b0065b6215919e86fc0ea3f377ea6bf536b4bf2 | |
| parent | 61f644a6a7d36dc5c15d957c48d10675ab3627ae (diff) | |
| download | cs281-master.tar.gz | |
| -rw-r--r-- | hw4/2.pdf | bin | 0 -> 13566 bytes |
| -rw-r--r-- | hw4/2.py | 81 | |
| -rw-r--r-- | hw4/2.tex | 83 | |
| -rw-r--r-- | hw4/3.py | 89 | |
| -rw-r--r-- | hw4/def.tex | 95 | |
| -rw-r--r-- | hw4/harvardml.cls | 97 | |
| -rw-r--r-- | hw4/main.tex | 262 | |
| -rw-r--r-- | hw4/plot.py | 45 | |
| -rw-r--r-- | hw4/test.pdf | bin | 0 -> 17947 bytes |
| -rw-r--r-- | hw4/train.pdf | bin | 0 -> 17764 bytes |
10 files changed, 752 insertions, 0 deletions
diff --git a/hw4/2.pdf b/hw4/2.pdf Binary files differnew file mode 100644 index 0000000..fe7cdae --- /dev/null +++ b/hw4/2.pdf diff --git a/hw4/2.py b/hw4/2.py new file mode 100644 index 0000000..275be3e --- /dev/null +++ b/hw4/2.py @@ -0,0 +1,81 @@ +import sys +from itertools import islice +import numpy as np +from scipy.sparse import coo_matrix +from math import sqrt + + +def get_ratings(filename): + with open(filename) as fh: + for line in fh: + yield map(int, line.strip().split()) + + +def get_train_test(filename): + l = [(i, j) for (i, j, _) in get_ratings(filename)] + n = max(i for (i, _) in l) + m = max(j for (_, j) in l) + g = get_ratings(filename) + train = islice(g, 100000) + test = islice(g, 100000) + return n, m, list(train), list(test) + + +def sparse_matrix(ratings): + i, j, data = zip(*ratings) + S = coo_matrix((data, (i, j))) + return S.tocsc(), S.tocsr() + + +def get_users(Rr): + return [i for i in xrange(Rr.shape[0]) if len(Rr[i].nonzero()[1])] + + +def get_jokes(Rc): + return [j for j in xrange(Rc.shape[1]) if len(Rc[:, j].nonzero()[0])] + + +def sample_users(U, V, Rr, users): + for i in users: + r = Rr[i] + ind = r.nonzero()[1] + v = V[ind] + isigma = np.identity(k) / 5. + np.dot(v.T, v) + sigma = np.linalg.inv(isigma) + U[i] = np.random.multivariate_normal(np.dot(sigma, r.dot(V)[0]), + sigma) + + +def sample_jokes(U, V, Rc, jokes): + for j in jokes: + r = Rc[:, j] + u = U[r.nonzero()[0]] + isigma = np.identity(k) / 5. + np.dot(u.T, u) + sigma = np.linalg.inv(isigma) + V[j] = np.random.multivariate_normal(np.dot(sigma, r.T.dot(U)[0]), + sigma) + + +def sample(U, V, Rr, Rc, users, jokes): + sample_users(U, V, Rr, users) + sample_jokes(U, V, Rc, jokes) + + +def likelihood(ratings, U, V): + return sum((r - np.inner(U[i], V[j])) ** 2 for i, j, r in ratings) + + +if __name__ == "__main__": + n, m, train, test = get_train_test(sys.argv[1]) + Rc, Rr = sparse_matrix(train) + users = get_users(Rr) # users with at least one rating + jokes = get_jokes(Rc) # jokes with at least one rating + for k in xrange(1, 11): + with open("gibbs_" + str(k) + ".txt", "w") as fh: + U = np.random.normal(0, sqrt(5), size=(Rc.shape[0], k)) + V = np.random.normal(0, sqrt(5), size=(Rc.shape[1], k)) + for e in xrange(100): + fh.write("\t".join(map(str, [e, likelihood(train, U, V), + likelihood(test, U, V)])) + "\n") + fh.flush() + sample(U, V, Rr, Rc, users, jokes) diff --git a/hw4/2.tex b/hw4/2.tex new file mode 100644 index 0000000..6376208 --- /dev/null +++ b/hw4/2.tex @@ -0,0 +1,83 @@ +\begin{Verbatim}[commandchars=\\\{\}] +\PY{k+kn}{import} \PY{n+nn}{sys} +\PY{k+kn}{from} \PY{n+nn}{itertools} \PY{k+kn}{import} \PY{n}{islice} +\PY{k+kn}{import} \PY{n+nn}{numpy} \PY{k+kn}{as} \PY{n+nn}{np} +\PY{k+kn}{from} \PY{n+nn}{scipy.sparse} \PY{k+kn}{import} \PY{n}{coo\PYZus{}matrix} +\PY{k+kn}{from} \PY{n+nn}{math} \PY{k+kn}{import} \PY{n}{sqrt} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}ratings}\PY{p}{(}\PY{n}{filename}\PY{p}{)}\PY{p}{:} + \PY{k}{with} \PY{n+nb}{open}\PY{p}{(}\PY{n}{filename}\PY{p}{)} \PY{k}{as} \PY{n}{fh}\PY{p}{:} + \PY{k}{for} \PY{n}{line} \PY{o+ow}{in} \PY{n}{fh}\PY{p}{:} + \PY{k}{yield} \PY{n+nb}{map}\PY{p}{(}\PY{n+nb}{int}\PY{p}{,} \PY{n}{line}\PY{o}{.}\PY{n}{strip}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{split}\PY{p}{(}\PY{p}{)}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}train\PYZus{}test}\PY{p}{(}\PY{n}{filename}\PY{p}{)}\PY{p}{:} + \PY{n}{l} \PY{o}{=} \PY{p}{[}\PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{)} \PY{k}{for} \PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{,} \PY{n}{\PYZus{}}\PY{p}{)} 
\PY{o+ow}{in} \PY{n}{get\PYZus{}ratings}\PY{p}{(}\PY{n}{filename}\PY{p}{)}\PY{p}{]} + \PY{n}{n} \PY{o}{=} \PY{n+nb}{max}\PY{p}{(}\PY{n}{i} \PY{k}{for} \PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{\PYZus{}}\PY{p}{)} \PY{o+ow}{in} \PY{n}{l}\PY{p}{)} + \PY{n}{m} \PY{o}{=} \PY{n+nb}{max}\PY{p}{(}\PY{n}{j} \PY{k}{for} \PY{p}{(}\PY{n}{\PYZus{}}\PY{p}{,} \PY{n}{j}\PY{p}{)} \PY{o+ow}{in} \PY{n}{l}\PY{p}{)} + \PY{n}{g} \PY{o}{=} \PY{n}{get\PYZus{}ratings}\PY{p}{(}\PY{n}{filename}\PY{p}{)} + \PY{n}{train} \PY{o}{=} \PY{n}{islice}\PY{p}{(}\PY{n}{g}\PY{p}{,} \PY{l+m+mi}{100000}\PY{p}{)} + \PY{n}{test} \PY{o}{=} \PY{n}{islice}\PY{p}{(}\PY{n}{g}\PY{p}{,} \PY{l+m+mi}{100000}\PY{p}{)} + \PY{k}{return} \PY{n}{n}\PY{p}{,} \PY{n}{m}\PY{p}{,} \PY{n+nb}{list}\PY{p}{(}\PY{n}{train}\PY{p}{)}\PY{p}{,} \PY{n+nb}{list}\PY{p}{(}\PY{n}{test}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{sparse\PYZus{}matrix}\PY{p}{(}\PY{n}{ratings}\PY{p}{)}\PY{p}{:} + \PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{,} \PY{n}{data} \PY{o}{=} \PY{n+nb}{zip}\PY{p}{(}\PY{o}{*}\PY{n}{ratings}\PY{p}{)} + \PY{n}{S} \PY{o}{=} \PY{n}{coo\PYZus{}matrix}\PY{p}{(}\PY{p}{(}\PY{n}{data}\PY{p}{,} \PY{p}{(}\PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{)}\PY{p}{)}\PY{p}{)} + \PY{k}{return} \PY{n}{S}\PY{o}{.}\PY{n}{tocsc}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{S}\PY{o}{.}\PY{n}{tocsr}\PY{p}{(}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}users}\PY{p}{(}\PY{n}{Rr}\PY{p}{)}\PY{p}{:} + \PY{k}{return} \PY{p}{[}\PY{n}{i} \PY{k}{for} \PY{n}{i} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{n}{Rr}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)} \PY{k}{if} \PY{n+nb}{len}\PY{p}{(}\PY{n}{Rr}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)}\PY{p}{]} + + +\PY{k}{def} \PY{n+nf}{get\PYZus{}jokes}\PY{p}{(}\PY{n}{Rc}\PY{p}{)}\PY{p}{:} + \PY{k}{return} \PY{p}{[}\PY{n}{j} \PY{k}{for} \PY{n}{j} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{n}{Rc}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)} \PY{k}{if} \PY{n+nb}{len}\PY{p}{(}\PY{n}{Rc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{n}{j}\PY{p}{]}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{p}{]} + + +\PY{k}{def} \PY{n+nf}{sample\PYZus{}users}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{users}\PY{p}{)}\PY{p}{:} + \PY{k}{for} \PY{n}{i} \PY{o+ow}{in} \PY{n}{users}\PY{p}{:} + \PY{n}{r} \PY{o}{=} \PY{n}{Rr}\PY{p}{[}\PY{n}{i}\PY{p}{]} + \PY{n}{ind} \PY{o}{=} \PY{n}{r}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]} + \PY{n}{v} \PY{o}{=} \PY{n}{V}\PY{p}{[}\PY{n}{ind}\PY{p}{]} + \PY{n}{isigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{identity}\PY{p}{(}\PY{n}{k}\PY{p}{)} \PY{o}{/} \PY{l+m+mf}{5.} \PY{o}{+} \PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{v}\PY{o}{.}\PY{n}{T}\PY{p}{,} \PY{n}{v}\PY{p}{)} + \PY{n}{sigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linalg}\PY{o}{.}\PY{n}{inv}\PY{p}{(}\PY{n}{isigma}\PY{p}{)} + \PY{n}{U}\PY{p}{[}\PY{n}{i}\PY{p}{]} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{multivariate\PYZus{}normal}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{sigma}\PY{p}{,} \PY{n}{r}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{V}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{p}{,} + \PY{n}{sigma}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{sample\PYZus{}jokes}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{jokes}\PY{p}{)}\PY{p}{:} + \PY{k}{for} \PY{n}{j} \PY{o+ow}{in} \PY{n}{jokes}\PY{p}{:} + \PY{n}{r} \PY{o}{=} \PY{n}{Rc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{n}{j}\PY{p}{]} + 
\PY{n}{u} \PY{o}{=} \PY{n}{U}\PY{p}{[}\PY{n}{r}\PY{o}{.}\PY{n}{nonzero}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{]} + \PY{n}{isigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{identity}\PY{p}{(}\PY{n}{k}\PY{p}{)} \PY{o}{/} \PY{l+m+mf}{5.} \PY{o}{+} \PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{u}\PY{o}{.}\PY{n}{T}\PY{p}{,} \PY{n}{u}\PY{p}{)} + \PY{n}{sigma} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linalg}\PY{o}{.}\PY{n}{inv}\PY{p}{(}\PY{n}{isigma}\PY{p}{)} + \PY{n}{V}\PY{p}{[}\PY{n}{j}\PY{p}{]} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{multivariate\PYZus{}normal}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{sigma}\PY{p}{,} \PY{n}{r}\PY{o}{.}\PY{n}{T}\PY{o}{.}\PY{n}{dot}\PY{p}{(}\PY{n}{U}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{)}\PY{p}{,} + \PY{n}{sigma}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{sample}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{users}\PY{p}{,} \PY{n}{jokes}\PY{p}{)}\PY{p}{:} + \PY{n}{sample\PYZus{}users}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{users}\PY{p}{)} + \PY{n}{sample\PYZus{}jokes}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{jokes}\PY{p}{)} + + +\PY{k}{def} \PY{n+nf}{likelihood}\PY{p}{(}\PY{n}{ratings}\PY{p}{,} \PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{)}\PY{p}{:} + \PY{k}{return} \PY{n+nb}{sum}\PY{p}{(}\PY{p}{(}\PY{n}{r} \PY{o}{\PYZhy{}} \PY{n}{np}\PY{o}{.}\PY{n}{inner}\PY{p}{(}\PY{n}{U}\PY{p}{[}\PY{n}{i}\PY{p}{]}\PY{p}{,} \PY{n}{V}\PY{p}{[}\PY{n}{j}\PY{p}{]}\PY{p}{)}\PY{p}{)} \PY{o}{*}\PY{o}{*} \PY{l+m+mi}{2} \PY{k}{for} \PY{n}{i}\PY{p}{,} \PY{n}{j}\PY{p}{,} \PY{n}{r} \PY{o+ow}{in} \PY{n}{ratings}\PY{p}{)} + + +\PY{k}{if} \PY{n}{\PYZus{}\PYZus{}name\PYZus{}\PYZus{}} \PY{o}{==} \PY{l+s}{\PYZdq{}}\PY{l+s}{\PYZus{}\PYZus{}main\PYZus{}\PYZus{}}\PY{l+s}{\PYZdq{}}\PY{p}{:} + \PY{n}{n}\PY{p}{,} \PY{n}{m}\PY{p}{,} \PY{n}{train}\PY{p}{,} \PY{n}{test} \PY{o}{=} \PY{n}{get\PYZus{}train\PYZus{}test}\PY{p}{(}\PY{n}{sys}\PY{o}{.}\PY{n}{argv}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{)} + \PY{n}{Rc}\PY{p}{,} \PY{n}{Rr} \PY{o}{=} \PY{n}{sparse\PYZus{}matrix}\PY{p}{(}\PY{n}{train}\PY{p}{)} + \PY{n}{users} \PY{o}{=} \PY{n}{get\PYZus{}users}\PY{p}{(}\PY{n}{Rr}\PY{p}{)} \PY{c}{\PYZsh{} users with at least one rating} + \PY{n}{jokes} \PY{o}{=} \PY{n}{get\PYZus{}jokes}\PY{p}{(}\PY{n}{Rc}\PY{p}{)} \PY{c}{\PYZsh{} jokes with at least one rating} + \PY{k}{for} \PY{n}{k} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{11}\PY{p}{)}\PY{p}{:} + \PY{k}{with} \PY{n+nb}{open}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{gibbs\PYZus{}}\PY{l+s}{\PYZdq{}} \PY{o}{+} \PY{n+nb}{str}\PY{p}{(}\PY{n}{k}\PY{p}{)} \PY{o}{+} \PY{l+s}{\PYZdq{}}\PY{l+s}{.txt}\PY{l+s}{\PYZdq{}}\PY{p}{,} \PY{l+s}{\PYZdq{}}\PY{l+s}{w}\PY{l+s}{\PYZdq{}}\PY{p}{)} \PY{k}{as} \PY{n}{fh}\PY{p}{:} + \PY{n}{U} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{normal}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{sqrt}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{size}\PY{o}{=}\PY{p}{(}\PY{n}{Rc}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}\PY{p}{,} \PY{n}{k}\PY{p}{)}\PY{p}{)} + \PY{n}{V} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{normal}\PY{p}{(}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{sqrt}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{size}\PY{o}{=}\PY{p}{(}\PY{n}{Rc}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{,} \PY{n}{k}\PY{p}{)}\PY{p}{)} + \PY{k}{for} \PY{n}{e} \PY{o+ow}{in} \PY{n+nb}{xrange}\PY{p}{(}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{:} + 
\PY{n}{fh}\PY{o}{.}\PY{n}{write}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s+se}{\PYZbs{}t}\PY{l+s}{\PYZdq{}}\PY{o}{.}\PY{n}{join}\PY{p}{(}\PY{n+nb}{map}\PY{p}{(}\PY{n+nb}{str}\PY{p}{,} \PY{p}{[}\PY{n}{e}\PY{p}{,} \PY{n}{likelihood}\PY{p}{(}\PY{n}{train}\PY{p}{,} \PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{)}\PY{p}{,} + \PY{n}{likelihood}\PY{p}{(}\PY{n}{test}\PY{p}{,} \PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{)}\PY{p}{]}\PY{p}{)}\PY{p}{)} \PY{o}{+} \PY{l+s}{\PYZdq{}}\PY{l+s+se}{\PYZbs{}n}\PY{l+s}{\PYZdq{}}\PY{p}{)} + \PY{n}{fh}\PY{o}{.}\PY{n}{flush}\PY{p}{(}\PY{p}{)} + \PY{n}{sample}\PY{p}{(}\PY{n}{U}\PY{p}{,} \PY{n}{V}\PY{p}{,} \PY{n}{Rr}\PY{p}{,} \PY{n}{Rc}\PY{p}{,} \PY{n}{users}\PY{p}{,} \PY{n}{jokes}\PY{p}{)} +\end{Verbatim} diff --git a/hw4/3.py b/hw4/3.py new file mode 100644 index 0000000..24c40a6 --- /dev/null +++ b/hw4/3.py @@ -0,0 +1,89 @@ +import sys +from itertools import islice + +import autograd.numpy as np +from autograd import grad + + +def get_ratings(filename): + with open(filename) as fh: + for line in fh: + yield map(int, line.strip().split()) + + +def get_train_test(filename): + l = [(i, j) for (i, j, _) in get_ratings(filename)] + n = max(i for (i, _) in l) + m = max(j for (_, j) in l) + g = get_ratings(filename) + train = islice(g, 100000) + test = islice(g, 100000) + return n, m, list(train), list(test) + + +def kl(muu, muv, sigmau, sigmav): + sigma = 5. + r = (np.log(sigma / sigmav).sum() + np.log(sigma / sigmau).sum() + + np.sum(sigmau ** 2 / (2 * sigma ** 2)) + + np.sum(sigmav ** 2 / (2 * sigma ** 2)) + + np.sum(muu ** 2 / (2 * sigma ** 2)) + + np.sum(muv ** 2 / (2 * sigma ** 2)) + ) + return r + + +def likelihood(params, r): + su, mu, sv, mv = params + s = len(mu) + ns = 50 + u = su * np.random.normal(size=(ns, s)) + mu + v = sv * np.random.normal(size=(ns, s)) + mv + p = np.sum(u * v, axis=1) + m = np.sum((r - p) ** 2 / 2.) + return m / float(ns) + +gl = grad(likelihood) + + +def sgd(ratings, muu, muv, sigmau, sigmav): + lamb = 0.05 + sigma = 5. + eps = 1e-5 + + for e in xrange(10): + print l(ratings, muu, muv, sigmau, sigmav) + # gradient update on the KL part + sigmau = sigmau + lamb * 1. / sigmau + sigmav = sigmav + lamb * 1. 
/ sigmav + sigmau = sigmau - lamb * sigmau / (sigma ** 2) + sigmav = sigmav - lamb * sigmav / (sigma ** 2) + muu = muu - lamb * muu / (sigma ** 2) + muv = muv - lamb * muv / (sigma ** 2) + + for k, (i, j, r) in enumerate(ratings): + if k % 1000 == 0: + print k + su, mu, sv, mv = sigmau[i], muu[i], sigmav[j], muv[j] + gsu, gmu, gsv, gmv = gl((su, mu, sv, mv), r) + sigmau[i] = np.maximum(sigmau[i] - lamb * gsu, eps) + muu[i] = muu[i] - lamb * gmu + sigmav[j] = np.maximum(sigmav[j] - lamb * gsv, eps) + muv[j] = muv[j] - lamb * gmv + + +def l(ratings, muu, muv, sigmau, sigmav): + res = kl(muu, muv, sigmau, sigmav) + for (i, j, r) in ratings: + su, mu, sv, mv = sigmau[i], muu[i], sigmav[j], muv[j] + res += likelihood((su, mu, sv, mv), r) + return res + + +if __name__ == "__main__": + n, m, train, test = get_train_test(sys.argv[1]) + print len(train) + k = 2 + muu, sigmau = 0.01 * np.random.normal(size=(n+1, k)), 0.01 * np.ones((n+1, k)) + muv, sigmav = 0.01 * np.random.normal(size=(m+1, k)), 0.01 * np.ones((n+1, k)) + sgd(train, muu, muv, sigmau, sigmav) + print l(train, muu, muv, sigmau, sigmav) diff --git a/hw4/def.tex b/hw4/def.tex new file mode 100644 index 0000000..c2e21ce --- /dev/null +++ b/hw4/def.tex @@ -0,0 +1,95 @@ +\usepackage{fancyvrb} +\usepackage{color} +\makeatletter +\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax% + \let\PY@ul=\relax \let\PY@tc=\relax% + \let\PY@bc=\relax \let\PY@ff=\relax} +\def\PY@tok#1{\csname PY@tok@#1\endcsname} +\def\PY@toks#1+{\ifx\relax#1\empty\else% + \PY@tok{#1}\expandafter\PY@toks\fi} +\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{% + \PY@it{\PY@bf{\PY@ff{#1}}}}}}} +\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}} + +\expandafter\def\csname PY@tok@mo\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@ne\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.82,0.25,0.23}{##1}}} +\expandafter\def\csname PY@tok@cs\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@mf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@nl\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.63,0.00}{##1}}} +\expandafter\def\csname PY@tok@kn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@il\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@sr\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}} +\expandafter\def\csname PY@tok@ss\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@vi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@se\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.13}{##1}}} +\expandafter\def\csname PY@tok@vg\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@gi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.63,0.00}{##1}}} +\expandafter\def\csname PY@tok@sb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@nf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\expandafter\def\csname PY@tok@sd\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@gh\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}} +\expandafter\def\csname PY@tok@sc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} 
+\expandafter\def\csname PY@tok@kt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}} +\expandafter\def\csname PY@tok@ni\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.60,0.60,0.60}{##1}}} +\expandafter\def\csname PY@tok@vc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@o\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@gr\endcsname{\def\PY@tc##1{\textcolor[rgb]{1.00,0.00,0.00}{##1}}} +\expandafter\def\csname PY@tok@cm\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@w\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}} +\expandafter\def\csname PY@tok@cp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.74,0.48,0.00}{##1}}} +\expandafter\def\csname PY@tok@go\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}} +\expandafter\def\csname PY@tok@gd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}} +\expandafter\def\csname PY@tok@mb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@gt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.27,0.87}{##1}}} +\expandafter\def\csname PY@tok@no\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}} +\expandafter\def\csname PY@tok@mh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@c1\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@nb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@nd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}} +\expandafter\def\csname PY@tok@mi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@kd\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@s2\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@gp\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}} +\expandafter\def\csname PY@tok@si\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}} +\expandafter\def\csname PY@tok@na\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.49,0.56,0.16}{##1}}} +\expandafter\def\csname PY@tok@nc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} +\expandafter\def\csname PY@tok@kp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@k\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@c\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}} +\expandafter\def\csname PY@tok@s\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@sx\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@s1\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@gu\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}} +\expandafter\def\csname PY@tok@bp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@kr\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@nn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}} 
+\expandafter\def\csname PY@tok@err\endcsname{\def\PY@bc##1{\setlength{\fboxsep}{0pt}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}} +\expandafter\def\csname PY@tok@nv\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}} +\expandafter\def\csname PY@tok@ge\endcsname{\let\PY@it=\textit} +\expandafter\def\csname PY@tok@kc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@sh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}} +\expandafter\def\csname PY@tok@m\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}} +\expandafter\def\csname PY@tok@nt\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}} +\expandafter\def\csname PY@tok@gs\endcsname{\let\PY@bf=\textbf} +\expandafter\def\csname PY@tok@ow\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}} + +\def\PYZbs{\char`\\} +\def\PYZus{\char`\_} +\def\PYZob{\char`\{} +\def\PYZcb{\char`\}} +\def\PYZca{\char`\^} +\def\PYZam{\char`\&} +\def\PYZlt{\char`\<} +\def\PYZgt{\char`\>} +\def\PYZsh{\char`\#} +\def\PYZpc{\char`\%} +\def\PYZdl{\char`\$} +\def\PYZhy{\char`\-} +\def\PYZsq{\char`\'} +\def\PYZdq{\char`\"} +\def\PYZti{\char`\~} +% for compatibility with earlier versions +\def\PYZat{@} +\def\PYZlb{[} +\def\PYZrb{]} +\makeatother diff --git a/hw4/harvardml.cls b/hw4/harvardml.cls new file mode 100644 index 0000000..e285173 --- /dev/null +++ b/hw4/harvardml.cls @@ -0,0 +1,97 @@ +% Ryan Adams +% School of Engineering and Applied Sciences +% Harvard University +% v0.01, 31 August 2013 +% Based on HMC Math Dept. template by Eric J. Malm. +\NeedsTeXFormat{LaTeX2e}[1995/01/01] +\ProvidesClass{harvardml} +[2013/08/31 v0.01 Harvard ML Assignment Class] + +\RequirePackage{ifpdf} + +\newif\ifhmlset@submit +\DeclareOption{submit}{% + \hmlset@submittrue% +} +\DeclareOption{nosubmit}{% + \hmlset@submitfalse% +} + +\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} +\ExecuteOptions{nosubmit} +\ProcessOptions\relax + +\LoadClass[10pt,letterpaper]{article} + +\newif\ifhmlset@header + +\hmlset@headertrue + +\RequirePackage{mathpazo} +\RequirePackage{palatino} +\RequirePackage{amsmath} +\RequirePackage{amssymb} +\RequirePackage{amsthm} +\RequirePackage{fullpage} +\RequirePackage{mdframed} + +\newtheoremstyle{hmlplain} + {3pt}% Space above + {3pt}% Space below + {}% Body font + {}% Indent amount + {\bfseries}% Theorem head font + {\\*[3pt]}% Punctuation after theorem head + {.5em}% Space after theorem head + {}% Theorem head spec (can be left empty, meaning `normal') + +\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip} + +\newcommand{\headerblock}{% + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushleft} + \ifhmlset@submit + \mbox{\hmlset@name}\\ + \mbox{\tt \hmlset@email}\\ + \mbox{\hmlset@course} + \fi + \end{flushleft} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{center} + \mbox{\Large\hmlset@assignment}\protect\\ + Due: \hmlset@duedate + \end{center} + \end{minipage} + \noindent\begin{minipage}{0.33\textwidth} + \begin{flushright} + \ifhmlset@submit + Collaborators: \hmlset@collaborators + \fi + \end{flushright} + \end{minipage} + \vspace{0.1cm} + \titlebar +} + +\ifhmlset@header\AtBeginDocument{\headerblock}\fi + +\def\hmlset@name{} +\def\hmlset@email{} +\def\hmlset@course{} +\def\hmlset@assignment{} +\def\hmlset@duedate{} +\def\hmlset@collaborators{} +\def\hmlset@extraline{} + +% commands to set header block info +\newcommand{\name}[1]{\def\hmlset@name{#1}} 
+\newcommand{\email}[1]{\def\hmlset@email{#1}} +\newcommand{\course}[1]{\def\hmlset@course{#1}} +\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}} +\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}} +\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}} +\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}} + +\theoremstyle{hmlplain} +\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem} diff --git a/hw4/main.tex b/hw4/main.tex new file mode 100644 index 0000000..414f37d --- /dev/null +++ b/hw4/main.tex @@ -0,0 +1,262 @@ +% No 'submit' option for the problems by themselves. +\documentclass[submit]{harvardml} +% Use the 'submit' option when you submit your solutions. +%\documentclass[submit]{harvardml} + +\usepackage{url} + +% Put in your full name and email address. +\name{Thibaut Horel} +\email{thorel@seas.harvard.edu} + +% List any people you worked with. +\collaborators{% +} + +% You don't need to change these. +\course{CS281-F13} +\assignment{Assignment \#4} +\duedate{23:59pm November 13, 2015} + +% Useful macros +\newcommand{\bw}{\boldsymbol{w}} +\newcommand{\distNorm}{\mathcal{N}} +\newcommand{\given}{\,|\,} +\newcommand{\ident}{\mathbb{I}} +\newcommand{\btheta}{\boldsymbol{\theta}} +\newcommand{\bz}{\boldsymbol{z}} +\newcommand{\balpha}{\boldsymbol{\alpha}} +\newcommand{\bbeta}{\boldsymbol{\beta}} +\usepackage{amsmath} +\usepackage{hyperref} +\usepackage{graphicx} + +% Some useful macros. +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\var}{\text{var}} +\newcommand{\cov}{\text{cov}} +\newcommand{\N}{\mathcal{N}} +\newcommand{\ep}{\varepsilon} + +\theoremstyle{plain} +\newtheorem{lemma}{Lemma} +\input{def.tex} +\begin{document} + +\begin{problem}[10 pts] + For a target density~$p(x)$ and proposal + density~$q(x'\gets x)$, the Metropolis--Hastings transition + operator is given by + \begin{align*} + T(x'\gets x) &= q(x'\gets x)\,\min\left\{1, + \frac{\tilde p(x')\,q(x\gets x')}{\tilde p(x)\,q(x'\gets x)} + \right\} + \end{align*} + where $\tilde p(x)$ is the unnormalized target density. + + Show that this transition operator satisfies detailed balance. +\end{problem} + +\paragraph{Solution.} We need to show that for any two states $x$ and $x'$: +\begin{displaymath} + p(x)T(x'\gets x) = p(x')T(x\gets x') +\end{displaymath} +Let us expand both sides, we need to show that: +\begin{displaymath} + p(x) q(x'\gets x)\,\min\left\{1, + \frac{\tilde p(x')\,q(x\gets x')}{\tilde p(x)\,q(x'\gets x)} + \right\} + = p(x')q(x\gets x')\,\min\left\{1, + \frac{\tilde p(x)\,q(x'\gets x)}{\tilde p(x')\,q(x\gets x')} + \right\} +\end{displaymath} +We distinguish two cases: +\begin{itemize} + \item $\tilde{p}(x')q(x\gets x') \leq \tilde{p}(x)q(x'\gets x)$: in that + case the $\min$ in the left-hand side collapses to its second argument + and the left-hand side collapses to: + \begin{displaymath} + p(x')q(x\gets x') + \end{displaymath} + But in that case the $\min$ in the right-hand side collapses to $1$ + (its first argument) and the right-hand side collapses to: + \begin{displaymath} + p(x')q(x\gets x') + \end{displaymath} + and we see that the two sides are equal. + \item $\tilde{p}(x')q(x\gets x') > \tilde{p}(x)q(x'\gets x)$: this case is + treated in exactly the same way as the above case by observing that it + is equivalent after swapping the role of $x$ and $x'$. +\end{itemize} +The two cases together conclude the proof of detailed balance. 
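A minimal Python sketch of one step of the corresponding sampler may help make the transition operator concrete. The helper names (`p_tilde`, `q_sample`, `q_density`) are assumptions for illustration and do not appear in the repository:

```python
import numpy as np

def mh_step(x, p_tilde, q_sample, q_density):
    """One Metropolis-Hastings step realizing T(x' <- x).

    p_tilde(x): unnormalized target density; q_sample(x): draw x' ~ q(. <- x);
    q_density(a, b): evaluate the proposal density q(a <- b).
    """
    x_new = q_sample(x)
    # acceptance probability: min{1, p~(x') q(x <- x') / (p~(x) q(x' <- x))}
    ratio = (p_tilde(x_new) * q_density(x, x_new)) / (p_tilde(x) * q_density(x_new, x))
    if np.random.rand() < min(1.0, ratio):
        return x_new  # accept the proposal
    return x          # reject and stay at x
```

Detailed balance of this kernel with respect to $p$ is exactly what the case analysis above verifies.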
+
+\newpage
+
+\section*{Modeling users and jokes with a Bayesian latent bilinear model}
+
+The next two questions will develop Bayesian inference methods for the simplest version of the latent bilinear model you used to model joke ratings in HW3.
+
+The data set we'll use is the same as in HW3, a modified and preprocessed variant of the Jester data set.
+However, to make things easier (and to make being Bayesian more worthwhile) {\bf we'll only use the first 100,000 joke ratings} for your training set. The second 100,000 ratings will form your test set.
+
+\subsection*{The model}
+
+The model is the same as in HW3, but with Gaussian priors on the latent parameter matrices $U$ and $V$.
+
+Let~${r_{i,j}\in\{1,2,3,4,5\}}$ be the rating of user $i$ on joke $j$. A latent linear model introduces a vector ${u_i\in\R^K}$ for each user and a vector~${v_j\in\R^K}$ for each joke. Then, each rating is modeled as a noisy version of the appropriate inner product. Specifically,
+\[
+r_{i,j} \sim \mathcal{N}(u_i^T v_j, \sigma_\epsilon^2).
+\]
+Fix $\sigma_\epsilon^2$ to be 1.0, and $K = 2$.
+
+We put independent Gaussian priors on each element of $U$ and $V$:
+\[U_{i,k} \sim \mathcal{N}(0, \sigma_U^2=5)\]
+\[V_{j,k} \sim \mathcal{N}(0, \sigma_V^2=5)\]
+
+\begin{problem}[Gibbs sampling, 30pts]
+
+\begin{enumerate}
+
+\item Write down the Gibbs update equations for $U$ and $V$. That is to say, write their conditional distributions, conditioned on all the other variables as well as the training data:
+%
+$$p(U | V, R )$$
+$$p(V | U, R )$$
+
+Because the model is bi-linear, these updates should have fairly simple forms.
+
+\item Write a Gibbs sampler for $U$ and $V$ that updates $U$ and $V$ in turn.
+
+\item Run the Gibbs sampler for 100 steps (i.e. update both $U$ and $V$ 100 times).
+Plot the training and test-set log-likelihood as a function of the number of steps through your training set.
+That is, use all previous samples of $U, V$ to evaluate the predictive probability of all ratings.
+
+\item One reason to be Bayesian is that you don't have to worry about overfitting.
+Run your Gibbs sampler for $K = 1$ to $K = 10$, and plot the training and test-set log-likelihood for each value of $K$. How do the shapes of these curves differ from the curves you saw when doing maximum likelihood estimation in HW3?
+
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} 1. Using the results about systems of Gaussians from
+Murphy's chapter 4, we get:
+\begin{displaymath}
+    u_i | R, V \sim \mathcal{N}\left(\frac{\Sigma_1}{\sigma_\epsilon^2} V_i^T r_i,
+    \Sigma_1\right)
+\end{displaymath}
+where $r_i$ is the vector of ratings given by user $i$, $V_i$ is the projection
+of matrix $V$ which only keeps the jokes (rows) rated by user $i$, and $\Sigma_1$
+is given by:
+\begin{displaymath}
+    \Sigma_1^{-1} = \frac{Id}{\sigma_U^2} + \frac{V_i^T V_i}{\sigma^2_\epsilon}
+\end{displaymath}
+
+Similarly, we get that:
+\begin{displaymath}
+    v_j | R, U \sim
+    \mathcal{N}\left(\frac{\Sigma_2}{\sigma_\epsilon^2} U_j^T r_j,
+    \Sigma_2\right)
+\end{displaymath}
+where $U_j$ is the projection of matrix $U$ obtained by only keeping the rows
+(users) which have rated joke $j$, $r_j$ is the vector of ratings given to joke
+$j$, and $\Sigma_2$ is given by:
+\begin{displaymath}
+    \Sigma_2^{-1} = \frac{Id}{\sigma_V^2} + \frac{U_j^T U_j}{\sigma^2_\epsilon}
+\end{displaymath}
+
+2. Please see the code in the appendix \textsf{2.py} (a condensed sketch of the per-user update also appears below).
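For illustration only, here is a dense-NumPy sketch of the per-user conditional update derived above; the submitted 2.py implements the same update on sparse matrices with sigma_eps^2 = 1 hard-coded. The function name and signature are assumptions, not part of the repository:

```python
import numpy as np

def sample_user(r_i, V_i, sigma_u2=5.0, sigma_eps2=1.0):
    """Draw u_i | R, V ~ N(Sigma_1 V_i^T r_i / sigma_eps2, Sigma_1)."""
    # r_i: ratings of user i (length n_i); V_i: rows of V for the jokes user i rated (n_i x K)
    K = V_i.shape[1]
    prec = np.eye(K) / sigma_u2 + V_i.T.dot(V_i) / sigma_eps2  # Sigma_1^{-1}
    cov = np.linalg.inv(prec)                                  # Sigma_1
    mean = cov.dot(V_i.T.dot(r_i)) / sigma_eps2
    return np.random.multivariate_normal(mean, cov)
```

The per-joke update is symmetric, with $U_j$ and $\sigma_V^2$ in place of $V_i$ and $\sigma_U^2$.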
+I used two copies of the ratings matrix, one in CSC format and one in CSR
+format, to be able to quickly extract $r_i$, the ratings of a given user, and
+$r_j$, the ratings of a given joke, as well as the indices of these ratings,
+which are used to index the matrices $V_i$ and $U_j$ appearing in the above
+formulas.
+
+3. The plot of the log-likelihood (up to a multiplicative factor) can be found
+   below:
+\begin{center}
+    \includegraphics[width=0.8\textwidth]{2.pdf}
+\end{center}
+
+4. For clarity I decided to plot the MSE (mean squared error) in log scale
+   instead of the log-likelihood; otherwise the lines were hard to distinguish.
+   Note that the MSE is (up to a multiplicative factor) simply the negative of
+   the log-likelihood, because we have a Gaussian model.
+
+   The plot of the training MSE for different values of $K$ can be found below:
+\begin{center}
+    \includegraphics[width=0.7\textwidth]{train.pdf}
+\end{center}
+And for the test MSE:
+\begin{center}
+    \includegraphics[width=0.7\textwidth]{test.pdf}
+\end{center}
+As we can see, the MSE is smaller for the training dataset, which is to be
+expected. However, unlike in the previous assignment, the test MSE gets better
+when $K$ increases. We do not observe the overfitting phenomenon where the
+train MSE decreases with $K$ but the test MSE starts increasing again for large
+values of $K$.
+
+\begin{problem}[Stochastic Variational Inference, 30pts]
+
+Now we'll repeat the same experiments, but using stochastic variational inference instead.
+
+Recall that variational inference optimizes a lower bound on the log marginal likelihood (integrating out parameters $\theta$), like so:
+\begin{align}
+\log p(x) & = \log \int p(x, \theta) d\theta = \log \int p(x|\theta) p(\theta) d\theta \\
+& = \log \int \frac{q_\phi(\theta)}{q_\phi(\theta)} p(x|\theta) p(\theta) d\theta
+  = \log \mathbb{E}_{q_\phi} \left[ \frac{1}{q_\phi(\theta)} p(x|\theta) p(\theta) \right] \\
+& \geq \mathbb{E}_{q_\phi} \log \left[ \frac{1}{q_\phi(\theta)} p(x|\theta) p(\theta) \right]
+  = \underbrace{-\mathbb{E}_{q_\phi} \log q_\phi(\theta)}_{\textnormal{entropy}}
+  + \underbrace{\mathbb{E}_{q_\phi} \log p(\theta)}_{\textnormal{prior}}
+  + \underbrace{\mathbb{E}_{q_\phi} \log p(x|\theta)}_{\textnormal{likelihood}}
+  = \mathcal{L}(\phi)
+\end{align}
+%
+In this case, $\theta = U,V$ and $x = R$:
+%
+\begin{align}
+\mathcal{L}(\phi) = -\mathbb{E}_{q_\phi} \log q_\phi(U, V) + \mathbb{E}_{q_\phi} \log p(U, V) + \sum_{n=1}^N \mathbb{E}_{q_\phi} \log p(r_n | U, V)
+\end{align}
+%
+The nice part about writing the lower bound as a set of expectations is that if we can sample from $q_\phi(\theta)$, then we can estimate the bound (and its gradient) by simple Monte Carlo.
+That is, we can sample from $q_\phi(\theta)$, and estimate the bound or its gradients by evaluating at the samples and averaging them.
+
+This is a very general formula that works for many different priors, likelihoods and variational approximations.
+In this case, we'll keep things simple and choose $q(U,V)$ to be a Gaussian factorized over every single entry of each matrix (the same form as the prior).
+Thus our variational parameters will consist of a mean and variance for each entry in $U$ and $V$.
+% $\phi^{(\mu U)}_{ik}$, $\phi^{(\sigma^2 U)}_{ik}$, $\phi^{(\mu V)}_{jk}$, and $\phi^{(\sigma^2 V)}_{jk}$.
+
+To make our method scalable, we have to update the variational parameters without passing over the whole dataset each time.
+We can do this by taking Monte Carlo estimates over ratings.
+However, we to need scale our estimates so that their expectations are the same as if we did the sum exactly. + + +\begin{enumerate} + +\item +Write an unbiased Monte Carlo estimate of the lower bound, depending only on one rating at a time. +Write the entropy and prior terms exactly, without using Monte Carlo. Hint: You can combine them into a KL divergence between two Gaussians. + +\item Write the gradient of your estimate of the lower bound w.r.t. the variational parameters. + +Hint: To capture all the ways in which the gradient depends on the parameters, if your gradient has an expectation of the form $\mathbb{E}_{X \sim \mathcal{N}(\mu, \sigma)}[f(X)]$, re-write it in terms of $\mathbb{E}_{Z \sim \mathcal{N}(0, 1)}[f(Z \sigma + \mu)]$. + +\item For $K = 2$, optimize the variational parameters for 10 epochs over the first 100,000 datapoints. Use whichever optimization method you prefer. + +Plot the training and test-set log-likelihood as a function of the number of epochs, as well as the marginal likelihood lower bound. +That is to say: at the end of each epoch, evaluate the log of the average predictive probability of all ratings in the training and test sets using 100 samples from q(U,V). +The lower bound is the sum of entropy, prior and likelihood terms, while the training-set and test-set likelihoods only use the likelihood term. + +\item Fit your variational model for $K = 1$ to $K = 10$, and plot the training-set log-likelihood, test-set log-likelihood, and lower bound for each value of $K$. +How do the shapes of these curves differ? + +\item What are the pros and cons of Gibbs sampling versus stochastic variational inference? + +\end{enumerate} +\end{problem} + +\appendix +\section{\textsf{2.py}} +\input{2.tex} +\end{document} diff --git a/hw4/plot.py b/hw4/plot.py new file mode 100644 index 0000000..0ed704a --- /dev/null +++ b/hw4/plot.py @@ -0,0 +1,45 @@ +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sbn + +sbn.set_style("white") + + +def three(): + with open("gibbs_2.txt") as fh: + values = [map(float, line.strip().split()) for line in fh] + a, b, c = zip(*values) + plt.figure(figsize=(8, 6)) + plt.plot(a, -np.array(b), label="train") + plt.plot(a, -np.array(c), label="test") + plt.legend(loc="lower right") + plt.xlabel("Epoch") + plt.ylabel("Likelihood") + plt.savefig("2.pdf") + + +def four(): + plt.figure(figsize=(8, 6)) + for i in xrange(1, 11): + with open("gibbs_" + str(i) + ".txt") as fh: + values = [map(float, line.strip().split()) for line in fh] + a, b, c = zip(*values) + plt.plot(a, np.array(b), label="K=" + str(i)) + plt.legend(loc="upper right") + plt.xlabel("Epoch") + plt.ylabel("MSE") + plt.axes().set_yscale("log") + plt.savefig("train.pdf") + + plt.figure(figsize=(8, 6)) + for i in xrange(1, 11): + with open("gibbs_" + str(i) + ".txt") as fh: + values = [map(float, line.strip().split()) for line in fh] + a, b, c = zip(*values) + plt.plot(a, np.array(c), label="K=" + str(i)) + plt.legend(loc="upper right") + plt.xlabel("Epoch") + plt.ylabel("MSE") + plt.axes().set_yscale("log") + plt.savefig("test.pdf") +four() diff --git a/hw4/test.pdf b/hw4/test.pdf Binary files differnew file mode 100644 index 0000000..aab8706 --- /dev/null +++ b/hw4/test.pdf diff --git a/hw4/train.pdf b/hw4/train.pdf Binary files differnew file mode 100644 index 0000000..1d32341 --- /dev/null +++ b/hw4/train.pdf |
