Diffstat (limited to 'hw3')
-rw-r--r--  hw3/2.py                   86
-rw-r--r--  hw3/3.py                  100
-rw-r--r--  hw3/build_train_test.py    28
-rw-r--r--  hw3/harvardml.cls          97
-rw-r--r--  hw3/main.tex              495
-rw-r--r--  hw3/model.pdf             bin 0 -> 13981 bytes
-rw-r--r--  hw3/plot.py                18
-rw-r--r--  hw3/rmse.pdf              bin 0 -> 11991 bytes
8 files changed, 824 insertions, 0 deletions
diff --git a/hw3/2.py b/hw3/2.py
new file mode 100644
index 0000000..6ccbcfd
--- /dev/null
+++ b/hw3/2.py
@@ -0,0 +1,86 @@
+from numpy.random import normal
+import numpy as np
+from pickle import load
+from random import shuffle
+
+
+def init_vectors(K, sigma=0.01):
+    U = normal(0, sigma, size=(n+1, K))
+    V = normal(0, sigma, size=(m+1, K))
+    return U, V
+
+
+def init_vectors_bis(K, sigma=0.01):
+    U = normal(0, sigma, size=(n+1, K))
+    V = normal(0, sigma, size=(m+1, K))
+    a = np.ones(n+1)
+    b = np.ones(m+1)
+    g = 1
+    return U, V, a, b, g
+
+
+def sgd_step(U, V, lamb=0.05, sigmas=1.0):
+    keys = train.keys()
+    shuffle(keys)
+    for (i, j) in keys:
+        r = train[(i, j)]
+        ui = np.copy(U[i])
+        vj = np.copy(V[j])
+        a = (lamb / sigmas) * (float(r) - np.inner(ui, vj))
+        U[i] += a * vj
+        V[j] += a * ui
+
+
+def sgd_step_bis(U, V, a, b, g, lamb=0.05, sigmas=1.0):
+    keys = train.keys()
+    shuffle(keys)
+    for (i, j) in keys:
+        r = train[(i, j)]
+        ui = np.copy(U[i])
+        vj = np.copy(V[j])
+        e = (lamb / sigmas) * (float(r) - np.inner(ui, vj) - a[i] - b[j] - g)
+        U[i] += e * vj
+        V[j] += e * ui
+        a[i] += e
+        b[j] += e
+        g += e
+    return g
+
+
+def mse(U, V, d):
+    serror = sum((float(r) - np.inner(U[i], V[j])) ** 2
+                 for (i, j), r in d.iteritems())
+    return serror / len(d)
+
+
+def mse_bis(U, V, a, b, g, d):
+    serror = sum((float(r) - np.inner(U[i], V[j]) - a[i] - b[j] - g) ** 2
+                 for (i, j), r in d.iteritems())
+    return serror / len(d)
+
+
+def part_c():
+    for K in xrange(1, 11):
+        U, V = init_vectors(K)
+        for t in xrange(10):
+            sgd_step(U, V)
+        print K, mse(U, V, train), mse(U, V, test)
+
+
+def part_d():
+    U, V, a, b, g = init_vectors_bis(2)
+    for t in xrange(10):
+        print t
+        g = sgd_step_bis(U, V, a, b, g)
+    print g
+    i = range(len(b))
+    mi = min(i, key=lambda x: b[x])
+    ma = max(i, key=lambda x: b[x])
+    print mi, b[mi]
+    print ma, b[ma]
+    print mse_bis(U, V, a, b, g, train), mse_bis(U, V, a, b, g, test)
+
+
+if __name__ == "__main__":
+    n, m, train, test = load(open("data.pickle", "rb"))
+    part_d()
diff --git a/hw3/3.py b/hw3/3.py
new file mode 100644
index 0000000..b2d3e62
--- /dev/null
+++ b/hw3/3.py
@@ -0,0 +1,100 @@
+import sys
+from itertools import groupby, chain
+import autograd.numpy as np
+from autograd import grad
+import autograd.scipy as sp
+from math import sqrt
+
+b = np.array([-1e100, -4., -2., 2., 4., 1e100])
+
+
+def get_ratings():
+    with open(sys.argv[1]) as fh:
+        for line in fh:
+            i, j, r = map(int, line.strip().split())
+            yield (j - 1, r)
+
+
+def split_train_test():
+    l = list(get_ratings())
+    l.sort(key=lambda x: x[0])
+    for k, g in groupby(l, key=lambda x: x[0]):
+        g = list(g)
+        n = int(0.95 * len(g))
+        train = g[:n]
+        test = g[n:]
+        yield train, test
+
+
+def build_train_test():
+    train, test = zip(*split_train_test())
+    train = np.array(list(chain.from_iterable(train)))
+    test = np.array(list(chain.from_iterable(test)))
+    return train, test
+
+
+def log_posterior(ws, batch):
+    w = ws[:-1]
+    sigmas = ws[-1]
+    indices = batch[:, 0]
+    ratings = batch[:, 1]
+    m = np.dot(X, w)[indices]
+    n = b[ratings]
+    o = b[ratings - 1]
+    a1 = sp.stats.norm.logcdf(1. / sigmas * (n - m))
+    a2 = sp.stats.norm.logcdf(1. / sigmas * (o - m))
+    return - np.sum(np.log(1 - np.exp(a2 - a1)) + a1) + 0.5 * np.sum(w * w)
+
+
+def log_posterior_bis(ws, batch):
+    w = ws[:-1]
+    sigmas = ws[-1]
+    indices = batch[:, 0]
+    ratings = batch[:, 1]
+    m = np.dot(X, w)[indices]
+    return - np.sum(-(m - ratings) ** 2 / (2 * sigmas)) + 0.5 * np.sum(w * w)
+
+grad_log_posterior = grad(log_posterior)
+
+
+def sgd():
+    b1 = 0.9
+    b2 = 0.999
+    b1t = b1
+    b2t = b2
+    eps = 1e-8
+    alpha = 0.001
+    w = np.ones(X.shape[1] + 1)
+    m = np.zeros(X.shape[1] + 1)
+    v = np.zeros(X.shape[1] + 1)
+    for i in xrange(100):
+        print i
+        for k in xrange(len(train) / 100):
+            batch = train[k * 100: k * 100 + 100, :]
+            g = grad_log_posterior(w, batch)
+            m = b1 * m + (1. - b1) * g
+            v = b2 * v + (1. - b2) * (g * g)
+            mt = m / (1. - b1t)
+            vt = v / (1. - b2t)
+            tmp = alpha * mt / (np.sqrt(vt) + eps)
+            w = w - tmp
+            b1t *= b1
+            b2t *= b2
+        print sqrt(compute_mse(w, test))
+
+
+def compute_mse(w, t):
+    w = w[:-1]
+    ratings = np.dot(X, w)
+    ratings = np.searchsorted(b, ratings)
+    ratings = ratings[t[:, 0]]
+    s = np.sum((ratings - t[:, 1]) ** 2)
+    return float(s) / len(t)
+
+
+if __name__ == "__main__":
+    train, test = build_train_test()
+    X = np.loadtxt(sys.argv[2])
+    w = np.ones(X.shape[1] + 1)
+    #print grad_log_posterior(w, train)
+    sgd()
diff --git a/hw3/build_train_test.py b/hw3/build_train_test.py
new file mode 100644
index 0000000..2498ce0
--- /dev/null
+++ b/hw3/build_train_test.py
@@ -0,0 +1,28 @@
+import sys
+from random import shuffle
+from pickle import dump
+
+
+def get_ratings(filename):
+    d = {}
+    with open(filename) as fh:
+        for line in fh:
+            i, j, r = map(int, line.strip().split())
+            d[(i, j)] = r
+    n = max(i for (i, j) in d)
+    m = max(j for (i, j) in d)
+    return d, n, m
+
+
+def split_train_test():
+    keys = ratings.keys()
+    s = int(1e5)
+    shuffle(keys)
+    test = {k: ratings[k] for k in keys[:s]}
+    train = {k: ratings[k] for k in keys[s:]}
+    return train, test
+
+if __name__ == "__main__":
+    ratings, n, m = get_ratings(sys.argv[1])
+    train, test = split_train_test()
+    dump((n, m, train, test), open("data.pickle", "wb"))
diff --git a/hw3/harvardml.cls b/hw3/harvardml.cls
new file mode 100644
index 0000000..e285173
--- /dev/null
+++ b/hw3/harvardml.cls
@@ -0,0 +1,97 @@
+% Ryan Adams
+% School of Engineering and Applied Sciences
+% Harvard University
+% v0.01, 31 August 2013
+% Based on HMC Math Dept. template by Eric J. Malm.
+\NeedsTeXFormat{LaTeX2e}[1995/01/01]
+\ProvidesClass{harvardml}
+[2013/08/31 v0.01 Harvard ML Assignment Class]
+
+\RequirePackage{ifpdf}
+
+\newif\ifhmlset@submit
+\DeclareOption{submit}{%
+ \hmlset@submittrue%
+}
+\DeclareOption{nosubmit}{%
+ \hmlset@submitfalse%
+}
+
+\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}}
+\ExecuteOptions{nosubmit}
+\ProcessOptions\relax
+
+\LoadClass[10pt,letterpaper]{article}
+
+\newif\ifhmlset@header
+
+\hmlset@headertrue
+
+\RequirePackage{mathpazo}
+\RequirePackage{palatino}
+\RequirePackage{amsmath}
+\RequirePackage{amssymb}
+\RequirePackage{amsthm}
+\RequirePackage{fullpage}
+\RequirePackage{mdframed}
+
+\newtheoremstyle{hmlplain}
+ {3pt}% Space above
+ {3pt}% Space below
+ {}% Body font
+ {}% Indent amount
+ {\bfseries}% Theorem head font
+ {\\*[3pt]}% Punctuation after theorem head
+ {.5em}% Space after theorem head
+ {}% Theorem head spec (can be left empty, meaning `normal')
+
+\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip}
+
+\newcommand{\headerblock}{%
+ \noindent\begin{minipage}{0.33\textwidth}
+ \begin{flushleft}
+ \ifhmlset@submit
+ \mbox{\hmlset@name}\\
+ \mbox{\tt \hmlset@email}\\
+ \mbox{\hmlset@course}
+ \fi
+ \end{flushleft}
+ \end{minipage}
+ \noindent\begin{minipage}{0.33\textwidth}
+ \begin{center}
+ \mbox{\Large\hmlset@assignment}\protect\\
+ Due: \hmlset@duedate
+ \end{center}
+ \end{minipage}
+ \noindent\begin{minipage}{0.33\textwidth}
+ \begin{flushright}
+ \ifhmlset@submit
+ Collaborators: \hmlset@collaborators
+ \fi
+ \end{flushright}
+ \end{minipage}
+ \vspace{0.1cm}
+ \titlebar
+}
+
+\ifhmlset@header\AtBeginDocument{\headerblock}\fi
+
+\def\hmlset@name{}
+\def\hmlset@email{}
+\def\hmlset@course{}
+\def\hmlset@assignment{}
+\def\hmlset@duedate{}
+\def\hmlset@collaborators{}
+\def\hmlset@extraline{}
+
+% commands to set header block info
+\newcommand{\name}[1]{\def\hmlset@name{#1}}
+\newcommand{\email}[1]{\def\hmlset@email{#1}}
+\newcommand{\course}[1]{\def\hmlset@course{#1}}
+\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}}
+\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}}
+\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}}
+\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}}
+
+\theoremstyle{hmlplain}
+\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem}
diff --git a/hw3/main.tex b/hw3/main.tex
new file mode 100644
index 0000000..e4460c4
--- /dev/null
+++ b/hw3/main.tex
@@ -0,0 +1,495 @@
+% No 'submit' option for the problems by themselves.
+\documentclass[submit]{harvardml}
+% Use the 'submit' option when you submit your solutions.
+%\documentclass[submit]{harvardml}
+
+% Put in your full name and email address.
+\name{Thibaut Horel}
+\email{thorel@seas.harvard.edu}
+
+% List any people you worked with.
+\collaborators{%
+}
+
+% You don't need to change these.
+\course{CS281-F15}
+\assignment{Assignment \#3}
+\duedate{11:59pm October 28, 2015}
+
+\usepackage{url, enumitem}
+\usepackage{amsfonts}
+\usepackage{listings}
+\usepackage{graphicx}
+\usepackage{bm}
+
+% Some useful macros.
+\DeclareMathOperator*{\argmax}{argmax}
+\DeclareMathOperator*{\argmin}{argmin}
+\newcommand{\prob}{\mathbb{P}}
+\newcommand{\given}{\,|\,}
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\var}{\text{var}}
+\newcommand{\cov}{\text{cov}}
+\newcommand{\N}{\mathcal{N}}
+\newcommand{\ep}{\varepsilon}
+
+\theoremstyle{plain}
+\newtheorem{lemma}{Lemma}
+
+\begin{document}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{problem}[An orthogonal view of PCA, 20pts]
+In lecture we showed that PCA maximizes the variance explained by the top $r$ principal components for every integer $r$. In this problem you will prove an equivalent characterization of PCA as an optimal low-rank approximation of the data. One reason we might seek such a representation is if we have a very high-dimensional data set to which we'd like to apply an expensive learning procedure. In this case, a lower-dimensional approximation of our data \--- if it's sufficiently good \--- might allow us to do our learning much faster by allowing us to operate in the lower-dimensional space.
+
+Let's begin. Suppose we are given a data matrix $X \in \R^{n \times m}$ that is centered, i.e., the mean of every column is 0. Given some set of orthonormal vectors $\{ \pi_1, \ldots, \pi_r\} \subset \R^m$, let $\Pi \in \R^{m \times r}$ be the matrix whose columns are the $\pi_i$. We seek to represent observations as linear combinations of the vectors $\pi_d$. This won't be possible exactly because $r < m$, so instead we ask which set of vectors will give us the best approximation with respect to the $L^2$ norm.
+
+\begin{enumerate}[label=(\alph*)]
+\item
+Show that if $x$ is an $m$-dimensional row vector then $x \Pi \Pi^T = \sum_{d=1}^r \langle x, \pi_d \rangle \pi_d^T$, so that $\Pi \Pi^T$ projects onto the span of the vectors $\pi_d^T$.
+\item
+Argue that the rank of $X \Pi \Pi^T$ is at most $r$.
+
+\item
+Suppose that $r=1$ so that $\Pi$ contains just one column vector $\pi_1$, and consider the matrix $X \Pi\Pi^T = X \pi_1 \pi_1^T$. Prove that the reconstruction error
+\[
+\| X \pi_1\pi_1^T - X \|_2^2
+\]
+is minimized if and only if
+\[
+\frac{1}{n} \sum_{i=1}^n (x_i \cdot \pi_1)^2 = \frac{1}{n} \| X \pi_1 \|_2^2
+\]
+is maximized, where $x_i$ is a row-vector containing the $i$-th row of $X$ and $\| \cdot \|_2$ is the vector/matrix 2-norm.
+
+\item
+Argue that the previous part implies that the reconstruction error is minimized if and only if the variance of the data in the $\pi_1$ direction is maximized. That is, if $X^*$ denotes a random observation from $X$ then an optimal $\pi_1$ maximizes $\var(X^* \cdot \pi_1)$. Conclude that when $r=1$ an optimal choice for $\pi_1$ is the top principal component of $X$.
+
+\item
+What if $r > 1$? Given a vector $x \in \R^m$, use the fact that $\Pi \Pi^T$ is a projection (part a), as well as the previous part, to show similarly to part c that the reconstruction error
+\[
+\| X \Pi \Pi^T - X \|_2^2
+\]
+is minimized if and only if the sum of the variances along the directions $\pi_d$,
+\[
+\sum_{d=1}^r \var(X^* \cdot \pi_d)
+\]
+is maximized. Conclude that in general the top $r$ principal components give us an optimal rank-$r$ approximation of the data matrix $X$.
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} (a) By definition of the matrix product, $\Pi\Pi^T
+= \sum_{d=1}^r \pi_d\pi_d^T$. Hence, $x\Pi\Pi^T = \sum_{d=1}^r x\pi_d\pi_d^T$.
+Since $x$ is a row-vector and $\pi_d$ is a column vector, $x\pi_d = \langle x,
+\pi_d \rangle$, and we obtain the required form for $x\Pi\Pi^T$.
+
+Furthermore, it is easy to see that $\Pi^T\Pi$ (the Gramian matrix of the
+$\pi_d$ vectors) is equal to $I_r$ since the vectors $\pi_d$ are orthonormal.
+Hence $(\Pi\Pi^T)^2 = (\Pi\Pi^T)$, which shows that $\Pi\Pi^T$ is a projection.
+
+This form shows that $\Pi\Pi^T$ projects $x$ onto the span of the
+vectors $\pi_d^T$. The coordinate along vector $\pi_d^T$ is given by $\langle
+x, \pi_d\rangle$.
+
+(b) Since $\Pi\Pi^T$ projects onto the span $S$ of the vectors $\pi_d^T$, we know
+that its image is a subspace of $S$. $S$ has dimension exactly $r$ (the vectors
+$\pi_d$ form an orthonormal basis of it). Hence the image of $\Pi\Pi^T$ has
+dimension at most $r$, that is, the rank of $\Pi\Pi^T$ is at most $r$. Since the
+rank of a product is at most the rank of each factor, the rank of $X\Pi\Pi^T$ is
+also at most $r$.
+
+(c) For matrices $X$ and $Y$ of compatible dimensions, by definition of the
+matrix product, the $i$th row of matrix $XY$ is equal to $x_i Y$ where $x_i$ is
+the $i$th row of $X$.
+
+In particular:
+\begin{displaymath}
+ \|X\pi_1\pi_1^T - X\|_2^2 = \sum_{i=1}^n \|x_i\pi_1\pi_1^T - x_i\|_2^2
+\end{displaymath}
+Using the identity $\|a-b\|^2 = \|a\|^2 + \|b\|^2 - 2\langle a, b\rangle$, we
+get:
+\begin{displaymath}
+ \|X\pi_1\pi_1^T - X\|_2^2 = \sum_{i=1}^n\big( \|x_i\pi_1\pi_1^T\|_2^2
+ + \|x_i\|_2^2 - 2x_i\pi_1\pi_1^Tx_i^T\big)
+\end{displaymath}
+We also note that:
+\begin{displaymath}
+ \|x_i\pi_1\pi_1^T\|_2^2 = x_i\pi_1\pi_1^T\pi_1\pi_1^Tx_i^T
+ = x_i\pi_1\pi_1^Tx_i^T
+\end{displaymath}
+where the last equality uses that $\pi_1^T\pi_1 = 1$. Plugging this into the
+equality above, we obtain:
+\begin{displaymath}
+ \|X\pi_1\pi_1^T - X\|_2^2 = \sum_{i=1}^n\big(\|x_i\|_2^2 - x_i\pi_1\pi_1^Tx_i^T\big)
+\end{displaymath}
+Since the first term in the sum does not depend on $\pi_1$, minimizing the
+reconstruction error is equivalent to maximizing:
+\begin{displaymath}
+ \sum_{i=1}^n x_i\pi_1\pi_1^Tx_i^T = \sum_{i=1}^n (x_i\cdot \pi_1)^2 =
+ \|X\pi_1\|^2
+\end{displaymath}
+Finally, multiplying by $\frac{1}{n}$ does not change the maximization problem.
+
+(d) Let $X^*$ be a random observation from $X$, that is, a row of $X$ chosen
+uniformly at random. First:
+\begin{displaymath}
+ \E(X^*\cdot\pi_1) = \frac{1}{n}\sum_{i=1}^n x_i\cdot\pi_1
+ = \left(\frac{1}{n}\sum_{i=1}^n x_i\right)\cdot\pi_1
+ = 0\cdot\pi_1 = 0
+\end{displaymath}
+where the penultimate equality uses the fact that each column of $X$ sums to zero.
+Hence:
+\begin{displaymath}
+ \var(X^*\cdot\pi_1) = \E\big((X^*\cdot\pi_1)^2\big)
+ = \frac{1}{n}\sum_{i=1}^n (x_i\cdot\pi_1)^2
+\end{displaymath}
+and we obtain exactly the quantity computed in (c). This shows that choosing
+a direction $\pi_1$ of maximum variance for $X$ is equivalent to choosing
+$\pi_1$ minimizing the reconstruction error. So for $r=1$, the optimal choice
+for $\pi_1$ (to minimize the reconstruction error) is the top principal
+component of $X$.
+
+(e) Using part (a), one can write similarly to (c):
+\begin{displaymath}
+ \|X\Pi\Pi^T - X\|_2^2 = \sum_{i=1}^n \|x_i\Pi\Pi^T - x_i\|_2^2
+ = \sum_{i=1}^n \|\sum_{d=1}^r(x_i\cdot\pi_d)\pi_d^T - x_i\|_2^2
+\end{displaymath}
+Expanding the squares and using that the $\pi_d$ vectors are orthonormal:
+\begin{displaymath}
+ \|\sum_{d=1}^r(x_i\cdot\pi_d)\pi_d^T - x_i\|_2^2
+ = \sum_{d=1}^r(x_i\cdot\pi_d)^2 + \|x_i\|_2^2
+ - 2\sum_{d=1}^r (x_i\cdot\pi_d)^2
+ = -\sum_{d=1}^r(x_i\cdot\pi_d)^2 + \|x_i\|_2^2
+\end{displaymath}
+Hence minimizing the reconstruction error is equivalent to maximizing:
+\begin{displaymath}
+ \frac{1}{n}\sum_{i=1}^n\sum_{d=1}^r(x_i\cdot\pi_d)^2
+\end{displaymath}
+
+On the other hand, similarly to (d), the sum of the variances along the
+projection directions:
+\begin{displaymath}
+ \sum_{d=1}^r\var(X^*\cdot\pi_d) = \sum_{d=1}^r\frac{1}{n}\sum_{i=1}^n(x_i\cdot\pi_d)^2
+\end{displaymath}
+is exactly the same quantity as above after swapping the sums.
+
+Hence minimizing the reconstruction error is equivalent to maximizing the sum
+of the variances. We conclude that for $r>1$, finding the best low-rank
+approximation is equivalent to using the top $r$ principal components as the
+projection basis.
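+
+As a quick numerical sanity check (not part of the submitted code, written here
+only for illustration), one can compare the squared Frobenius reconstruction
+error of the top-$r$ principal directions of a centered $X$ (its top right
+singular vectors) against that of a random orthonormal basis:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.RandomState(0)
+X = rng.randn(200, 10)
+X -= X.mean(axis=0)                      # center the columns
+r = 3
+_, _, Vt = np.linalg.svd(X, full_matrices=False)
+Pi_pca = Vt[:r].T                        # top r principal directions
+Q, _ = np.linalg.qr(rng.randn(10, r))    # random orthonormal basis
+
+def rec_error(Pi):
+    # squared Frobenius norm of X Pi Pi^T - X, as in the derivation above
+    return np.linalg.norm(np.dot(np.dot(X, Pi), Pi.T) - X) ** 2
+
+assert rec_error(Pi_pca) <= rec_error(Q) + 1e-9
+\end{verbatim}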
+
+
+\begin{problem}[Modeling users and jokes with a latent linear model, 25pts]
+In this problem, we'll use a latent linear model to jointly model the ratings
+users assign to jokes. The data set we'll use is a modified and preprocessed
+variant of the Jester data set (version 2) from
+\url{http://eigentaste.berkeley.edu/dataset/}. The data we provide you with are user
+ratings of 150 jokes. There are over 1.7M ratings with values 1, 2, 3, 4 and 5,
+from about sixty thousand users. {\bf The data we give you is a modified version of the original Jester data set;
+please use the files we provide on Canvas and not the original ones}. The texts of the jokes are also available.
+Warning: most of the jokes are bad, tasteless, or both. At best, they were
+funny to late-night TV hosts in 1999-2000. Note also that some of the jokes do
+not have any ratings and so can be ignored.
+
+\begin{enumerate}[label=(\alph*)]
+\item
+Let~${r_{i,j}\in\{1,2,3,4,5\}}$ be the rating of user $i$ on joke $j$. A latent linear model introduces a vector ${u_i\in\R^K}$ for each user and a vector~${v_j\in\R^K}$ for each joke. Then, each rating is modeled as a noisy version of the appropriate inner product. Specifically,
+\[
+r_{i,j} \sim \mathcal{N}(u_i^T v_j, \sigma^2).
+\]
+so that the log-likelihood is
+\[
+\log P(R) = \sum_{i,j} \left( -\frac{1}{2\sigma^2} \left( r_{i,j} - u_i^T v_j \right)^2 - \log \sigma - \frac{1}{2} \log(2\pi) \right)
+\]
+where the sum runs over all $i,j$ for which a rating exists.
+
+What are the gradients of this log-likelihood with respect to $u_i$ and $v_j$?
+
+\item
+Randomly choose 100K ratings to hold out as a validation set. Now set $K = 2$,
+$\sigma^2=1.0$ and initialize the components of $u_i$ and $v_j$ by sampling from
+a Gaussian distribution with zero mean and standard deviation $0.01$. After
+this, adjust the vectors $u_i$ and $v_j$ on your training set using stochastic
+gradient descent without minibatches: iterate over the training ratings
+updating $u_i$ and $v_j$ for each $r_{ij}$ that is available. Use $\lambda =
+0.05$ as the learning rate and iterate 10 times (epochs) over the training
+data. Note that the maximum likelihood estimate of $\sigma^2$ is just the mean
+squared error of your predictions on the training set. Report your MLE of
+$\sigma^2$.
+
+\item Evaluate different choices of $K$ on the validation set. Evaluate $K = 1,
+\ldots, 10$ and produce a plot that shows the root-mean-squared error on both
+the training set and the validation set for each trial and for each $K$. What
+seems like a good value for~$K$?
+
+ \item We might imagine that some jokes are just better or worse than others.
+We might also imagine that some users tend to have higher or lower means in
+their ratings. In this case, we can introduce biases into the model so that
+$r_{ij} \approx u_i^\text{T} v_j + a_i + b_j + g$, where $a_i$, $b_j$ and $g$ are user,
+joke and global biases, respectively. Change the model to incorporate these
+biases and fit it again with $K=2$, learning these new biases as well. Write
+down the likelihood that you are optimizing. One side-effect is that you should
+be able to rank the jokes from best to worst. What are the best and worst jokes
+and their respective biases? What is the value of the global bias?
+
+ \item Sometimes we have users or jokes that only have a few ratings. We don't
+want to overfit with these and so we might want to put priors on them. What are
+reasonable priors for the latent features and the biases? Draw a directed
+graphical model that shows all of these variables and their relationships.
+Note that you are not required to code this new model up, just discuss
+reasonable priors and draw the graphical model.
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} The code for this problem can be found in file \textsf{2.py}.
+
+(a) The gradient with respect to $u_i$ is:
+\begin{displaymath}
+ \frac{1}{\sigma^2}\sum_{j}(r_{i,j} - u_i^Tv_j)v_j
+\end{displaymath}
+Similarly, the gradient with respect to $v_j$ is:
+\begin{displaymath}
+ \frac{1}{\sigma^2}\sum_{i}(r_{i,j} - u_i^Tv_j)u_i
+\end{displaymath}
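+For a single observed rating $r_{i,j}$, this gives the per-rating stochastic
+gradient updates used in \textsf{2.py} (with learning rate $\lambda$):
+\begin{displaymath}
+    u_i \leftarrow u_i + \frac{\lambda}{\sigma^2}\left(r_{i,j} - u_i^Tv_j\right)v_j,
+    \qquad
+    v_j \leftarrow v_j + \frac{\lambda}{\sigma^2}\left(r_{i,j} - u_i^Tv_j\right)u_i.
+\end{displaymath}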
+
+(b) The MLE obtained for $\sigma^2$ is approximately $1.106$.
+
+(c) The plot of the RMSE for different values of $K$ can be found below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{rmse.pdf}
+\end{center}
+It seems that a good value for $K$ is 2 or 3. For these values, the RMSE on the
+validation set is minimal. For larger values of $K$, the RMSE decreases on the training
+set but becomes worse on the validation set: we are overfitting.
+
+(d) The new log-likelihood is:
+\begin{displaymath}
+\log P(R) = \sum_{i,j}
+\left( -\frac{1}{2\sigma^2} \left( r_{i,j} - u_i^T v_j - a_i - b_j - g\right)^2
+- \log \sigma - \frac{1}{2} \log(2\pi) \right)
+\end{displaymath}
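+The corresponding per-rating SGD updates (as implemented in \textsf{2.py}) share
+the error term $e = \frac{\lambda}{\sigma^2}\left(r_{i,j} - u_i^Tv_j - a_i - b_j - g\right)$:
+\begin{displaymath}
+    u_i \leftarrow u_i + e\, v_j, \qquad
+    v_j \leftarrow v_j + e\, u_i, \qquad
+    a_i \leftarrow a_i + e, \qquad
+    b_j \leftarrow b_j + e, \qquad
+    g \leftarrow g + e.
+\end{displaymath}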
+
+After fitting this new model, we obtain an MSE of 0.932 and 1.111 on the
+training and validation sets respectively. The global bias is 1.4429.
+
+By ordering jokes by their bias, we obtain the best joke (72) with a bias of 1.76:
+\begin{center}
+ \emph{on the first day of college the dean addressed the students pointing
+ out some of the rules the female dormitory will be out of bounds for
+ all male students and the male dormitory to the female students anybody
+ caught breaking this rule will be finded the first time he continued
+ anybody caught breaking this rule the second time will be fined being
+ caught a third time will cost you a fine of are there any questions at
+ this point a male student in the crowd inquired how much for a season
+ pass}
+\end{center}
+
+and the worst joke (7) with a bias of -0.18:
+\begin{center}
+ \emph{how many feminists does it take to screw in a light bulb thats not
+ funny}
+\end{center}
+
+(e) A Gaussian prior with mean 0 seems reasonable for the coordinates of the
+latent vectors of both the users and the jokes. This way we avoid having
+overly large parameters. Another reasonable choice would be a Laplace prior, to
+encourage sparse latent vectors.
+
+For the user biases and the joke biases, as well as the global bias, we can use
+a uniform prior over $[1, 5]$.
+
+The graphical model looks like this in nested plate notation:
+\begin{center}
+\includegraphics[width=0.5\textwidth]{model.pdf}
+\end{center}
+
+
+\section*{Ordinal Regression}
+
+We now address the problem of predicting joke ratings given the text of the
+joke. The previous models assumed that the ratings were continuous real
+numbers, while they are actually integers from 1 to 5. To take this into
+account, we will use an ordinal regression model.
+Let the rating values
+be $r = 1,\ldots,R$. In the ordinal regression model the real line is
+partitioned into $R$ contiguous intervals with boundaries
+\begin{align}
+-\infty = b_1 < b_2 < \ldots < b_{R+1} = +\infty\,,
+\end{align}
+such that the interval $[b_r,b_{r+1})$ corresponds to the $r$-th rating value.
+We will assume that $b_2 = -4$, $b_3 = -2$, $b_4 = 2$ and $b_5 = 4$.
+Instead of directly modeling the ratings, we will be modeling them in terms of a
+hidden variable $f$. We have that the rating $r$ is observed
+if $f$ falls in the interval for that rating. The conditional probability of
+$r$ given $f$ is then
+\begin{align}
+p(r|f) =
+\left\{
+ \begin{array}{ll}
+ 1 & \mbox{if }\quad b_r \leq f < b_{r+1} \\
+ 0 & \mbox{if } \quad \mbox{otherwise}
+ \end{array}
+\right.
+= \Theta(f - b_r) - \Theta(f-b_{r+1})\,,
+\end{align}
+where $\Theta(x)$ is the Heaviside step function, that is, $\Theta(x)=1$ if
+$x>0$ and $\Theta(x)=0$ otherwise. Uncertainty about the exact value of $f$ can
+be modeled by adding Gaussian noise to $f$. Let $\sigma^2$ be the variance
+of this noise. Then $p(f|h) = \mathcal{N}(f|h,\sigma^2)$, where $h$ is a new latent
+variable that is the noise-free version of $f$.
+
+Given some features $\mathbf{x}$ for a particular joke, we can then combine the
+previous likelihood with a linear model to make predictions for the possible
+rating values for the joke. In particular, we can assume that the noise-free
+rating value $h$ is a linear function of $\mathbf{x}$, that is
+$h(\mathbf{x})=\mathbf{w}^\text{T} \mathbf{x}$, where $\mathbf{w}$ is a vector
+of regression coefficients. We will assume that the prior for $\mathbf{w}$ is
+$p(\mathbf{w})=\mathcal{N}(\bm 0, \mathbf{I})$.
+
+The problem now is what feature representation to use for the jokes. We propose
+here that you compute binary features that indicate the presence or absence of
+the 150 most common words across all the jokes plus a bias term that is always
+equal to one. The file \texttt{jester\_items.clean.dat} on Canvas has the joke texts with
+punctuation and case removed. To turn the texts into the proposed binary features, you can use the
+Python script \texttt{generate\_binary\_features.py}, which is also available on Canvas.
+
+\subsection*{Stochastic optimization with Adam}
+
+Stochastic optimization of cost functions for models with many parameters can
+be difficult because each parameter may require a different learning
+rate. Adam
+(\url{http://arxiv.org/pdf/1412.6980v8.pdf})
+is a method that can be used to avoid this problem. Adam computes individual
+adaptive learning rates for different parameters from estimates of the first and
+second moments of the gradients. Algorithm 1 shows the pseudo-code (extracted from the above
+reference). You are encouraged to look at the pdf
+for more details.
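+
+For concreteness, the following is a minimal NumPy sketch of a single Adam
+update with the default hyper-parameters ($\alpha=0.001$, $\beta_1=0.9$,
+$\beta_2=0.999$, $\epsilon=10^{-8}$); the gradient \texttt{g} and step counter
+\texttt{t} are placeholders, and this only illustrates the update rule rather
+than prescribing an implementation:
+\begin{verbatim}
+import numpy as np
+
+def adam_step(w, g, m, v, t, alpha=0.001, b1=0.9, b2=0.999, eps=1e-8):
+    # One Adam update for parameters w given gradient g (step t >= 1).
+    m = b1 * m + (1. - b1) * g           # first moment estimate
+    v = b2 * v + (1. - b2) * g ** 2      # second moment estimate
+    m_hat = m / (1. - b1 ** t)           # bias-corrected first moment
+    v_hat = v / (1. - b2 ** t)           # bias-corrected second moment
+    return w - alpha * m_hat / (np.sqrt(v_hat) + eps), m, v
+\end{verbatim}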
+
+\subsection*{Sparse matrices}
+
+A collection of ratings given by a set of users on a set of items can be
+efficiently manipulated by storing them in a sparse matrix format. The non-zero
+entries in the matrix would contain the observed rating values, while the zero
+entries encode missing ratings. The package \texttt{scipy.sparse} provides
+the following sparse matrix types:
+
+{\footnotesize
+\begin{verbatim}
+bsr_matrix Block Sparse Row matrix
+coo_matrix A sparse matrix in COOrdinate format.
+csc_matrix Compressed Sparse Column matrix
+csr_matrix Compressed Sparse Row matrix
+dia_matrix Sparse matrix with DIAgonal storage
+dok_matrix Dictionary Of Keys based sparse matrix.
+lil_matrix Row-based linked list sparse matrix
+\end{verbatim}
+}
+
+Each sparse matrix type presents different computational advantages for
+accessing and manipulating the data depending on whether the operations are
+performed by rows or by columns or in an arbitrary way. If you encode the
+ratings in a matrix with users as rows and jokes as columns, the
+\texttt{csc\_matrix} type can be useful to quickly identify the ratings that
+are available for any particular joke. The \texttt{lil\_matrix} type is useful
+if we want to incrementally construct a sparse matrix by adding new blocks of
+non-zero entries.
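+
+As an illustration (the variable names below are placeholders, not part of the
+assignment code), one could build the ratings matrix incrementally as
+a \texttt{lil\_matrix} and convert it to a \texttt{csc\_matrix} for fast
+column access:
+\begin{verbatim}
+from scipy.sparse import lil_matrix
+
+R = lil_matrix((n_users, n_jokes))   # incremental construction
+for i, j, r in rating_triples:       # (user, joke, rating) triples
+    R[i, j] = r
+R = R.tocsc()                        # fast access by column (joke)
+users = R[:, j0].nonzero()[0]        # users who rated joke j0
+values = R[:, j0].data               # their rating values
+\end{verbatim}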
+
+\begin{problem}[Ordinal linear regression, 25pts] You are required to
+\begin{enumerate}
+\vspace{-0.1cm}
+\item Explain the form of $p(r|h)$ in the ordinal regression model.
+
+\vspace{-0.1cm}
+\item Explain what the mean of the predictive distribution is in the ordinal regression model.
+
+\vspace{-0.1cm}
+\item Code a function that splits the ratings for each joke into a training set
+with 95\% of the ratings and a test set with the remaining 5\%. After this,
+each joke should have 95\% of its ratings in the training set and 5\% of them
+in the test set. For this, it can be useful to work with sparse matrices of dimension $n\times
+m$ where $n$ is the number of users and $m$ is the number of jokes and the
+non-zero entries in the matrix are the observed rating values.
+
+\vspace{-0.1cm}
+\item Code a function that computes the log posterior distribution of the ordinal
+regression model up to a normalization constant. Code a function that computes
+the gradients of the previous function with respect to $\mathbf{w}$ and
+$\sigma^2$. You are encouraged to use automatic differentiation tools such as autograd
+(\url{https://github.com/HIPS/autograd}).
+
+\vspace{-0.1cm}
+\item Code a function that finds the MAP solution for $\mathbf{w}$ and
+$\sigma^2$ in the ordinal regression model given the available training data using mini-batch stochastic
+gradient descent. Use minibatches of size 100 and Adam with its default
+parameter values for 100 epochs (iterations over the training data). Report the
+average test RMSE.
+
+\vspace{-0.1cm}
+\item Modify the previous model to have a Gaussian likelihood, that is, $p(r|h)=\mathcal{N}(r|h,\sigma^2)$.
+Report the resulting average test RMSE of the new model. Does performance
+increase or decrease? Why?
+\vspace{-0.1cm}
+\item How does the performance of the models analyzed in this problem compare to
+the performance of the model from problem 2? Which model performs best? Why?
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} The code for this problem can be found in file
+\textsf{3.py}.
+
+(1) Let $\Phi$ denote the CDF of a Gaussian variable with mean 0 and
+variance 1. Then:
+\begin{displaymath}
+    p(r|h) = \Phi\left(\frac{b_{r+1} - h}{\sigma}\right) -
+    \Phi\left(\frac{b_{r} - h}{\sigma}\right)
+\end{displaymath}
+This is the probability that $f\sim \mathcal{N}(h,\sigma^2)$ falls in the
+interval $[b_r, b_{r+1})$.
+
+(2) In the ordinal regression model, $h = h(x) = w^Tx$. So for a given weight
+vector $w$ and joke with attributes $x$, the mean of the predictive
+distribution is:
+\begin{displaymath}
+ \mu(w, x, \sigma^2) = \sum_{r=1}^5 r\left(
+    \Phi\left(\frac{b_{r+1} - w^Tx}{\sigma}\right) -
+    \Phi\left(\frac{b_{r} - w^Tx}{\sigma}\right)
+ \right)
+\end{displaymath}
+It is not entirely clear to me what was being asked in this question. If we
+wanted to, we could also compute the marginal predictive distribution by
+integrating the above quantity against the prior $p(w)$. However, after reading
+Piazza, I think the above quantity is what was expected.
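+
+For illustration only (this helper is not part of \textsf{3.py}), the mean can
+be evaluated with \texttt{scipy.stats.norm}:
+\begin{verbatim}
+import numpy as np
+from scipy.stats import norm
+
+b = np.array([-np.inf, -4., -2., 2., 4., np.inf])   # interval boundaries
+
+def predictive_mean(w, sigma, x):
+    # E[r | w, x]: mass of N(w.x, sigma^2) in each interval,
+    # weighted by the rating values 1..5.
+    h = np.dot(w, x)
+    p = norm.cdf((b[1:] - h) / sigma) - norm.cdf((b[:-1] - h) / sigma)
+    return np.dot(np.arange(1, 6), p)
+\end{verbatim}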
+
+(3) See function \textsf{build\_train\_test} in \textsf{3.py}.
+
+(4) See functions \textsf{log\_posterior} and \textsf{grad\_log\_posterior} in
+\textsf{3.py}.
+
+(5) See function \textsf{sgd} in \textsf{3.py}. I obtained an average test RMSE
+of 1.2871.
+
+(6) See function \textsf{log\_posterior\_bis}. This function is simpler to
+write, because the likelihood part of the log-posterior is the likelihood of
+Gaussian variables, which takes the standard form of the squared error on the
+training dataset (like in linear regression). The regularization term is the
+same as in (5). The new average test RMSE is 1.2896. This is slightly worse
+than what was obtained in (5). This can be explained by the fact that this model
+does not capture the \emph{ordinal} nature (integer ratings) of the
+variable to be predicted.
+
+(7) The models in this problem perform worse than the model from problem 2.
+
+This can be explained by the fact that the model of problem 2 is richer: it
+assumes that the ratings depend on latent features of the jokes and latent
+features of the users.
+
+In contrast, the models in this problem assume that the ratings of a joke only
+depend on features of the joke. The individual fluctuations across users
+are simply modeled as independent random noise.
+
+An ideal model would combine ideas from the models in this problem (to capture
+the ordinal nature of the ratings) with ideas from the model in problem 2
+(a low-rank model over both users and jokes for the mean of the Gaussian
+variables).
+
+\end{document}
diff --git a/hw3/model.pdf b/hw3/model.pdf
new file mode 100644
index 0000000..2a458a8
--- /dev/null
+++ b/hw3/model.pdf
Binary files differ
diff --git a/hw3/plot.py b/hw3/plot.py
new file mode 100644
index 0000000..9797985
--- /dev/null
+++ b/hw3/plot.py
@@ -0,0 +1,18 @@
+import matplotlib.pyplot as plt
+import seaborn as sb
+from math import sqrt
+
+sb.set_style("white")
+
+values = [map(float, line.strip().split()) for line in open("results.txt")]
+x, y, z = zip(*values)
+y = map(sqrt, y)
+z = map(sqrt, z)
+
+plt.figure(figsize=(9, 6))
+plt.plot(x, y, label="train")
+plt.plot(x, z, label="validation")
+plt.legend()
+plt.xlabel("K")
+plt.ylabel("RMSE")
+plt.savefig("rmse.pdf")
diff --git a/hw3/rmse.pdf b/hw3/rmse.pdf
new file mode 100644
index 0000000..d3388e3
--- /dev/null
+++ b/hw3/rmse.pdf
Binary files differ