| author | Thibaut Horel <thibaut.horel@gmail.com> | 2015-10-30 17:16:32 -0400 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2015-10-30 17:16:32 -0400 |
| commit | 61f644a6a7d36dc5c15d957c48d10675ab3627ae | |
| tree | e765c3ac2b1239ea2728a625a7a19196c370adbe | |
| parent | 6a969e7afb0b796996f63b8d341f8891f187ca8e | |
| download | cs281-61f644a6a7d36dc5c15d957c48d10675ab3627ae.tar.gz | |
[hw3]
| -rw-r--r-- | hw3/2.py | 86 |
| -rw-r--r-- | hw3/3.py | 100 |
| -rw-r--r-- | hw3/build_train_test.py | 28 |
| -rw-r--r-- | hw3/harvardml.cls | 97 |
| -rw-r--r-- | hw3/main.tex | 495 |
| -rw-r--r-- | hw3/model.pdf | bin (0 -> 13981 bytes) |
| -rw-r--r-- | hw3/plot.py | 18 |
| -rw-r--r-- | hw3/rmse.pdf | bin (0 -> 11991 bytes) |
8 files changed, 824 insertions, 0 deletions
diff --git a/hw3/2.py b/hw3/2.py
new file mode 100644
index 0000000..6ccbcfd
--- /dev/null
+++ b/hw3/2.py
@@ -0,0 +1,86 @@
+from numpy.random import normal
+import numpy as np
+from pickle import load
+from random import shuffle
+
+
+def init_vectors(K, sigma=0.01):
+    U = normal(0, sigma, size=(n+1, K))
+    V = normal(0, sigma, size=(m+1, K))
+    return U, V
+
+
+def init_vectors_bis(K, sigma=0.01):
+    U = normal(0, sigma, size=(n+1, K))
+    V = normal(0, sigma, size=(m+1, K))
+    a = np.ones(n+1)
+    b = np.ones(m+1)
+    g = 1
+    return U, V, a, b, g
+
+
+def sgd_step(U, V, lamb=0.05, sigmas=1.0):
+    keys = train.keys()
+    shuffle(keys)
+    for (i, j) in keys:
+        r = train[(i, j)]
+        ui = np.copy(U[i])
+        vj = np.copy(V[j])
+        a = (lamb / sigmas) * (float(r) - np.inner(ui, vj))
+        U[i] += a * vj
+        V[j] += a * ui
+
+
+def sgd_step_bis(U, V, a, b, g, lamb=0.05, sigmas=1.0):
+    keys = train.keys()
+    shuffle(keys)
+    for (i, j) in keys:
+        r = train[(i, j)]
+        ui = np.copy(U[i])
+        vj = np.copy(V[j])
+        e = (lamb / sigmas) * (float(r) - np.inner(ui, vj) - a[i] - b[j] - g)
+        U[i] += e * vj
+        V[j] += e * ui
+        a[i] += e
+        b[j] += e
+        g += e
+    return g
+
+
+def mse(U, V, d):
+    serror = sum((float(r) - np.inner(U[i], V[j])) ** 2
+                 for (i, j), r in d.iteritems())
+    return serror / len(d)
+
+
+def mse_bis(U, V, a, b, g, d):
+    serror = sum((float(r) - np.inner(U[i], V[j]) - a[i] - b[j] - g) ** 2
+                 for (i, j), r in d.iteritems())
+    return serror / len(d)
+
+
+def part_c():
+    for K in xrange(1, 11):
+        U, V = init_vectors(K)
+        for t in xrange(10):
+            sgd_step(U, V)
+        print K, mse(U, V, train), mse(U, V, test)
+
+
+def part_d():
+    U, V, a, b, g = init_vectors_bis(2)
+    for t in xrange(10):
+        print t
+        g = sgd_step_bis(U, V, a, b, g)
+    print g
+    i = range(len(b))
+    mi = min(i, key=lambda x: b[x])
+    ma = max(i, key=lambda x: b[x])
+    print mi, b[mi]
+    print ma, b[ma]
+    print mse_bis(U, V, a, b, g, train), mse_bis(U, V, a, b, g, test)
+
+
+if __name__ == "__main__":
+    n, m, train, test = load(open("data.pickle", "rb"))
+    part_d()
diff --git a/hw3/3.py b/hw3/3.py
new file mode 100644
index 0000000..b2d3e62
--- /dev/null
+++ b/hw3/3.py
@@ -0,0 +1,100 @@
+import sys
+from itertools import groupby, chain
+import autograd.numpy as np
+from autograd import grad
+import autograd.scipy as sp
+from math import sqrt
+
+b = np.array([-1e100, -4., -2., 2., 4., 1e100])
+
+
+def get_ratings():
+    with open(sys.argv[1]) as fh:
+        for line in fh:
+            i, j, r = map(int, line.strip().split())
+            yield (j - 1, r)
+
+
+def split_train_test():
+    l = list(get_ratings())
+    l.sort(key=lambda x: x[0])
+    for k, g in groupby(l, key=lambda x: x[0]):
+        g = list(g)
+        n = int(0.95 * len(g))
+        train = g[:n]
+        test = g[n:]
+        yield train, test
+
+
+def build_train_test():
+    train, test = zip(*split_train_test())
+    train = np.array(list(chain.from_iterable(train)))
+    test = np.array(list(chain.from_iterable(test)))
+    return train, test
+
+
+def log_posterior(ws, batch):
+    w = ws[:-1]
+    sigmas = ws[-1]
+    indices = batch[:, 0]
+    ratings = batch[:, 1]
+    m = np.dot(X, w)[indices]
+    n = b[ratings]
+    o = b[ratings - 1]
+    a1 = sp.stats.norm.logcdf(1. / sigmas * (n - m))
+    a2 = sp.stats.norm.logcdf(1. / sigmas * (o - m))
+    return - np.sum(np.log(1 - np.exp(a2 - a1)) + a1) + 0.5 * np.sum(w * w)
+
+
+def log_posterior_bis(ws, batch):
+    w = ws[:-1]
+    sigmas = ws[-1]
+    indices = batch[:, 0]
+    ratings = batch[:, 1]
+    m = np.dot(X, w)[indices]
+    return - np.sum(-(m - ratings) ** 2 / (2 * sigmas)) + 0.5 * np.sum(w * w)
+
+grad_log_posterior = grad(log_posterior)
+
+
+def sgd():
+    b1 = 0.9
+    b2 = 0.999
+    b1t = b1
+    b2t = b2
+    eps = 1e-8
+    alpha = 0.001
+    w = np.ones(X.shape[1] + 1)
+    m = np.zeros(X.shape[1] + 1)
+    v = np.zeros(X.shape[1] + 1)
+    for i in xrange(100):
+        print i
+        for k in xrange(len(train) / 100):
+            batch = train[k * 100: k * 100 + 100, :]
+            g = grad_log_posterior(w, batch)
+            m = b1 * m + (1. - b1) * g
+            v = b2 * v + (1. - b2) * (g * g)
+            mt = m / (1. - b1t)
+            vt = v / (1. - b2t)
+            tmp = alpha * mt / (np.sqrt(vt) + eps)
+            w = w - tmp
+            b1t *= b1
+            b2t *= b2
+        print sqrt(compute_mse(w, test))
+
+
+def compute_mse(w, t):
+    w = w[:-1]
+    ratings = np.dot(X, w)
+    ratings = np.searchsorted(b, ratings)
+    ratings = ratings[t[:, 0]]
+    s = np.sum((ratings - t[:, 1]) ** 2)
+    return float(s) / len(t)
+
+
+if __name__ == "__main__":
+    train, test = build_train_test()
+    X = np.loadtxt(sys.argv[2])
+    w = np.ones(X.shape[1] + 1)
+    #print grad_log_posterior(w, train)
+    sgd()
diff --git a/hw3/build_train_test.py b/hw3/build_train_test.py
new file mode 100644
index 0000000..2498ce0
--- /dev/null
+++ b/hw3/build_train_test.py
@@ -0,0 +1,28 @@
+import sys
+from random import shuffle
+from pickle import dump
+
+
+def get_ratings(filename):
+    d = {}
+    with open(filename) as fh:
+        for line in fh:
+            i, j, r = map(int, line.strip().split())
+            d[(i, j)] = r
+    n = max(i for (i, j) in d)
+    m = max(j for (i, j) in d)
+    return d, n, m
+
+
+def split_train_test():
+    keys = ratings.keys()
+    s = int(1e5)
+    shuffle(keys)
+    test = {k: ratings[k] for k in keys[:s]}
+    train = {k: ratings[k] for k in keys[s:]}
+    return train, test
+
+if __name__ == "__main__":
+    ratings, n, m = get_ratings(sys.argv[1])
+    train, test = split_train_test()
+    dump((n, m, train, test), open("data.pickle", "wb"))
diff --git a/hw3/harvardml.cls b/hw3/harvardml.cls
new file mode 100644
index 0000000..e285173
--- /dev/null
+++ b/hw3/harvardml.cls
@@ -0,0 +1,97 @@
+% Ryan Adams
+% School of Engineering and Applied Sciences
+% Harvard University
+% v0.01, 31 August 2013
+% Based on HMC Math Dept. template by Eric J. Malm.
+\NeedsTeXFormat{LaTeX2e}[1995/01/01]
+\ProvidesClass{harvardml}
+[2013/08/31 v0.01 Harvard ML Assignment Class]
+
+\RequirePackage{ifpdf}
+
+\newif\ifhmlset@submit
+\DeclareOption{submit}{%
+  \hmlset@submittrue%
+}
+\DeclareOption{nosubmit}{%
+  \hmlset@submitfalse%
+}
+
+\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}}
+\ExecuteOptions{nosubmit}
+\ProcessOptions\relax
+
+\LoadClass[10pt,letterpaper]{article}
+
+\newif\ifhmlset@header
+
+\hmlset@headertrue
+
+\RequirePackage{mathpazo}
+\RequirePackage{palatino}
+\RequirePackage{amsmath}
+\RequirePackage{amssymb}
+\RequirePackage{amsthm}
+\RequirePackage{fullpage}
+\RequirePackage{mdframed}
+
+\newtheoremstyle{hmlplain}
+  {3pt}% Space above
+  {3pt}% Space below
+  {}% Body font
+  {}% Indent amount
+  {\bfseries}% Theorem head font
+  {\\*[3pt]}% Punctuation after theorem head
+  {.5em}% Space after theorem head
+  {}% Theorem head spec (can be left empty, meaning `normal')
+
+\def\titlebar{\hrule height2pt\vskip .25in\vskip-\parskip}
+
+\newcommand{\headerblock}{%
+  \noindent\begin{minipage}{0.33\textwidth}
+  \begin{flushleft}
+  \ifhmlset@submit
+  \mbox{\hmlset@name}\\
+  \mbox{\tt \hmlset@email}\\
+  \mbox{\hmlset@course}
+  \fi
+  \end{flushleft}
+  \end{minipage}
+  \noindent\begin{minipage}{0.33\textwidth}
+  \begin{center}
+  \mbox{\Large\hmlset@assignment}\protect\\
+  Due: \hmlset@duedate
+  \end{center}
+  \end{minipage}
+  \noindent\begin{minipage}{0.33\textwidth}
+  \begin{flushright}
+  \ifhmlset@submit
+  Collaborators: \hmlset@collaborators
+  \fi
+  \end{flushright}
+  \end{minipage}
+  \vspace{0.1cm}
+  \titlebar
+}
+
+\ifhmlset@header\AtBeginDocument{\headerblock}\fi
+
+\def\hmlset@name{}
+\def\hmlset@email{}
+\def\hmlset@course{}
+\def\hmlset@assignment{}
+\def\hmlset@duedate{}
+\def\hmlset@collaborators{}
+\def\hmlset@extraline{}
+
+% commands to set header block info
+\newcommand{\name}[1]{\def\hmlset@name{#1}}
+\newcommand{\email}[1]{\def\hmlset@email{#1}}
+\newcommand{\course}[1]{\def\hmlset@course{#1}}
+\newcommand{\assignment}[1]{\def\hmlset@assignment{#1}}
+\newcommand{\duedate}[1]{\def\hmlset@duedate{#1}}
+\newcommand{\collaborators}[1]{\def\hmlset@collaborators{#1}}
+\newcommand{\extraline}[1]{\def\hmlset@extraline{#1}}
+
+\theoremstyle{hmlplain}
+\newmdtheoremenv[skipabove=\topsep,skipbelow=\topsep,nobreak=true]{problem}{Problem}
diff --git a/hw3/main.tex b/hw3/main.tex
new file mode 100644
index 0000000..e4460c4
--- /dev/null
+++ b/hw3/main.tex
@@ -0,0 +1,495 @@
+% No 'submit' option for the problems by themselves.
+\documentclass[submit]{harvardml}
+% Use the 'submit' option when you submit your solutions.
+%\documentclass[submit]{harvardml}
+
+% Put in your full name and email address.
+\name{Thibaut Horel}
+\email{thorel@seas.harvard.edu}
+
+% List any people you worked with.
+\collaborators{%
+}
+
+% You don't need to change these.
+\course{CS281-F15}
+\assignment{Assignment \#3}
+\duedate{11:59pm October 28, 2015}
+
+\usepackage{url, enumitem}
+\usepackage{amsfonts}
+\usepackage{listings}
+\usepackage{graphicx}
+\usepackage{bm}
+
+% Some useful macros.
+\DeclareMathOperator*{\argmax}{argmax}
+\DeclareMathOperator*{\argmin}{argmin}
+\newcommand{\prob}{\mathbb{P}}
+\newcommand{\given}{\,|\,}
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\var}{\text{var}}
+\newcommand{\cov}{\text{cov}}
+\newcommand{\N}{\mathcal{N}}
+\newcommand{\ep}{\varepsilon}
+
+\theoremstyle{plain}
+\newtheorem{lemma}{Lemma}
+
+\begin{document}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{problem}[An orthogonal view of PCA, 20pts]
+In lecture we showed that PCA maximizes the variance explained by the top $r$ principal components for every integer $r$. In this problem you will prove an equivalent characterization of PCA as an optimal low-rank approximation of the data. One reason we might seek such a representation is if we have a very high-dimensional data set to which we'd like to apply an expensive learning procedure. In this case, a lower-dimensional approximation of our data \--- if it's sufficiently good \--- might allow us to do our learning much faster by allowing us to operate in the lower-dimensional space.
+
+Let's begin. Suppose we are given a data matrix $X \in \R^{n \times m}$ that is centered, i.e., the mean of every column is 0. Given some set of orthonormal vectors $\{ \pi_1, \ldots, \pi_r\} \subset \R^m$, let $\Pi \in \R^{m \times r}$ be the matrix whose columns are the $\pi_i$. We seek to represent observations as linear combinations of the vectors $\pi_d$. This won't be possible exactly because $r < m$, so instead we ask which set of vectors will give us the best approximation with respect to the $L^2$ norm.
+
+\begin{enumerate}[label=(\alph*)]
+\item
+Show that if $x$ is an $m$-dimensional row vector then $x \Pi \Pi^T = \sum_{d=1}^r \langle x, \pi_d \rangle \pi_d^T$, so that $\Pi \Pi^T$ projects onto the span of the vectors $\pi_d^T$.
+\item
+Argue that the rank of $X \Pi \Pi^T$ is at most $r$.
+
+\item
+Suppose that $r=1$ so that $\Pi$ contains just one column vector $\pi_1$, and consider the matrix $X \Pi\Pi^T = X \pi_1 \pi_1^T$. Prove that the reconstruction error
+\[
+\| X \pi_1\pi_1^T - X \|_2^2
+\]
+is minimized if and only if
+\[
+\frac{1}{n} \sum_{i=1}^n (x_i \cdot \pi_1)^2 = \frac{1}{n} \| X \pi_1 \|_2^2
+\]
+is maximized, where $x_i$ is a row-vector containing the $i$-th row of $X$ and $\| \cdot \|_2$ is the vector/matrix 2-norm.
+
+\item
+Argue that the previous part implies that the reconstruction error is minimized if and only if the variance of the data in the $\pi_1$ direction is maximized. That is, if $X^*$ denotes a random observation from $X$ then an optimal $\pi_1$ maximizes $\var(X^* \cdot \pi_1)$. Conclude that when $r=1$ an optimal choice for $\pi_1$ is the top principal component of $X$.
+
+\item
+What if $r > 1$? Given a vector $x \in \R^m$, use the fact that $\Pi \Pi^T$ is a projection (part a), as well as the previous part, to show similarly to part c that the reconstruction error
+\[
+\| X \Pi \Pi^T - X \|_2^2
+\]
+is minimized if and only if the sum of the variances along the directions $\pi_d$,
+\[
+\sum_{d=1}^r \var(X^* \cdot \pi_d)
+\]
+is maximized. Conclude that in general the top $r$ principal components give us an optimal rank-$r$ approximation of the data matrix $X$.
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} (a) By definition of the matrix product, $\Pi\Pi^T
+= \sum_{d=1}^r \pi_d\pi_d^T$. Hence, $x\Pi\Pi^T = \sum_{d=1}^r x\pi_d\pi_d^T$.
+Since $x$ is a row-vector and $\pi_d$ is a column vector, $x\pi_d = \langle x,
+\pi_d \rangle$, and we obtain the required form for $x\Pi\Pi^T$.
+
+Furthermore, it is easy to see that $\Pi^T\Pi$ (the Gramian matrix of the
+$\pi_d$ vectors) is equal to $I_r$ since the vectors $\pi_d$ are orthonormal.
+Hence $(\Pi\Pi^T)^2 = (\Pi\Pi^T)$, which shows that $\Pi\Pi^T$ is a projection.
+
+This form shows that $\Pi\Pi^T$ projects $x$ onto the span of the
+vectors $\pi_d^T$. The coordinate along vector $\pi_d^T$ is given by $\langle
+x, \pi_d\rangle$.
+
+(b) Since $\Pi\Pi^T$ projects onto the span $S$ of the vectors $\pi_d^T$, we know
+that its image is a subspace of $S$. $S$ has dimension exactly $r$ (the vectors
+$\pi_d$ form an orthonormal basis of it). Hence the image of $\Pi\Pi^T$ has
+dimension at most $r$, that is, the rank of $\Pi\Pi^T$ is at most $r$. Since the
+rank of a product is at most the rank of each factor, the rank of $X\Pi\Pi^T$ is
+also at most $r$.
+
+(c) For matrices $X$ and $Y$ of compatible dimensions, by definition of the
+matrix product, the $i$th row of matrix $XY$ is equal to $x_i Y$ where $x_i$ is
+the $i$th row of $X$.
+
+In particular:
+\begin{displaymath}
+    \|X\pi_1\pi_1^T - X\|_2^2 = \sum_{i=1}^n \|x_i\pi_1\pi_1^T - x_i\|_2^2
+\end{displaymath}
+Using the identity $\|a+b\|^2 = \|a\|^2 + \|b\|^2 + 2\langle a, b\rangle$, we
+get:
+\begin{displaymath}
+    \|X\pi_1\pi_1^T - X\|_2^2 = \sum_{i=1}^n\big( \|x_i\pi_1\pi_1^T\|_2^2
+    + \|x_i\|_2^2 - 2x_i\pi_1\pi_1^Tx_i^T\big)
+\end{displaymath}
+We also note that:
+\begin{displaymath}
+    \|x_i\pi_1\pi_1^T\|_2^2 = x_i\pi_1\pi_1^T\pi_1\pi_1^Tx_i^T
+    = x_i\pi_1\pi_1^Tx_i^T
+\end{displaymath}
+where the last equality uses that $\pi_1^T\pi_1 = 1$. Plugging this in the
+above equality we obtain:
+\begin{displaymath}
+    \|X\pi_1\pi_1^T - X\|_2^2 = \sum_{i=1}^n\big(\|x_i\|_2^2 - x_i\pi_1\pi_1^Tx_i^T\big)
+\end{displaymath}
+Since the first term in the sum does not depend on $\pi_1$, minimizing the
+reconstruction error is equivalent to maximizing:
+\begin{displaymath}
+    \sum_{i=1}^n x_i\pi_1\pi_1^Tx_i^T = \sum_{i=1}^n (x_i\cdot \pi_1)^2 =
+    \|X\pi_1\|^2
+\end{displaymath}
+Finally, multiplying by $\frac{1}{n}$ does not change the maximization problem.
+
+(d) Let $X^*$ be a random observation from $X$, that is a row of $X$ chosen
+uniformly at random. First:
+\begin{displaymath}
+    \E(X^*\cdot\pi_1) = \frac{1}{n}\sum_{i=1}^n x_i\cdot\pi_1
+    = \left(\frac{1}{n}\sum_{i=1}^n x_i\right)\cdot\pi_1
+    = 0\cdot\pi_1 = 0
+\end{displaymath}
+where the penultimate equality uses that the columns of $X$ sum up to zero.
+Hence:
+\begin{displaymath}
+    \var(X^*\cdot\pi_1) = \E\big((X^*\cdot\pi_1)^2\big)
+    = \frac{1}{n}\sum_{i=1}^n (x_i\cdot\pi_1)^2
+\end{displaymath}
+and we obtain exactly the quantity computed in (c). This shows that choosing
+a direction $\pi_1$ of maximum variance for $X$ is equivalent to choosing
+$\pi_1$ minimizing the reconstruction error. So for $r=1$, the optimal choice
+for $\pi_1$ (to minimize the reconstruction error) is the top principal
+component of $X$.
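+
+As a small numerical illustration (not part of the derivation above), take a single
+row $x = (3, 4)$. With $\pi_1 = (1, 0)^T$ the projection is $x\pi_1\pi_1^T = (3, 0)$,
+so the reconstruction error is $\|(3,0) - (3,4)\|_2^2 = 16 = \|x\|_2^2 - (x\cdot\pi_1)^2
+= 25 - 9$, while with $\pi_1 = (0, 1)^T$ the error is $9 = 25 - 16$. The direction with
+the larger value of $(x\cdot\pi_1)^2$ indeed gives the smaller reconstruction error.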
+
+(e) Using part (a), one can write similarly to (c):
+\begin{displaymath}
+    \|X\Pi\Pi^T - X\|_2^2 = \sum_{i=1}^n \|x_i\Pi\Pi^T - x_i\|_2^2
+    = \sum_{i=1}^n \|\sum_{d=1}^r(x_i\cdot\pi_d)\pi_d^T - x_i\|_2^2
+\end{displaymath}
+Expanding the squares and using that the $\pi_d$ vectors are orthonormal:
+\begin{displaymath}
+    \|\sum_{d=1}^r(x_i\cdot\pi_d)\pi_d^T - x_i\|_2^2
+    = \sum_{d=1}^r(x_i\cdot\pi_d)^2 + \|x_i\|_2^2
+    - 2\sum_{d=1}^r (x_i\cdot\pi_d)^2
+    = -\sum_{d=1}^r(x_i\cdot\pi_d)^2 + \|x_i\|_2^2
+\end{displaymath}
+Hence minimizing the reconstruction error is equivalent to maximizing:
+\begin{displaymath}
+    \frac{1}{n}\sum_{i=1}^n\sum_{d=1}^r(x_i\cdot\pi_d)^2
+\end{displaymath}
+
+On the other hand, similarly to (d), the sum of the variances along the
+projection directions:
+\begin{displaymath}
+    \sum_{d=1}^r\var(X^*\cdot\pi_d) = \sum_{d=1}^r\frac{1}{n}\sum_{i=1}^n(x_i\cdot\pi_d)^2
+\end{displaymath}
+is exactly the same quantity as above after swapping the sums.
+
+Hence minimizing the reconstruction error is equivalent to maximizing the sum
+of the variances. We conclude that for $r>1$, finding the best low-rank
+approximation is equivalent to using the top $r$ principal components as the
+projection basis.
+
+
+\begin{problem}[Modeling users and jokes with a latent linear model, 25pts]
+In this problem, we'll use a latent linear model to jointly model the ratings
+users assign to jokes. The data set we'll use is a modified and preprocessed
+variant of the Jester data set (version 2) from
+\url{http://eigentaste.berkeley.edu/dataset/}. The data we provide you with are user
+ratings of 150 jokes. There are over 1.7M ratings with values 1, 2, 3, 4 and 5,
+from about sixty thousand users. {\bf The data we give you is a modified version of the original Jester data set,
+please use the files we provide in canvas and not the original ones}. The texts of the jokes are also available.
+Warning: most of the jokes are bad, tasteless, or both. At best, they were
+funny to late-night TV hosts in 1999-2000. Note also that some of the jokes do
+not have any ratings and so can be ignored.
+
+\begin{enumerate}[label=(\alph*)]
+\item
+Let~${r_{i,j}\in\{1,2,3,4,5\}}$ be the rating of user $i$ on joke $j$. A latent linear model introduces a vector ${u_i\in\R^K}$ for each user and a vector~${v_j\in\R^K}$ for each joke. Then, each rating is modeled as a noisy version of the appropriate inner product. Specifically,
+\[
+r_{i,j} \sim \mathcal{N}(u_i^T v_j, \sigma^2).
+\]
+so that the log-likelihood is
+\[
+\log P(R) = \sum_{i,j} \left( -\frac{1}{2\sigma^2} \left( r_{i,j} - u_i^T v_j \right)^2 - \log \sigma - \frac{1}{2} \log(2\pi) \right)
+\]
+where the sum runs over all $i,j$ for which a rating exists.
+
+What are the gradients of this log-likelihood with respect to $u_i$ and $v_j$?
+
+\item
+Randomly choose 100K ratings to hold out as a validation set. Now set $K = 2$,
+$\sigma^2=1.0$ and initialize the components of $u_i$ and $v_j$ by sampling from
+a Gaussian distribution with zero mean and standard deviation $0.01$. After
+this, adjust the vectors $u_i$ and $v_j$ on your training set using stochastic
+gradient descent without minibatches: iterate over the training ratings
+updating $u_i$ and $v_j$ for each $r_{ij}$ that is available. Use $\lambda =
+0.05$ as the learning rate and iterate 10 times (epochs) over the training
+data. Note that the maximum likelihood estimate of $\sigma^2$ is just the mean
+squared error of your predictions on the training set. Report your MLE of
+$\sigma^2$.
+
+\item Evaluate different choices of $K$ on the validation set. Evaluate $K = 1,
+\ldots, 10$ and produce a plot that shows the root-mean-squared error on both
+the training set and the validation set for each trial and for each $K$. What
+seems like a good value for~$K$?
+
+ \item We might imagine that some jokes are just better or worse than others.
+We might also imagine that some users tend to have higher or lower means in
+their ratings. In this case, we can introduce biases into the model so that
+$r_{ij} \approx u_i^\text{T} v_j + a_i + b_j + g$, where $a_i$, $b_j$ and $g$ are user,
+joke and global biases, respectively. Change the model to incorporate these
+biases and fit it again with $K=2$, learning these new biases as well. Write
+down the likelihood that you are optimizing. One side-effect is that you should
+be able to rank the jokes from best to worst. What are the best and worst jokes
+and their respective biases? What is the value of the global bias?
+
+ \item Sometimes we have users or jokes that only have a few ratings. We don't
+want to overfit with these and so we might want to put priors on them. What are
+reasonable priors for the latent features and the biases? Draw a directed
+graphical model that shows all of these variables and their relationships.
+Note that you are not required to code this new model up, just discuss
+reasonable priors and write the graphical model.
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} The code for this problem can be found in file \textsf{2.py}.
+
+(a) The gradient with respect to $u_i$ is:
+\begin{displaymath}
+    \frac{1}{\sigma^2}\sum_{j}(r_{i,j} - u_i^Tv_j)v_j
+\end{displaymath}
+Similarly, the gradient with respect to $v_j$ is:
+\begin{displaymath}
+    \frac{1}{\sigma^2}\sum_{i}(r_{i,j} - u_i^Tv_j)u_i
+\end{displaymath}
+
+(b) The obtained MLE for $\sigma^2$ is 1.1064889001.
+
+(c) The plot of the RMSE for different values of $K$ can be found below:
+\begin{center}
+\includegraphics[width=0.8\textwidth]{rmse.pdf}
+\end{center}
+It seems that a good value for $K$ is 2 or 3. For these values, the RMSE on the
+validation set is minimal. For larger values of $K$, the RMSE decreases on the training
+set but becomes worse on the validation set: we are overfitting.
+
+(d) The new log-likelihood is:
+\begin{displaymath}
+\log P(R) = \sum_{i,j}
+\left( -\frac{1}{2\sigma^2} \left( r_{i,j} - u_i^T v_j - a_i - b_j - g\right)^2
+- \log \sigma - \frac{1}{2} \log(2\pi) \right)
+\end{displaymath}
+
+After fitting this new model, we obtain an MSE of 0.932 and 1.111 on the
+training and validation sets respectively. The global bias is 1.4429.
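+
+Concretely, for each observed rating $r_{i,j}$, the stochastic update implemented in
+\textsf{2.py} (function \textsf{sgd\_step\_bis}) first computes the scaled residual
+\begin{displaymath}
+    \ep = \frac{\lambda}{\sigma^2}\left(r_{i,j} - u_i^Tv_j - a_i - b_j - g\right)
+\end{displaymath}
+and then updates $u_i \leftarrow u_i + \ep\, v_j$, $v_j \leftarrow v_j + \ep\, u_i$,
+$a_i \leftarrow a_i + \ep$, $b_j \leftarrow b_j + \ep$ and $g \leftarrow g + \ep$.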
+
+By ordering jokes by their bias, we obtain the best joke (72) with a bias of 1.76:
+\begin{center}
+    \emph{on the first day of college the dean addressed the students pointing
+    out some of the rules the female dormitory will be out of bounds for
+    all male students and the male dormitory to the female students anybody
+    caught breaking this rule will be finded the first time he continued
+    anybody caught breaking this rule the second time will be fined being
+    caught a third time will cost you a fine of are there any questions at
+    this point a male student in the crowd inquired how much for a season
+    pass}
+\end{center}
+
+and the worst joke (7) with a bias of -0.18:
+\begin{center}
+    \emph{how many feminists does it take to screw in a light bulb thats not
+    funny}
+\end{center}
+
+(e) A Gaussian prior of mean 0 seems reasonable for the coordinates of the
+latent vectors for both the users and the jokes. This way we will avoid having
+too large parameters. Another reasonable prior would be a Laplace prior to
+enforce sparse latent vectors.
+
+For the user bias and the joke bias, as well as the global bias, we can use
+a uniform prior over $[1, 5]$.
+
+The graphical model looks like this in nested plate notation:
+\begin{center}
+\includegraphics[width=0.5\textwidth]{model.pdf}
+\end{center}
+
+
+\section*{Ordinal Regression}
+
+We now address the problem of predicting joke ratings given the text of the
+joke. The previous models assumed that the ratings were continuous real
+numbers, while they are actually integers from 1 to 5. To take this into
+account, we will use an ordinal regression model.
+Let the rating values
+be $r = 1,\ldots,R$. In the ordinal regression model the real line is
+partitioned into $R$ contiguous intervals with boundaries
+\begin{align}
+-\infty = b_1 < b_2 < \ldots < b_{R+1} = +\infty\,,
+\end{align}
+such that the interval $[b_r,b_{r+1})$ corresponds to the $r$-th rating value.
+We will assume that $b_2 = -4$, $b_3 = -2$, $b_4 = 2$ and $b_5 = 4$.
+Instead of directly modeling the ratings, we will be modeling them in terms of a
+hidden variable $f$. We have that the rating $r$ is observed
+if $f$ falls in the interval for that rating. The conditional probability of
+$r$ given $f$ is then
+\begin{align}
+p(r|f) =
+\left\{
+  \begin{array}{ll}
+    1 & \mbox{if }\quad b_r \leq f < b_{r+1} \\
+    0 & \mbox{otherwise}
+  \end{array}
+\right.
+= \Theta(f - b_r) - \Theta(f-b_{r+1})\,,
+\end{align}
+where $\Theta(x)$ is the Heaviside step function, that is, $\Theta(x)=1$ if
+$x>0$ and $\Theta(x)=0$ otherwise. Uncertainty about the exact value of $f$ can
+be modeled by adding Gaussian noise to $f$. Let $\sigma^2$ be the variance
+of this noise. Then $p(f|h) = \mathcal{N}(f|h,\sigma^2)$, where $h$ is a new latent
+variable that is the noise free version of $f$.
+
+Given some features $\mathbf{x}$ for a particular joke, we can then combine the
+previous likelihood with a linear model to make predictions for the possible
+rating values for the joke. In particular, we can assume that the noise-free
+rating value $h$ is a linear function of $\mathbf{x}$, that is
+$h(\mathbf{x})=\mathbf{w}^\text{T} \mathbf{x}$, where $\mathbf{w}$ is a vector
+of regression coefficients. We will assume that the prior for $\mathbf{w}$ is
+$p(\mathbf{w})=\mathcal{N}(\bm 0, \mathbf{I})$.
+
+The problem now is what feature representation to use for the jokes.
+We propose here that you compute binary features that indicate the presence or absence of
+the 150 most common words across all the jokes plus a bias term that is always
+equal to one. The file \texttt{jester\_items.clean.dat} in canvas has the joke texts with
+punctuation and case removed. To turn the texts into the proposed binary features you can use the
+Python script \texttt{generate\_binary\_features.py}, which is also available on canvas.
+
+\subsection*{Stochastic optimization with Adam}
+
+Stochastic optimization of cost functions for models with many parameters can
+be difficult because each parameter may require a different learning
+rate. Adam
+(\url{http://arxiv.org/pdf/1412.6980v8.pdf})
+is a method that can be used to avoid this problem. Adam computes individual
+adaptive learning rates for different parameters from estimates of first and
+second moments of the gradients. Algorithm 1 shows the pseudo-code (extracted from the above
+reference). You are encouraged to look at the pdf
+for more details.
+
+\subsection*{Sparse matrices}
+
+A collection of ratings given by a set of users on a set of items can be
+efficiently manipulated by storing them in a sparse matrix format. The non-zero
+entries in the matrix would contain the observed rating values, while the zero
+entries encode missing ratings. In the package \texttt{scipy.sparse} the
+following sparse matrix types are available:
+
+{\footnotesize
+\begin{verbatim}
+bsr_matrix Block Sparse Row matrix
+coo_matrix A sparse matrix in COOrdinate format.
+csc_matrix Compressed Sparse Column matrix
+csr_matrix Compressed Sparse Row matrix
+dia_matrix Sparse matrix with DIAgonal storage
+dok_matrix Dictionary Of Keys based sparse matrix.
+lil_matrix Row-based linked list sparse matrix
+\end{verbatim}
+}
+
+Each sparse matrix type presents different computational advantages for
+accessing and manipulating the data depending on whether the operations are
+performed by rows or by columns or in an arbitrary way. If you encode the
+ratings in a matrix with users as rows and jokes as columns, the
+\texttt{csc\_matrix} type can be useful to quickly identify the ratings that
+are available for any particular joke. The \texttt{lil\_matrix} type is useful
+if we want to incrementally construct a sparse matrix by adding new blocks of
+non-zero entries.
+
+\begin{problem}[Ordinal linear regression, 25pts] You are required to
+\begin{enumerate}
+\vspace{-0.1cm}
+\item Explain the form of $p(r|h)$ in the ordinal regression model.
+
+\vspace{-0.1cm}
+\item Explain what the mean of the predictive distribution is in the ordinal regression model.
+
+\vspace{-0.1cm}
+\item Code a function that splits the ratings for each joke into a training set
+with 95\% of the ratings and a test set with the remaining 5\%. After this,
+each joke should have 95\% of its ratings in the training set and 5\% of them
+in the test set. For this, it can be useful to work with sparse matrices of dimension $n\times
+m$ where $n$ is the number of users and $m$ is the number of jokes and the
+non-zero entries in the matrix are the observed rating values.
+
+\vspace{-0.1cm}
+\item Code a function that computes the log posterior distribution of the ordinal
+regression model up to a normalization constant. Code a function that computes
+the gradients of the previous function with respect to $\mathbf{w}$ and
+$\sigma^2$. You are encouraged to use automatic differentiation tools such as autograd
+(\url{https://github.com/HIPS/autograd}).
+
+\vspace{-0.1cm}
+\item Code a function that finds the MAP solution for $\mathbf{w}$ and
+$\sigma^2$ in the ordinal regression model given the available training data using mini-batch stochastic
+gradient descent. Use minibatches of size 100 and Adam with its default
+parameter values for 100 epochs (iterations over the training data). Report the
+average test RMSE.
+
+\vspace{-0.1cm}
+\item Modify the previous model to have a Gaussian likelihood, that is, $p(r|h)=\mathcal{N}(r|h,\sigma^2)$.
+Report the resulting average test RMSE of the new model. Does performance
+increase or decrease? Why?
+\vspace{-0.1cm}
+\item How does the performance of the models analyzed in this problem compare to
+the performance of the model from problem 2? Which model performs best? Why?
+\end{enumerate}
+\end{problem}
+
+\paragraph{Solution.} The code for this problem can be found in file
+\textsf{3.py}.
+
+(1) Let us denote by $\Phi$ the CDF of a Gaussian variable of mean 0 and
+variance 1; then:
+\begin{displaymath}
+    p(r|h) = \Phi\left(\frac{b_{r+1} - h}{\sigma}\right) -
+    \Phi\left(\frac{b_{r} - h}{\sigma}\right)
+\end{displaymath}
+This computes the probability that $f\sim \mathcal{N}(h,\sigma^2)$ falls in the
+interval $[b_r, b_{r+1})$.
+
+(2) In the ordinal regression model, $h = h(x) = w^Tx$. So for a given weight
+vector $w$ and joke with attributes $x$, the mean of the predictive
+distribution is:
+\begin{displaymath}
+    \mu(w, x, \sigma) = \sum_{r=1}^5 r\left(
+    \Phi\left(\frac{b_{r+1} - w^Tx}{\sigma}\right) -
+    \Phi\left(\frac{b_{r} - w^Tx}{\sigma}\right)
+    \right)
+\end{displaymath}
+It is not entirely clear to me what was being asked for this question. If we
+wanted to we could also compute the marginal predictive distribution by
+integrating the above quantity against the prior $p(w)$. However, after reading
+Piazza, I think the above quantity is what was expected.
+
+(3) See function \textsf{build\_train\_test} in \textsf{3.py}.
+
+(4) See functions \textsf{log\_posterior} and \textsf{grad\_log\_posterior} in
+\textsf{3.py}.
+
+(5) See function \textsf{sgd} in \textsf{3.py}. I obtained an average test RMSE
+of 1.2871.
+
+(6) See function \textsf{log\_posterior\_bis}. This function is simpler to
+write, because the likelihood part of the log-posterior is the likelihood of
+Gaussian variables, which takes the standard form of the squared error on the
+training dataset (as in linear regression). The regularization term is the
+same as in (5). The new average test RMSE is 1.2896. This is slightly worse
+than what was obtained in (5). This can be explained by the fact that this model
+does not capture the \emph{ordinal} nature (integer ratings) of the
+variable to be predicted.
+
+(7) The performance of the models in this problem is worse than that of the model
+from problem 2.
+
+This can be explained by the fact that the model of problem 2 is richer: it assumes
+that the ratings depend on latent features of the jokes and latent features of
+the users.
+
+In contrast, the models in this problem assume that the ratings of a joke only
+depend on features of the joke. The individual fluctuations across individuals
+are simply modeled as independent random noise.
+
+An ideal model would combine ideas from the model in this problem, to capture
+the ordinal nature of ratings, and ideas from the model in problem 2, to use
+a low-rank model on both users and jokes for the mean of the Gaussian
+variables.
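+
+As a small numerical illustration of (1) and (2) (with the interval boundaries
+$(-\infty, -4, -2, 2, 4, +\infty)$ used in \textsf{3.py} and $\sigma = 1$), a joke with
+$h = w^Tx = 1$ gets $p(3|h) = \Phi(2 - 1) - \Phi(-2 - 1) \approx 0.840$ and
+$p(4|h) = \Phi(4 - 1) - \Phi(2 - 1) \approx 0.157$, with the remaining ratings sharing
+the leftover mass, so the mean of the predictive distribution is approximately $3.16$.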
+
+\end{document}
diff --git a/hw3/model.pdf b/hw3/model.pdf
new file mode 100644
index 0000000..2a458a8
--- /dev/null
+++ b/hw3/model.pdf
Binary files differ
diff --git a/hw3/plot.py b/hw3/plot.py
new file mode 100644
index 0000000..9797985
--- /dev/null
+++ b/hw3/plot.py
@@ -0,0 +1,18 @@
+import matplotlib.pyplot as plt
+import seaborn as sb
+from math import sqrt
+
+sb.set_style("white")
+
+values = [map(float, line.strip().split()) for line in open("results.txt")]
+x, y, z = zip(*values)
+y = map(sqrt, y)
+z = map(sqrt, z)
+
+plt.figure(figsize=(9, 6))
+plt.plot(x, y, label="train")
+plt.plot(x, z, label="validation")
+plt.legend()
+plt.xlabel("K")
+plt.ylabel("RMSE")
+plt.savefig("rmse.pdf")
diff --git a/hw3/rmse.pdf b/hw3/rmse.pdf
new file mode 100644
index 0000000..d3388e3
--- /dev/null
+++ b/hw3/rmse.pdf
Binary files differ
