diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2015-11-16 12:35:05 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2015-11-16 12:35:05 -0500 |
| commit | adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1 (patch) | |
| tree | 9b0065b6215919e86fc0ea3f377ea6bf536b4bf2 /hw4/2.py | |
| parent | 61f644a6a7d36dc5c15d957c48d10675ab3627ae (diff) | |
| download | cs281-adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1.tar.gz | |
Diffstat (limited to 'hw4/2.py')
| -rw-r--r-- | hw4/2.py | 81 |
1 files changed, 81 insertions, 0 deletions
"""Gibbs sampling for a low-rank factorisation of a sparse rating matrix.

Ratings are read from a whitespace-separated text file whose lines hold
three integers ``user joke rating``.  The rating of user ``i`` for joke
``j`` is modelled by the inner product of row ``i`` of ``U`` and row ``j``
of ``V``.  The conditional distributions used by the sampler correspond to
independent zero-mean Gaussian priors with covariance ``PRIOR_VARIANCE * I``
on every row of ``U`` and ``V`` and unit-variance Gaussian noise on the
ratings.

NOTE(review): observed entries are located with ``nonzero()``, so a rating
whose value is exactly 0 is treated as missing -- confirm that the data
never contains a zero rating.
"""
import sys
from itertools import islice
import numpy as np
from scipy.sparse import coo_matrix
from math import sqrt

# Variance of the Gaussian prior on each latent coordinate; it also sets the
# scale of the random initialisation of U and V.
PRIOR_VARIANCE = 5.


def get_ratings(filename):
    """Yield every rating of ``filename`` as a list of ints.

    Blank lines are skipped.  A fresh list is yielded for every line so the
    result can be iterated and unpacked repeatedly (a bare ``map`` object
    would be exhausted after a single pass on Python 3).
    """
    with open(filename) as fh:
        for line in fh:
            fields = line.split()
            if fields:
                yield [int(field) for field in fields]


def get_train_test(filename, n_train=100000, n_test=100000):
    """Split the ratings of ``filename`` into a train and a test list.

    The first ``n_train`` ratings form the training set and the following
    ``n_test`` ratings the test set.  The file is read once.

    Returns:
        ``(n, m, train, test)`` where ``n`` and ``m`` are the largest user
        and joke indices found anywhere in the file (not only in the two
        returned splits).

    Raises:
        ValueError: if the file holds no rating, or if a line does not hold
            exactly three integers.
    """
    ratings = get_ratings(filename)
    train = list(islice(ratings, n_train))
    test = list(islice(ratings, n_test))
    n = m = None
    # ``ratings`` now only holds what follows the two splits; scanning it
    # last keeps the maxima global to the file without a second read.
    for source in (train, test, ratings):
        for i, j, _ in source:
            n = i if n is None else max(n, i)
            m = j if m is None else max(m, j)
    if n is None:
        raise ValueError("no ratings found in " + repr(filename))
    return n, m, train, test


def sparse_matrix(ratings, shape=None):
    """Build the sparse rating matrix from ``(i, j, rating)`` triples.

    Args:
        ratings: non-empty sequence of ``(row, column, value)`` triples.
        shape: optional ``(rows, columns)`` of the matrix.  When omitted the
            shape is inferred from the largest indices in ``ratings``; pass
            it explicitly when other data (e.g. a test set) may refer to
            larger indices.

    Returns:
        ``(csc, csr)``: the same matrix in column- and row-oriented formats.

    Raises:
        ValueError: if ``ratings`` is empty.
    """
    i, j, data = zip(*ratings)
    S = coo_matrix((data, (i, j)), shape=shape)
    return S.tocsc(), S.tocsr()


def get_users(Rr):
    """Return the sorted indices of the rows of ``Rr`` with a nonzero entry."""
    return np.unique(Rr.nonzero()[0]).tolist()


def get_jokes(Rc):
    """Return the sorted indices of the columns of ``Rc`` with a nonzero entry."""
    return np.unique(Rc.nonzero()[1]).tolist()


def sample_users(U, V, Rr, users):
    """Resample, in place, the rows of ``U`` listed in ``users`` given ``V``.

    For each user the row is drawn from a Gaussian with precision
    ``I / PRIOR_VARIANCE + v.T v`` where ``v`` stacks the rows of ``V`` of
    the jokes that user rated.

    Args:
        U: dense array with one row per user; modified in place.
        V: dense array with one row per joke.
        Rr: row-sliceable sparse rating matrix (users x jokes).
        users: indices of the rows of ``U`` to resample.
    """
    # The latent rank comes from the factors themselves instead of a global.
    k = V.shape[1]
    prior_precision = np.identity(k) / PRIOR_VARIANCE
    for i in users:
        r = Rr[i]
        ind = r.nonzero()[1]
        v = V[ind]
        sigma = np.linalg.inv(prior_precision + np.dot(v.T, v))
        # r is 1 x (number of jokes): r.dot(V) sums rating * joke vector.
        weighted = np.asarray(r.dot(V)).ravel()
        U[i] = np.random.multivariate_normal(np.dot(sigma, weighted), sigma)


def sample_jokes(U, V, Rc, jokes):
    """Resample, in place, the rows of ``V`` listed in ``jokes`` given ``U``.

    Mirror image of ``sample_users`` working on the columns of the rating
    matrix.

    Args:
        U: dense array with one row per user.
        V: dense array with one row per joke; modified in place.
        Rc: column-sliceable sparse rating matrix (users x jokes).
        jokes: indices of the rows of ``V`` to resample.
    """
    k = U.shape[1]
    prior_precision = np.identity(k) / PRIOR_VARIANCE
    for j in jokes:
        r = Rc[:, j]
        u = U[r.nonzero()[0]]
        sigma = np.linalg.inv(prior_precision + np.dot(u.T, u))
        # r is (number of users) x 1: r.T.dot(U) sums rating * user vector.
        weighted = np.asarray(r.T.dot(U)).ravel()
        V[j] = np.random.multivariate_normal(np.dot(sigma, weighted), sigma)


def sample(U, V, Rr, Rc, users, jokes):
    """Run one Gibbs sweep: resample the users first, then the jokes."""
    sample_users(U, V, Rr, users)
    sample_jokes(U, V, Rc, jokes)


def likelihood(ratings, U, V):
    """Return the sum of squared errors of the predictions on ``ratings``.

    Despite its name this is not a likelihood: it is the sum over the
    ``(i, j, r)`` triples of ``(r - <U[i], V[j]>) ** 2``, so lower is better.
    Returns 0 for an empty collection of ratings.
    """
    ratings = list(ratings)
    if not ratings:
        return 0.0
    rows, cols, values = (np.asarray(column) for column in zip(*ratings))
    # Row-wise inner products <U[i], V[j]> for all ratings at once.
    predictions = np.einsum("ij,ij->i", U[rows], V[cols])
    residuals = values - predictions
    return float(np.dot(residuals, residuals))


def main(filename, max_rank=10, epochs=100):
    """Run the sampler for every rank from 1 to ``max_rank``.

    For each rank ``k`` a file ``gibbs_<k>.txt`` is written in the current
    directory.  Each of its ``epochs`` lines holds, tab separated, the sweep
    number and the train and test squared errors measured before that sweep.
    """
    n, m, train, test = get_train_test(filename)
    # Size the matrix (and hence U and V) from the indices of the whole file
    # so that test ratings can never index past the end of the factors.
    Rc, Rr = sparse_matrix(train, shape=(n + 1, m + 1))
    users = get_users(Rr)  # users with at least one rating
    jokes = get_jokes(Rc)  # jokes with at least one rating
    scale = sqrt(PRIOR_VARIANCE)
    for k in range(1, max_rank + 1):
        with open("gibbs_" + str(k) + ".txt", "w") as fh:
            U = np.random.normal(0, scale, size=(Rc.shape[0], k))
            V = np.random.normal(0, scale, size=(Rc.shape[1], k))
            for e in range(epochs):
                fh.write("\t".join(map(str, [e, likelihood(train, U, V),
                                             likelihood(test, U, V)])) + "\n")
                # Flush so progress can be followed while the run is going.
                fh.flush()
                sample(U, V, Rr, Rc, users, jokes)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: " + sys.argv[0] + " RATINGS_FILE")
    main(sys.argv[1])
