import sys from itertools import islice import numpy as np from scipy.sparse import coo_matrix from math import sqrt def get_ratings(filename): with open(filename) as fh: for line in fh: yield map(int, line.strip().split()) def get_train_test(filename): l = [(i, j) for (i, j, _) in get_ratings(filename)] n = max(i for (i, _) in l) m = max(j for (_, j) in l) g = get_ratings(filename) train = islice(g, 100000) test = islice(g, 100000) return n, m, list(train), list(test) def sparse_matrix(ratings): i, j, data = zip(*ratings) S = coo_matrix((data, (i, j))) return S.tocsc(), S.tocsr() def get_users(Rr): return [i for i in xrange(Rr.shape[0]) if len(Rr[i].nonzero()[1])] def get_jokes(Rc): return [j for j in xrange(Rc.shape[1]) if len(Rc[:, j].nonzero()[0])] def sample_users(U, V, Rr, users): for i in users: r = Rr[i] ind = r.nonzero()[1] v = V[ind] isigma = np.identity(k) / 5. + np.dot(v.T, v) sigma = np.linalg.inv(isigma) U[i] = np.random.multivariate_normal(np.dot(sigma, r.dot(V)[0]), sigma) def sample_jokes(U, V, Rc, jokes): for j in jokes: r = Rc[:, j] u = U[r.nonzero()[0]] isigma = np.identity(k) / 5. + np.dot(u.T, u) sigma = np.linalg.inv(isigma) V[j] = np.random.multivariate_normal(np.dot(sigma, r.T.dot(U)[0]), sigma) def sample(U, V, Rr, Rc, users, jokes): sample_users(U, V, Rr, users) sample_jokes(U, V, Rc, jokes) def likelihood(ratings, U, V): return sum((r - np.inner(U[i], V[j])) ** 2 for i, j, r in ratings) if __name__ == "__main__": n, m, train, test = get_train_test(sys.argv[1]) Rc, Rr = sparse_matrix(train) users = get_users(Rr) # users with at least one rating jokes = get_jokes(Rc) # jokes with at least one rating for k in xrange(1, 11): with open("gibbs_" + str(k) + ".txt", "w") as fh: U = np.random.normal(0, sqrt(5), size=(Rc.shape[0], k)) V = np.random.normal(0, sqrt(5), size=(Rc.shape[1], k)) for e in xrange(100): fh.write("\t".join(map(str, [e, likelihood(train, U, V), likelihood(test, U, V)])) + "\n") fh.flush() sample(U, V, Rr, Rc, users, jokes)