summaryrefslogtreecommitdiffstats
path: root/hw4/2.py
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2015-11-16 12:35:05 -0500
committerThibaut Horel <thibaut.horel@gmail.com>2015-11-16 12:35:05 -0500
commitadc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1 (patch)
tree9b0065b6215919e86fc0ea3f377ea6bf536b4bf2 /hw4/2.py
parent61f644a6a7d36dc5c15d957c48d10675ab3627ae (diff)
downloadcs281-master.tar.gz
[HW4] Problem 1 and 2HEADmaster
Diffstat (limited to 'hw4/2.py')
-rw-r--r--hw4/2.py81
1 files changed, 81 insertions, 0 deletions
diff --git a/hw4/2.py b/hw4/2.py
new file mode 100644
index 0000000..275be3e
--- /dev/null
+++ b/hw4/2.py
@@ -0,0 +1,81 @@
+import sys
+from itertools import islice
+import numpy as np
+from scipy.sparse import coo_matrix
+from math import sqrt
+
+
+def get_ratings(filename):
+ with open(filename) as fh:
+ for line in fh:
+ yield map(int, line.strip().split())
+
+
+def get_train_test(filename):
+ l = [(i, j) for (i, j, _) in get_ratings(filename)]
+ n = max(i for (i, _) in l)
+ m = max(j for (_, j) in l)
+ g = get_ratings(filename)
+ train = islice(g, 100000)
+ test = islice(g, 100000)
+ return n, m, list(train), list(test)
+
+
+def sparse_matrix(ratings):
+ i, j, data = zip(*ratings)
+ S = coo_matrix((data, (i, j)))
+ return S.tocsc(), S.tocsr()
+
+
+def get_users(Rr):
+ return [i for i in xrange(Rr.shape[0]) if len(Rr[i].nonzero()[1])]
+
+
+def get_jokes(Rc):
+ return [j for j in xrange(Rc.shape[1]) if len(Rc[:, j].nonzero()[0])]
+
+
+def sample_users(U, V, Rr, users):
+ for i in users:
+ r = Rr[i]
+ ind = r.nonzero()[1]
+ v = V[ind]
+ isigma = np.identity(k) / 5. + np.dot(v.T, v)
+ sigma = np.linalg.inv(isigma)
+ U[i] = np.random.multivariate_normal(np.dot(sigma, r.dot(V)[0]),
+ sigma)
+
+
+def sample_jokes(U, V, Rc, jokes):
+ for j in jokes:
+ r = Rc[:, j]
+ u = U[r.nonzero()[0]]
+ isigma = np.identity(k) / 5. + np.dot(u.T, u)
+ sigma = np.linalg.inv(isigma)
+ V[j] = np.random.multivariate_normal(np.dot(sigma, r.T.dot(U)[0]),
+ sigma)
+
+
+def sample(U, V, Rr, Rc, users, jokes):
+ sample_users(U, V, Rr, users)
+ sample_jokes(U, V, Rc, jokes)
+
+
+def likelihood(ratings, U, V):
+ return sum((r - np.inner(U[i], V[j])) ** 2 for i, j, r in ratings)
+
+
+if __name__ == "__main__":
+ n, m, train, test = get_train_test(sys.argv[1])
+ Rc, Rr = sparse_matrix(train)
+ users = get_users(Rr) # users with at least one rating
+ jokes = get_jokes(Rc) # jokes with at least one rating
+ for k in xrange(1, 11):
+ with open("gibbs_" + str(k) + ".txt", "w") as fh:
+ U = np.random.normal(0, sqrt(5), size=(Rc.shape[0], k))
+ V = np.random.normal(0, sqrt(5), size=(Rc.shape[1], k))
+ for e in xrange(100):
+ fh.write("\t".join(map(str, [e, likelihood(train, U, V),
+ likelihood(test, U, V)])) + "\n")
+ fh.flush()
+ sample(U, V, Rr, Rc, users, jokes)