[HW4] Problem 1 and 2HEAD master

author: Thibaut Horel <thibaut.horel@gmail.com> 2015-11-16 12:35:05 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2015-11-16 12:35:05 -0500
commit: adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1 (patch)
tree: 9b0065b6215919e86fc0ea3f377ea6bf536b4bf2 /hw4/2.py
parent: 61f644a6a7d36dc5c15d957c48d10675ab3627ae (diff)
download: cs281-adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1.tar.gz
1 files changed, 81 insertions, 0 deletions
diff --git a/hw4/2.py b/hw4/2.py
new file mode 100644
index 0000000..275be3e
--- /dev/null
+++ b/hw4/2.py
@@ -0,0 +1,81 @@
+import sys
+from itertools import islice
+import numpy as np
+from scipy.sparse import coo_matrix
+from math import sqrt
+
+
+def get_ratings(filename):
+    with open(filename) as fh:
+        for line in fh:
+            yield map(int, line.strip().split())
+
+
+def get_train_test(filename):
+    l = [(i, j) for (i, j, _) in get_ratings(filename)]
+    n = max(i for (i, _) in l)
+    m = max(j for (_, j) in l)
+    g = get_ratings(filename)
+    train = islice(g, 100000)
+    test = islice(g, 100000)
+    return n, m, list(train), list(test)
+
+
+def sparse_matrix(ratings):
+    i, j, data = zip(*ratings)
+    S = coo_matrix((data, (i, j)))
+    return S.tocsc(), S.tocsr()
+
+
+def get_users(Rr):
+    return [i for i in xrange(Rr.shape[0]) if len(Rr[i].nonzero()[1])]
+
+
+def get_jokes(Rc):
+    return [j for j in xrange(Rc.shape[1]) if len(Rc[:, j].nonzero()[0])]
+
+
+def sample_users(U, V, Rr, users):
+    for i in users:
+        r = Rr[i]
+        ind = r.nonzero()[1]
+        v = V[ind]
+        isigma = np.identity(k) / 5. + np.dot(v.T, v)
+        sigma = np.linalg.inv(isigma)
+        U[i] = np.random.multivariate_normal(np.dot(sigma, r.dot(V)[0]),
+                                             sigma)
+
+
+def sample_jokes(U, V, Rc, jokes):
+    for j in jokes:
+        r = Rc[:, j]
+        u = U[r.nonzero()[0]]
+        isigma = np.identity(k) / 5. + np.dot(u.T, u)
+        sigma = np.linalg.inv(isigma)
+        V[j] = np.random.multivariate_normal(np.dot(sigma, r.T.dot(U)[0]),
+                                             sigma)
+
+
+def sample(U, V, Rr, Rc, users, jokes):
+    sample_users(U, V, Rr, users)
+    sample_jokes(U, V, Rc, jokes)
+
+
+def likelihood(ratings, U, V):
+    return sum((r - np.inner(U[i], V[j])) ** 2 for i, j, r in ratings)
+
+
+if __name__ == "__main__":
+    n, m, train, test = get_train_test(sys.argv[1])
+    Rc, Rr = sparse_matrix(train)
+    users = get_users(Rr)  # users with at least one rating
+    jokes = get_jokes(Rc)  # jokes with at least one rating
+    for k in xrange(1, 11):
+        with open("gibbs_" + str(k) + ".txt", "w") as fh:
+            U = np.random.normal(0, sqrt(5), size=(Rc.shape[0], k))
+            V = np.random.normal(0, sqrt(5), size=(Rc.shape[1], k))
+            for e in xrange(100):
+                fh.write("\t".join(map(str, [e, likelihood(train, U, V),
+                                    likelihood(test, U, V)])) + "\n")
+                fh.flush()
+                sample(U, V, Rr, Rc, users, jokes)
author	Thibaut Horel <thibaut.horel@gmail.com>	2015-11-16 12:35:05 -0500
committer	Thibaut Horel <thibaut.horel@gmail.com>	2015-11-16 12:35:05 -0500
commit	adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1 (patch)
tree	9b0065b6215919e86fc0ea3f377ea6bf536b4bf2 /hw4/2.py
parent	61f644a6a7d36dc5c15d957c48d10675ab3627ae (diff)
download	cs281-adc7cb7256c8fcc11e7fd85866d6d3e2dcb319c1.tar.gz