1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
import sys
from itertools import islice
import numpy as np
from scipy.sparse import coo_matrix
from math import sqrt
def get_ratings(filename):
with open(filename) as fh:
for line in fh:
yield map(int, line.strip().split())
def get_train_test(filename):
l = [(i, j) for (i, j, _) in get_ratings(filename)]
n = max(i for (i, _) in l)
m = max(j for (_, j) in l)
g = get_ratings(filename)
train = islice(g, 100000)
test = islice(g, 100000)
return n, m, list(train), list(test)
def sparse_matrix(ratings):
i, j, data = zip(*ratings)
S = coo_matrix((data, (i, j)))
return S.tocsc(), S.tocsr()
def get_users(Rr):
return [i for i in xrange(Rr.shape[0]) if len(Rr[i].nonzero()[1])]
def get_jokes(Rc):
return [j for j in xrange(Rc.shape[1]) if len(Rc[:, j].nonzero()[0])]
def sample_users(U, V, Rr, users):
for i in users:
r = Rr[i]
ind = r.nonzero()[1]
v = V[ind]
isigma = np.identity(k) / 5. + np.dot(v.T, v)
sigma = np.linalg.inv(isigma)
U[i] = np.random.multivariate_normal(np.dot(sigma, r.dot(V)[0]),
sigma)
def sample_jokes(U, V, Rc, jokes):
for j in jokes:
r = Rc[:, j]
u = U[r.nonzero()[0]]
isigma = np.identity(k) / 5. + np.dot(u.T, u)
sigma = np.linalg.inv(isigma)
V[j] = np.random.multivariate_normal(np.dot(sigma, r.T.dot(U)[0]),
sigma)
def sample(U, V, Rr, Rc, users, jokes):
sample_users(U, V, Rr, users)
sample_jokes(U, V, Rc, jokes)
def likelihood(ratings, U, V):
return sum((r - np.inner(U[i], V[j])) ** 2 for i, j, r in ratings)
if __name__ == "__main__":
n, m, train, test = get_train_test(sys.argv[1])
Rc, Rr = sparse_matrix(train)
users = get_users(Rr) # users with at least one rating
jokes = get_jokes(Rc) # jokes with at least one rating
for k in xrange(1, 11):
with open("gibbs_" + str(k) + ".txt", "w") as fh:
U = np.random.normal(0, sqrt(5), size=(Rc.shape[0], k))
V = np.random.normal(0, sqrt(5), size=(Rc.shape[1], k))
for e in xrange(100):
fh.write("\t".join(map(str, [e, likelihood(train, U, V),
likelihood(test, U, V)])) + "\n")
fh.flush()
sample(U, V, Rr, Rc, users, jokes)
|