1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
from tasks import NumFollowers, ListFollowers, normalize
from bs4 import BeautifulSoup
from celery.result import ResultSet
import os.path as op
from glob import glob
# Task objects imported from `tasks`; the script below submits work through
# their .delay() method (presumably Celery tasks -- confirm in tasks.py).
nf = NumFollowers()
lf = ListFollowers()
# NOTE(review): `rset` is not referenced anywhere else in the visible code.
rset = ResultSet([])

# Maps a user name to its friend count.  It is pre-populated from
# all_users.txt, which add_user() appends to as "<name> <count>" lines, so
# users recorded by an earlier run are not fetched again.
users = {}
try:
    with open("all_users.txt") as f:
        for line in f:
            values = line.strip().split()
            if len(values) < 2:
                # Blank or truncated line (e.g. an interrupted write):
                # skip it instead of aborting startup with an IndexError.
                continue
            users[values[0]] = int(values[1])
except IOError:
    # The file cannot be read (typically: first run, it does not exist
    # yet), so start with an empty table.
    pass

# Kept open in append mode for the lifetime of the script; add_user()
# writes and flushes one line per recorded user.
output = open("all_users.txt", "a")
def strip(url):
    """Return *url* without its trailing ``/friends`` or ``&...`` part.

    If *url* ends with ``/friends`` that suffix is removed.  Otherwise
    everything from the first ``&`` onward is dropped; a URL without ``&``
    is returned unchanged.
    """
    suffix = "/friends"
    if not url.endswith(suffix):
        # partition() yields the text before the first "&", or the whole
        # string when there is no "&" at all.
        return url.partition("&")[0]
    return url[:-len(suffix)]
def add_user(user, degree):
    """Record *user* with friend count *degree* in memory and on disk.

    The pair is echoed to stdout, stored in the module-level ``users``
    dict, and appended to the module-level ``output`` file as a
    ``"<user> <degree>"`` line.

    Args:
        user: user name; must be a string, since it is concatenated into
            the output line.
        degree: friend count stored for that user.
    """
    # A single pre-formatted argument prints the same "<user> <degree>" text
    # under both the Python 2 print statement and the Python 3 function.
    print("%s %s" % (user, degree))
    users[user] = degree
    output.write(user + " " + str(degree) + "\n")
    # Flush after every record so progress survives an interrupted run.
    output.flush()
def call_back(tid, value):
    """Record the friend count carried by one task result.

    Passed as ``callback=`` to ``ResultSet.join_native`` below.

    Args:
        tid: first callback argument (presumably the task id); unused.
        value: result mapping.  Results that contain a ``"friends"`` key,
            or that lack an ``"nfriends"`` key, are ignored.  Otherwise
            ``value["for"]`` is normalized and the resulting name is
            stored with ``value["nfriends"]`` via ``add_user``.

    Returns:
        None in every case.
    """
    # The "friends" check comes first: a result that has that key is
    # skipped even if it also has "nfriends".
    if "friends" in value or "nfriends" not in value:
        return
    _base, user_name, _getname = normalize(value["for"])
    add_user(user_name, value["nfriends"])
# Backfill pass: every file under facebook/ lists one entry per line.  For
# each entry whose normalized name has no recorded count yet, submit an
# `nf` task; call_back() stores the count when the result arrives.
todo = ResultSet([])
# Names already submitted in this pass.  `users` is only updated when the
# callback runs during join_native(), so without this set an entry listed in
# several files would be submitted (and later written to all_users.txt)
# once per occurrence.
pending = set()
for finame in glob("facebook/*"):
    with open(finame) as f:
        for line in f:
            basename, fname, getname = normalize(line.strip())
            if fname in users or fname in pending:
                continue
            pending.add(fname)
            # Echo the file in which the unknown user was found.
            print(finame)
            todo.add(nf.delay(basename))
# Block until every submitted task has finished, feeding each result to
# call_back().
todo.join_native(callback=call_back)
# Seed crawl: parse seed.txt as HTML, take the link inside each
# <div class="fsl">, and process the first 100 of them.
# BeautifulSoup parses the handle inside its constructor, so the file can be
# closed as soon as the soup is built.
with open("seed.txt") as seed_file:
    soup = BeautifulSoup(seed_file)
links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
for link in links[:100]:
    basename, fname, getname = normalize(link)
    if op.isfile("facebook/" + fname):
        # A friends file with this name already exists; skip the fetch.
        continue
    # Fetch this user's friend list synchronously.
    result = lf.delay(getname)
    value = result.get()
    # NOTE(review): the name is re-derived from the task's own "for" field,
    # so it may differ from the one checked with isfile() above -- confirm.
    basename, fname, getname = normalize(strip(value["for"]))
    add_user(fname, len(value["friends"]))
    friends_path = "facebook/" + fname
    todo = ResultSet([])
    with open(friends_path, "w") as f:
        for friend in value["friends"]:
            # Separate names for the friend keep `fname` and `friends_path`
            # pointing at the owner of this file.
            friend_base, friend_fname, _ = normalize(friend)
            f.write(friend_base + "\n")
            if friend_fname not in users:
                todo.add(nf.delay(friend_base))
    # Report the file that was just written.
    print(friends_path)
    # Wait for this user's friend-count tasks before moving on to the
    # next seed link.
    todo.join_native(callback=call_back)
|