summaryrefslogtreecommitdiffstats
path: root/facebook_scraping/run.py
blob: 94eb1a463c477cfc02ff7ddc91faa8dacd546db3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from tasks import NumFollowers, ListFollowers, normalize, strip
from bs4 import BeautifulSoup
from celery.result import ResultSet
import os.path as op
from datetime import datetime
import sys

# Task objects from the project's ``tasks`` module.  Results of ``nf`` tasks
# are consumed by call_back, results of ``lf`` tasks by call_back_fd.
nf = NumFollowers()
lf = ListFollowers()

# Known users mapped to their degree, reloaded from the results file
# (sys.argv[1]) so that a previous run can be resumed.
users = {}
try:
    with open(sys.argv[1]) as f:
        for line in f:
            values = line.strip().split()
            # Skip blank or truncated lines (e.g. a record cut short when a
            # previous run was interrupted) instead of dying on values[1].
            if len(values) < 2:
                continue
            users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
except IOError:
    # The results file cannot be read (typically it does not exist yet):
    # start from an empty table.
    pass

# Both files are opened in append mode so earlier results are preserved.
output = open(sys.argv[1], "a")  # "<user> <degree>" records, see add_user
bad = open("bad.txt", "a")       # requests for which no result came back


def add_user(user, degree):
    """Record *user* with its *degree* in memory and in the results file.

    The record is flushed right away, as is already done for ``bad``, so
    that an interrupted run neither loses fetched results nor leaves a
    half-written line at the end of the file.
    """
    users[user] = degree
    output.write(user + " " + str(degree) + "\n")
    output.flush()


def call_back(tid, value):
    """Handle one result delivered by ``join_native`` for a friend-count task.

    *tid* is the task id (unused); *value* is the task's result dict.
    A result without an "nfriends" key is only logged.  When "nfriends" is
    None the original request (value["orig"]) is appended to ``bad``;
    otherwise the count is parsed and stored through add_user.
    """
    print(datetime.now().isoformat() + " " + str(value))
    if "nfriends" not in value:
        return
    if value["nfriends"] is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    _, fname, _ = normalize(value["for"])
    # Drop thousands separators and spaces before converting to an int.
    digits = str(value["nfriends"])
    for separator in (",", ".", " "):
        digits = digits.replace(separator, "")
    add_user(fname, int(digits.encode("ascii", "ignore")))

# Optional catch-up pass, enabled by passing "True" as sys.argv[4]: for every
# profile linked from the HTML page (sys.argv[2]) whose friend list is already
# stored under facebook/, fetch the friend count of each listed friend that is
# not yet in ``users``.
if sys.argv[4] == "True":
    todo = ResultSet([])
    # BeautifulSoup reads the whole file in its constructor, so the handle
    # can be closed as soon as the soup is built.
    with open(sys.argv[2]) as page:
        soup = BeautifulSoup(page)
    links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
    # The same friend can appear in many friend lists; queue it only once.
    queued = set()
    for link in links:
        basename, finame, getname = normalize(link)
        if op.isfile("facebook/" + finame):
            with open("facebook/" + finame) as f:
                for line in f:
                    basename, fname, getname = normalize(line.strip())
                    if fname not in users and fname not in queued:
                        queued.add(fname)
                        print(finame)
                        todo.add(nf.delay(basename))
    # Block until every queued task has reported back through call_back.
    todo.join_native(callback=call_back)
# From here on ``todo`` is a plain list of friends whose friend count still
# has to be fetched; call_back_fd appends to it.
todo = []


def call_back_fd(tid, value):
    """Handle one result delivered by ``join_native`` for a friend-list task.

    *tid* is the task id (unused); *value* is the task's result dict.
    When value["friends"] is None the original request (value["orig"]) is
    appended to ``bad``.  Otherwise the profile is recorded with its number
    of friends, the friends are written one per line to
    ``facebook/<fname>``, and every friend not yet in ``users`` is appended
    to the module-level ``todo`` list.
    """
    print(datetime.now().isoformat() + " " + str(value))
    friends = value["friends"]
    if friends is None:
        bad.write(value["orig"] + "\n")
        bad.flush()
        return
    _, owner_fname, _ = normalize(strip(value["for"]))
    add_user(owner_fname, len(friends))
    with open("facebook/" + owner_fname, "w") as friend_file:
        for friend in friends:
            friend_base, friend_fname, _ = normalize(friend)
            friend_file.write(friend_base + "\n")
            if friend_fname in users:
                continue
            todo.append(friend_base)

def _process_chunk(names):
    """Fetch the friend lists of *names*, then the counts of their new friends.

    Both batches are joined before returning: first the friend-list tasks
    (handled by call_back_fd, which fills the module-level ``todo`` list),
    then one friend-count task per entry of ``todo`` (handled by call_back).
    """
    global todo
    todofd = ResultSet([])
    for name in names:
        todofd.add(lf.delay(name))
    todofd.join_native(callback=call_back_fd)
    todos = ResultSet([])
    for name in todo:
        todos.add(nf.delay(name))
    todo = []
    todos.join_native(callback=call_back)


# Main pass: fetch the friend list of every profile linked from the HTML page
# (sys.argv[2]) that has no file under facebook/ yet, sys.argv[3] profiles at
# a time.
with open(sys.argv[2]) as page:
    soup = BeautifulSoup(page)
links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
chunk_size = int(sys.argv[3])
chunk = []
for link in links:
    basename, fname, getname = normalize(link)
    if not op.isfile("facebook/" + fname):
        chunk.append(getname)
        if len(chunk) == chunk_size:
            _process_chunk(chunk)
            chunk = []
# The number of pending profiles is rarely a multiple of the chunk size:
# process the last, partially filled chunk instead of silently dropping it.
if chunk:
    _process_chunk(chunk)