summaryrefslogtreecommitdiffstats
path: root/scraper.py
blob: ee7dd8f2733fd87dfa73555e7c0ca8bde766a305 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException,\
    ElementNotVisibleException, NoSuchElementException
from bs4 import BeautifulSoup, Tag

import os.path


class Driver:
    """Scrape Twitter follower lists with a headless PhantomJS browser.

    Logs in with the given credentials on construction, then exposes
    get_followers()/process() to dump follower ids to text files under
    data/users/.
    """

    def __init__(self, username, password):
        """Start PhantomJS and sign in to Twitter with the credentials."""
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
        # True once no jQuery XHR requests are in flight on the page.
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        """Fill and submit the Twitter sign-in form."""
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        """Yield (user_id, screen_name) for each follower of *username*.

        Clicks the "more" footer until the infinite-scroll timeline is
        exhausted, then parses the accumulated stream with BeautifulSoup.
        Yields nothing if the page has no follower timeline at all.
        """
        driver = self.driver
        driver.get("https://twitter.com/{0}/followers".format(username))
        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            return

        while True:
            try:
                if "has-more-items" not in footer.get_attribute("class"):
                    break
                footer.click()
                try:
                    WebDriverWait(driver,
                                  5).until(lambda x: self.__ajax_complete(),
                                           "Timeout waiting for "
                                           "ajax to return")
                except TimeoutException:
                    # Page stopped answering; settle for what is loaded.
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        # Pin the parser explicitly: calling BeautifulSoup without one
        # picks whichever backend happens to be installed (and warns
        # since bs4 4.4), which makes parsing non-deterministic.
        soup = BeautifulSoup(fws.get_attribute("outerHTML"), "html.parser")
        for follower in soup.ol:
            # The <ol> interleaves NavigableString whitespace nodes with
            # the <li> entries; isinstance keeps Tag subclasses too.
            if isinstance(follower, Tag):
                div = follower.div
                user_id = div["data-user-id"]
                screen_name = div["data-screen-name"]
                yield user_id, screen_name

    def get_followers(self, user_id, username):
        """Dump *username*'s followers to data/users/<user_id>.txt.

        Acts as a cache: an already-existing file is never rescraped.
        Returns the path of the (possibly pre-existing) file.
        """
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        followers = list(self.__get_followers(username))
        with open(filename, "w") as f:
            for fid, fname in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
        """Scrape followers for every "user_id screen_name" line of *filename*."""
        with open(filename) as f:
            for line in f:
                values = line.strip().split()
                self.get_followers(*values[:2])


if __name__ == "__main__":
    # First line of scraping_accounts.txt holds "username password".
    with open("scraping_accounts.txt") as f:
        credentials = f.readline().strip().split()
    driver = Driver(*credentials[:2])
    # BUG FIX: the class has no get_profile(); the commented example above
    # shows the intended entry point is get_followers(user_id, username).
    # user_id must be a string — it is concatenated into the cache filename.
    print(driver.get_followers("100", "thibauthorel"))