path: root/twitter/scraper.py

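"""Scrape Twitter follower lists with a headless PhantomJS browser.

Logs in with a scraping account, expands a user's followers timeline by
repeatedly clicking the "more items" footer, and writes one
"user_id screen_name" pair per line to data/users/<user_id>.txt.
"""
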
import os.path
from time import sleep
from urllib2 import URLError

from bs4 import BeautifulSoup, Tag
from selenium import webdriver
from selenium.common.exceptions import (ElementNotVisibleException,
                                        NoSuchElementException,
                                        TimeoutException)
from selenium.webdriver.support.wait import WebDriverWait


class Driver:
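    """A logged-in PhantomJS session for scraping Twitter follower lists."""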

    def __init__(self, username, password):
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
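        # jQuery.active counts in-flight AJAX requests; zero means the
        # page has finished loading new timeline items.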
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        # Sign in through the login form on the Twitter home page.
        driver = self.driver
        driver.get("https://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        driver = self.driver
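        # Retry the page load once after a short pause before giving up.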
        try:
            driver.get("https://twitter.com/{0}/followers".format(username))
        except URLError:
            sleep(1)
            try:
                driver.get("https://twitter.com/{0}/followers".format(username))
            except URLError:
                return

        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            return

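        # Emulate infinite scroll: keep clicking the "more" footer while it
        # still advertises additional items, waiting for each AJAX batch.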
        while True:
            try:
                if "has-more-items" not in footer.get_attribute("class"):
                    break
                footer.click()
                try:
                    WebDriverWait(driver, 5).until(
                        lambda _: self.__ajax_complete(),
                        "Timeout waiting for AJAX to return")
                except TimeoutException:
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

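        # The timeline is fully expanded; parse the follower stream with bs4.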
        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        soup = BeautifulSoup(fws.get_attribute("outerHTML"), "html.parser")
        for follower in soup.ol:
            # Children of the <ol> include whitespace strings; keep only tags.
            if isinstance(follower, Tag):
                div = follower.div
                user_id = div["data-user-id"]
                screen_name = div["data-screen-name"]
                yield user_id, screen_name

    def get_followers(self, user_id, username):
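        # Cached per user: if data/users/<user_id>.txt already exists, the
        # scrape is skipped. Note that an aborted scrape still writes a
        # (possibly empty) file, which counts as a cache hit on later runs.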
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        followers = list(self.__get_followers(username))
        with open(filename, "w") as f:
            for fid, fname in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
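        # Each line of the input file starts with "user_id screen_name";
        # extra columns are ignored.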
        with open(filename) as f:
            for line in f:
                values = line.strip().split()
                self.get_followers(*values[:2])


if __name__ == "__main__":
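    # The first line of scraping_accounts.txt holds "username password";
    # extra columns are ignored.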
    with open("scraping_accounts.txt") as f:
        credentials = f.readline().strip().split()
    driver = Driver(*credentials[:2])
    driver.get_followers("23302126", "flipper509")