1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException,\
ElementNotVisibleException, NoSuchElementException
from bs4 import BeautifulSoup, Tag
import os.path
from urllib2 import URLError
from time import sleep
class Driver(object):
    """Collect Twitter follower lists through a logged-in PhantomJS session.

    Creating an instance starts a PhantomJS browser and submits the sign-in
    form on twitter.com with the given credentials.  Follower lists are
    cached on disk as ``data/users/<user_id>.txt`` with one
    ``"<user_id> <screen_name>"`` pair per line.

    The instance owns a browser process: call :meth:`close` when done, or
    use the instance as a context manager (``with Driver(u, p) as d: ...``).
    """

    def __init__(self, username, password):
        """Start the browser and sign in.

        :param username: value typed into the ``signin-email`` field.
        :param password: value typed into the ``signin-password`` field.
        :raises: whatever Selenium raises if the sign-in form cannot be
            found; the browser is shut down before the error propagates.
        """
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        try:
            self.__connect()
        except Exception:
            # Without this the PhantomJS process would be left running
            # with no object left to stop it.
            self.driver.quit()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow the caller's exception

    def close(self):
        """Shut down the underlying PhantomJS browser."""
        self.driver.quit()

    def __ajax_complete(self):
        """Return True when the page's jQuery reports no active requests."""
        # NOTE(review): assumes the page defines a global ``jQuery``;
        # otherwise execute_script will raise -- confirm this is acceptable.
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        """Open twitter.com and submit the sign-in form."""
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __open_followers_page(self, username):
        """Navigate to *username*'s followers page; return False on failure.

        A URLError triggers a single retry after a one second pause.
        """
        url = "https://twitter.com/{0}/followers".format(username)
        for attempt in (1, 2):
            try:
                self.driver.get(url)
                return True
            except URLError:
                if attempt == 1:
                    sleep(1)
        return False

    def __expand_timeline(self, footer):
        """Click *footer* until the page stops advertising more items.

        Stops as soon as the footer's class no longer contains
        ``has-more-items``, the click target is missing or hidden, or the
        page's AJAX activity does not settle within five seconds.
        """
        # NOTE(review): there is no upper bound on iterations; if the page
        # keeps "has-more-items" forever this never terminates -- confirm.
        while True:
            try:
                # get_attribute may return None when the attribute is absent.
                css_classes = footer.get_attribute("class") or ""
                if "has-more-items" not in css_classes:
                    return
                footer.click()
                try:
                    WebDriverWait(self.driver, 5).until(
                        lambda x: self.__ajax_complete(),
                        "Timeout waiting for ajax to return")
                except TimeoutException:
                    return
            except (NoSuchElementException, ElementNotVisibleException):
                return

    def __parse_followers(self, html):
        """Yield ``(user_id, screen_name)`` pairs found in *html*.

        *html* is expected to be the markup of the follower list: an
        ``<ol>`` whose element children each wrap a ``<div>`` carrying
        ``data-user-id`` and ``data-screen-name`` attributes.  Children
        that do not match this shape are skipped.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        if soup.ol is None:
            return
        for follower in soup.ol:
            # Children also include bare text nodes (whitespace); skip them.
            if not isinstance(follower, Tag):
                continue
            div = follower.div
            if div is None:
                continue
            user_id = div.get("data-user-id")
            screen_name = div.get("data-screen-name")
            if user_id is None or screen_name is None:
                continue
            yield user_id, screen_name

    def __get_followers(self, username):
        """Yield ``(user_id, screen_name)`` for each follower of *username*.

        Yields nothing when the page cannot be loaded or when the expected
        ``timeline-end`` / ``stream-items-id`` elements are not present.
        """
        driver = self.driver
        if not self.__open_followers_page(username):
            return
        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            return
        self.__expand_timeline(footer)
        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return
        for pair in self.__parse_followers(fws.get_attribute("outerHTML")):
            yield pair

    def get_followers(self, user_id, username):
        """Return the path of the follower file for *user_id*, creating it
        if necessary.

        If ``data/users/<user_id>.txt`` already exists it is returned as is
        and the browser is not used.  Otherwise the followers of *username*
        are scraped and written one ``"<id> <screen_name>"`` per line.

        :param user_id: string used to build the cache file name.
        :param username: screen name whose followers page is scraped.
        :returns: relative path of the cache file.
        """
        directory = os.path.join("data", "users")
        filename = os.path.join(directory, user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        # Materialise first so a failure while scraping does not leave a
        # half-written cache file behind.
        # NOTE(review): a failed scrape yields nothing and is therefore
        # cached as an empty file, same as a user with no followers.
        followers = list(self.__get_followers(username))
        if not os.path.isdir(directory):
            try:
                os.makedirs(directory)
            except OSError:
                # Another process may have created it in the meantime.
                if not os.path.isdir(directory):
                    raise
        with open(filename, "w") as f:
            f.writelines(fid + " " + fname + "\n" for fid, fname in followers)
        return filename

    def process(self, filename):
        """Fetch followers for every user listed in *filename*.

        Each line supplies ``user_id`` and ``username`` as its first two
        whitespace-separated fields; extra fields are ignored and lines
        with fewer than two fields are skipped.
        """
        with open(filename) as f:
            for line in f:
                values = line.split()
                if len(values) < 2:
                    continue
                self.get_followers(values[0], values[1])
if __name__ == "__main__":
    # The first line of the accounts file holds the login as two
    # whitespace-separated fields: username then password.
    with open("scraping_accounts.txt") as accounts:
        credentials = accounts.readline().split()
    if len(credentials) < 2:
        raise SystemExit("scraping_accounts.txt: expected "
                         "'<username> <password>' on the first line")
    driver = Driver(credentials[0], credentials[1])
    try:
        driver.get_followers("23302126", "flipper509")
    finally:
        # Stop the PhantomJS process even if scraping fails; otherwise it
        # keeps running after the script exits.
        driver.driver.quit()
|