1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException,\
ElementNotVisibleException, NoSuchElementException
from bs4 import BeautifulSoup, Tag
import os.path
class Driver:
    """Logged-in PhantomJS session used to scrape Twitter follower lists.

    Follower lists are cached on disk as ``data/users/<user_id>.txt`` with
    one ``<follower_id> <screen_name>`` pair per line.
    """

    def __init__(self, username, password):
        """Start a PhantomJS browser and sign in with the given credentials."""
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
        """Return True when ``jQuery.active`` evaluates to 0 in the page.

        NOTE(review): presumably ``jQuery.active`` is the number of pending
        AJAX requests -- confirm against the jQuery version Twitter serves.
        """
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        """Open the Twitter home page and submit the sign-in form."""
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        """Yield ``(user_id, screen_name)`` for each follower of `username`.

        The followers page is expanded by clicking its ``timeline-end``
        footer for as long as the footer's class contains
        ``has-more-items``; the resulting list is then parsed once.
        Nothing is yielded when the expected page elements are missing.
        """
        driver = self.driver
        driver.get("https://twitter.com/{0}/followers".format(username))
        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            return

        while True:
            try:
                # get_attribute may return None when the attribute is absent.
                if "has-more-items" not in (footer.get_attribute("class")
                                            or ""):
                    break
                footer.click()
                try:
                    # Give the page up to 5 seconds to finish loading the
                    # next batch before clicking again.
                    WebDriverWait(driver, 5).until(
                        lambda x: self.__ajax_complete(),
                        "Timeout waiting for ajax to return")
                except TimeoutException:
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(fws.get_attribute("outerHTML"), "html.parser")
        items = soup.ol
        if items is None:
            return
        for follower in items:
            # Children also include whitespace text nodes; keep only tags.
            if not isinstance(follower, Tag):
                continue
            div = follower.div
            if div is None:
                continue
            user_id = div.get("data-user-id")
            screen_name = div.get("data-screen-name")
            if user_id is None or screen_name is None:
                continue
            yield user_id, screen_name

    def get_followers(self, user_id, username):
        """Scrape the followers of `username` into the on-disk cache.

        Args:
            user_id: string used to build the cache file name.
            username: screen name whose followers page is scraped.

        Returns:
            Path of ``data/users/<user_id>.txt``. If that file already
            exists it is returned as-is and nothing is scraped.
        """
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename

        # Finish scraping before opening the file so that an exception
        # raised mid-scrape does not leave a truncated cache file behind.
        followers = list(self.__get_followers(username))

        directory = os.path.dirname(filename)
        if not os.path.isdir(directory):
            os.makedirs(directory)
        with open(filename, "w") as f:
            for (fid, fname) in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
        """Call `get_followers` for every ``<user_id> <username>`` line.

        Only the first two whitespace-separated fields of a line are used;
        lines with fewer than two fields (e.g. blank lines) are skipped.
        """
        with open(filename) as f:
            for line in f:
                values = line.split()
                if len(values) < 2:
                    continue
                self.get_followers(values[0], values[1])
if __name__ == "__main__":
    # The first line of scraping_accounts.txt holds "<username> <password>".
    with open("scraping_accounts.txt") as accounts:
        credentials = accounts.readline().strip().split()
    driver = Driver(*credentials[:2])
    # NOTE(review): no get_profile method is defined on the Driver class in
    # this file, so this call raises AttributeError unless it is provided
    # elsewhere -- confirm the intended entry point.
    print(driver.get_profile(100, "thibauthorel"))
|