summaryrefslogtreecommitdiffstats
path: root/twitter/scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'twitter/scraper.py')
-rw-r--r--twitter/scraper.py96
1 files changed, 96 insertions, 0 deletions
diff --git a/twitter/scraper.py b/twitter/scraper.py
new file mode 100644
index 0000000..49b116a
--- /dev/null
+++ b/twitter/scraper.py
@@ -0,0 +1,96 @@
+from selenium import webdriver
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.common.exceptions import TimeoutException,\
+ ElementNotVisibleException, NoSuchElementException
+from bs4 import BeautifulSoup, Tag
+
+import os.path
+from urllib2 import URLError
+from time import sleep
+
+
class Driver:
    """Thin wrapper around a PhantomJS WebDriver that signs into Twitter
    and scrapes per-user follower lists to text files under data/users/."""

    def __init__(self, username, password):
        # Headless browser; the credentials are used immediately to log in.
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
        # True once jQuery on the current page reports no pending requests.
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        # Fill in and submit Twitter's sign-in form with our credentials.
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        """Yield (user_id, screen_name) pairs for every follower of
        *username*, clicking "more" until the timeline is exhausted.

        Yields nothing (returns early) on navigation or markup errors —
        this is a deliberate best-effort scraper, not a strict one.
        """
        driver = self.driver
        # Load the followers page, retrying once after a short pause
        # (replaces the original's copy-pasted try/except nesting).
        for attempt in range(2):
            try:
                driver.get(
                    "https://twitter.com/{0}/followers".format(username))
                break
            except URLError:
                if attempt == 1:
                    return
                sleep(1)

        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            return

        # Keep clicking the footer while it advertises more items,
        # waiting for the triggered AJAX load to settle each time.
        while True:
            try:
                if "has-more-items" not in footer.get_attribute("class"):
                    break
                footer.click()
                try:
                    WebDriverWait(driver, 5).until(
                        lambda x: self.__ajax_complete(),
                        "Timeout waiting for ajax to return")
                except TimeoutException:
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        # Explicit parser avoids bs4's "no parser specified" warning and
        # environment-dependent parser selection.
        soup = BeautifulSoup(fws.get_attribute("outerHTML"), "html.parser")
        for follower in soup.ol:
            # isinstance, not type() ==: skips NavigableString whitespace
            # nodes between the <li> entries.
            if isinstance(follower, Tag):
                div = follower.div
                yield div["data-user-id"], div["data-screen-name"]

    def get_followers(self, user_id, username):
        """Scrape *username*'s followers into data/users/<user_id>.txt
        (one "id screen_name" per line) and return that filename.

        An existing cache file is returned as-is without re-scraping.
        """
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        followers = list(self.__get_followers(username))
        with open(filename, "w") as f:
            for fid, fname in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
        """For each "user_id screen_name" line in *filename*, fetch and
        cache that user's follower list via get_followers()."""
        with open(filename) as f:
            for line in f:
                values = line.strip().split()
                self.get_followers(*values[:2])
+
+
if __name__ == "__main__":
    # First line of the accounts file holds "username password".
    # with-block ensures the file handle is closed (original leaked it).
    with open("scraping_accounts.txt") as f:
        credentials = f.readline().strip().split()
    driver = Driver(*credentials[:2])
    # driver.get_followers("23302126", "flipper509")
    # BUG FIX: Driver defines get_followers, not get_profile, and the
    # user id must be a string because it is joined into a filename
    # (user_id + ".txt"). NOTE(review): assumes "100" is the intended
    # numeric user id for this screen name — confirm against the data.
    print(driver.get_followers("100", "thibauthorel"))