diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-01-28 00:14:54 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-01-28 00:14:54 -0500 |
| commit | 9f32604638a14cce101f9a3b1dd08971a8142f58 (patch) | |
| tree | e863695593821dce6004678df8990f964888fe89 /scraper.py | |
| download | fast-seeding-9f32604638a14cce101f9a3b1dd08971a8142f58.tar.gz | |
Initial commit
Diffstat (limited to 'scraper.py')
| -rw-r--r-- | scraper.py | 86 |
1 files changed, 86 insertions, 0 deletions
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException,\
    ElementNotVisibleException, NoSuchElementException
from bs4 import BeautifulSoup, Tag

import os.path


class Driver:
    """Scrape Twitter follower lists through a headless PhantomJS browser.

    Logs in with the supplied credentials at construction time, then
    exposes ``get_followers`` to dump a user's followers to
    ``data/users/<user_id>.txt`` (acting as a simple on-disk cache).
    """

    def __init__(self, username, password):
        self.driver = webdriver.PhantomJS()
        self.username = username
        self.password = password
        self.__connect()

    def __ajax_complete(self):
        # jQuery.active == 0 means the page has no pending XHR requests.
        return 0 == self.driver.execute_script("return jQuery.active")

    def __connect(self):
        """Log in to Twitter with the stored credentials."""
        driver = self.driver
        driver.get("http://twitter.com")
        driver.find_element_by_id("signin-email").send_keys(self.username)
        elem = driver.find_element_by_id("signin-password")
        elem.send_keys(self.password)
        elem.submit()

    def __get_followers(self, username):
        """Yield ``(user_id, screen_name)`` for every follower of *username*.

        Repeatedly clicks the "more items" footer until the timeline
        stops growing (or a wait times out), then parses the accumulated
        follower stream with BeautifulSoup.
        """
        driver = self.driver
        driver.get("https://twitter.com/{0}/followers".format(username))
        try:
            footer = driver.find_element_by_class_name("timeline-end")
        except NoSuchElementException:
            # Page layout not as expected (e.g. protected account): no followers.
            return

        while True:
            try:
                if "has-more-items" not in footer.get_attribute("class"):
                    break
                footer.click()
                try:
                    WebDriverWait(driver,
                                  5).until(lambda x: self.__ajax_complete(),
                                           "Timeout waiting for "
                                           "ajax to return")
                except TimeoutException:
                    break
            except (NoSuchElementException, ElementNotVisibleException):
                break

        try:
            fws = driver.find_element_by_id("stream-items-id")
        except NoSuchElementException:
            return

        # FIX: pass an explicit parser; without one bs4 picks whichever
        # parser is installed (nondeterministic) and emits a warning.
        soup = BeautifulSoup(fws.get_attribute("outerHTML"), "html.parser")
        for follower in soup.ol:
            # FIX: isinstance instead of type() == Tag; correctly skips
            # NavigableString children of the <ol>.
            if isinstance(follower, Tag):
                div = follower.div
                user_id = div["data-user-id"]
                screen_name = div["data-screen-name"]
                yield user_id, screen_name

    def get_followers(self, user_id, username):
        """Write *username*'s followers to ``data/users/<user_id>.txt``.

        Returns the file path. If the file already exists the scrape is
        skipped and the cached path is returned.
        """
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        # FIX: descriptive name instead of single-letter `l`.
        followers = list(self.__get_followers(username))
        with open(filename, "w") as f:
            for (fid, fname) in followers:
                f.write(fid + " " + fname + "\n")
        return filename

    def process(self, filename):
        """Scrape followers for each "user_id screen_name" line of *filename*."""
        with open(filename) as f:
            for line in f:
                values = line.strip().split()
                self.get_followers(*values[:2])


if __name__ == "__main__":
    # FIX: close the credentials file (original left the handle open).
    with open("scraping_accounts.txt") as f:
        credentials = f.readline().strip().split()
    driver = Driver(*credentials[:2])
    # FIX: Driver defines no `get_profile`; the original last line raised
    # AttributeError. The intended call (per the commented-out example) is
    # get_followers, and user_id must be a str (it is joined with ".txt").
    print(driver.get_followers("23302126", "flipper509"))
