# Reconstructed from a cgit patch: commit
# 7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (Thibaut Horel, 2014-02-02,
# subject "Facebook scraping"), which deleted api.py (101 lines).
"""Fetch Twitter follower ids and profile counts and store them on disk.

``RequestHandler`` wraps a tweepy ``API`` object, remembers the rate-limit
counters reported for the two endpoints it uses, and writes every result
to a text file under ``data/users``.
"""

import os.path
import uuid
from contextlib import closing
from time import sleep, time

try:  # Python 2 location
    from urllib import urlopen
except ImportError:  # Python 3 location
    from urllib.request import urlopen

from bs4 import BeautifulSoup
from tweepy import API, OAuthHandler
from tweepy import cursor

# Directory (relative to the working directory) receiving all output files.
USERS_DIR = os.path.join("data", "users")


class RequestHandler:
    """Authenticated Twitter client that persists its results to files."""

    def __init__(self, *args):
        """Authenticate and read the initial rate-limit status.

        The first two positional arguments are handed to ``OAuthHandler``
        and the remaining ones to ``set_access_token`` (presumably consumer
        key/secret followed by access token/secret -- confirm against the
        credentials file layout).
        """
        auth = OAuthHandler(*args[0:2])
        auth.set_access_token(*args[2:])
        self.api = API(auth)
        resources = self.api.rate_limit_status()["resources"]
        # Per-method rate-limit counters; ``ready`` reads the "remaining"
        # and "reset" entries of these dicts.
        self.state = {
            "followers": resources["followers"]["/followers/ids"],
            "lookup": resources["users"]["/users/lookup"],
        }

    def _update_limits(self, method):
        """Copy the ``x-rate-limit-*`` headers of the last response into
        ``self.state[method]``, keyed by the header's last dash-separated
        component and converted to ``int``."""
        for key, value in self.api.last_response.getheaders():
            if key.startswith("x-rate-limit"):
                self.state[method][key.split("-")[-1]] = int(value)

    @staticmethod
    def _new_lookup_filename():
        """Return a fresh ``data/users/lookup-<uuid1>.txt`` path."""
        return os.path.join(USERS_DIR, "lookup-" + str(uuid.uuid1()) + ".txt")

    def __get_followers(self, user_id):
        """Yield the follower ids found in the first cursor page."""
        pages = cursor.Cursor(self.api.followers_ids, id=user_id).pages(1)
        for page in pages:
            for follower in page:
                yield follower

    def get_followers(self, user_id):
        """Write the follower ids of ``user_id`` to a file, one per line.

        If ``data/users/<user_id>.txt`` already exists it is reused and no
        request is made. Returns the path of the file.
        """
        # str() so that integer ids work as well as string ids.
        filename = os.path.join(USERS_DIR, str(user_id) + ".txt")
        if os.path.isfile(filename):
            return filename
        followers = list(self.__get_followers(user_id))
        with open(filename, "w") as f:
            for fid in followers:
                f.write(str(fid) + "\n")
        self._update_limits("followers")
        return filename

    def __lookup(self, users_list):
        """Yield the user objects returned by ``lookup_users``."""
        for user in self.api.lookup_users(users_list):
            yield user

    def lookup(self, users_list):
        """Look up ``users_list`` through the API and store the result.

        Each output line holds ``id screen_name followers_count
        friends_count`` separated by single spaces. Returns the path of the
        newly created file.
        """
        filename = self._new_lookup_filename()
        users = list(self.__lookup(users_list))
        with open(filename, "w") as f:
            for user in users:
                output = " ".join([str(user.id), user.screen_name,
                                   str(user.followers_count),
                                   str(user.friends_count)])
                f.write(output + "\n")
        self._update_limits("lookup")
        return filename

    def get_profile(self, user_id, username):
        """Scrape the public profile page of ``username``.

        Returns ``(user_id, username, followers, following)``; the two
        counts are the text scraped from the page, not parsed integers.
        Fails with ``AttributeError`` when the page has no
        ``js-mini-profile-stats`` list.
        """
        url = "https://twitter.com/{0}".format(username)
        # closing() releases the connection on both Python 2 and 3.
        with closing(urlopen(url)) as fh:
            soup = BeautifulSoup(fh)
        ul = soup.find("ul", class_="js-mini-profile-stats")
        # The first <li> is skipped; the next two are unpacked as the
        # following count and the followers count, in that order.
        following, followers = [li.strong.string
                                for li in ul.find_all("li")[1:]]
        return user_id, username, followers, following

    def short_lookup(self, users_list):
        """Scrape the profile of every ``(user_id, username)`` pair.

        One space-separated line per user is written (lines joined with
        newlines, no trailing newline). Sleeps half a second after each
        profile. Returns the path of the newly created file.
        """
        filename = self._new_lookup_filename()
        to_write = []
        for user_id, username in users_list:
            profile = self.get_profile(user_id, username)
            to_write.append(" ".join(map(str, profile)))
            sleep(0.5)  # pause between page fetches
        with open(filename, "w") as f:
            f.write("\n".join(to_write))
        return filename

    def ready(self, method):
        """Return True if ``method`` ("followers" or "lookup") has calls
        remaining or its recorded reset time is already in the past."""
        limits = self.state[method]
        now = int(time())
        return int(limits["remaining"]) > 0 or int(limits["reset"]) < now


def main():
    """Run a sample scrape using the first line of ``api_accounts.txt``."""
    with open("api_accounts.txt") as f:
        credentials = f.readline().strip().split()
    # The first two whitespace-separated fields of the line are not
    # passed to the handler.
    handler = RequestHandler(*credentials[2:])
    print(handler.short_lookup([("000", "thibauthorel")]))


if __name__ == "__main__":
    main()