from tweepy import API, OAuthHandler
from tweepy import cursor
from bs4 import BeautifulSoup

import os.path
import uuid
from time import time, sleep
from urllib import urlopen  # Python 2 urllib; urlopen moved in Python 3


class RequestHandler:
    """Thin wrapper around the Twitter REST API (via tweepy) plus a
    screen-scraping fallback, caching results as text files under data/users/.

    Rate-limit bookkeeping for the two endpoints used (/followers/ids and
    /users/lookup) is kept in self.state so callers can poll ready() before
    issuing a request.
    """

    def __init__(self, *args):
        """args = (consumer_key, consumer_secret, access_token, access_secret)."""
        auth = OAuthHandler(*args[0:2])
        auth.set_access_token(*args[2:])
        self.api = API(auth)
        # Seed per-endpoint quota state from the live rate-limit status call.
        limits = self.api.rate_limit_status()
        self.state = {}
        self.state["followers"] = limits["resources"]["followers"]["/followers/ids"]
        self.state["lookup"] = limits["resources"]["users"]["/users/lookup"]

    def _update_state(self, method):
        """Refresh self.state[method] from the x-rate-limit-* response headers
        of the last API call (yields keys 'limit', 'remaining', 'reset').

        Extracted helper: this parsing was duplicated verbatim in both
        get_followers() and lookup().
        """
        for key, value in self.api.last_response.getheaders():
            if key.startswith("x-rate-limit"):
                self.state[method][key.split("-")[-1]] = int(value)

    def __get_followers(self, user_id):
        """Yield the follower ids of user_id, page by page, via the cursor."""
        pages = cursor.Cursor(self.api.followers_ids, id=user_id).pages(1)
        for page in pages:
            for follower in page:
                yield follower

    def get_followers(self, user_id):
        """Fetch the follower ids of user_id and cache them, one id per line,
        in data/users/<user_id>.txt.  Returns the cache filename; an existing
        cache file is returned as-is without touching the API."""
        filename = os.path.join("data", "users", user_id + ".txt")
        if os.path.isfile(filename):
            return filename
        followers = list(self.__get_followers(user_id))
        with open(filename, "w") as f:
            for fid in followers:
                f.write(str(fid) + "\n")
        self._update_state("followers")
        return filename

    def __lookup(self, users_list):
        """Yield hydrated user objects for the ids in users_list."""
        for user in self.api.lookup_users(users_list):
            yield user

    def lookup(self, users_list):
        """Bulk-hydrate users_list via /users/lookup and write one line per
        user — 'id screen_name followers_count friends_count verified' — to a
        uniquely named file under data/users/.  Returns the filename."""
        uid = uuid.uuid1()
        filename = os.path.join("data", "users", "lookup-" + str(uid) + ".txt")
        users = list(self.__lookup(users_list))
        with open(filename, "w") as f:
            for user in users:
                output = " ".join([str(user.id), user.screen_name,
                                   str(user.followers_count),
                                   str(user.friends_count),
                                   str(user.verified)])
                f.write(output + "\n")
        self._update_state("lookup")
        return filename

    def get_profile(self, user_id, username):
        """Scrape follower/following counts from the public profile page
        https://twitter.com/<username> — no API quota used.

        Returns (user_id, username, followers, following); the counts are the
        raw strings displayed on the page (may contain thousands separators).
        """
        fh = urlopen("https://twitter.com/{0}".format(username))
        soup = BeautifulSoup(fh)
        ul = soup.find("ul", class_="js-mini-profile-stats")
        # The stats list is [tweets, following, followers]; skip the tweet count.
        following, followers = [li.strong.string
                                for li in ul.find_all("li")[1:]]
        return user_id, username, followers, following

    def short_lookup(self, users_list):
        """Scrape (id, name, followers, following) for every (user_id,
        username) pair in users_list, one space-separated line per user, into
        a uniquely named file under data/users/.  Returns the filename.

        Users whose page cannot be scraped are skipped (deliberate
        best-effort behavior).
        """
        uid = uuid.uuid1()
        filename = os.path.join("data", "users", "lookup-" + str(uid) + ".txt")

        def get_output():
            for user_id, username in users_list:
                try:
                    profile = self.get_profile(user_id, username)
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed; scrape failures are
                # still skipped silently, as before.
                except Exception:
                    pass
                else:
                    yield " ".join(map(str, profile))
                sleep(0.5)  # throttle the scraper to ~2 requests/second

        to_write = list(get_output())
        with open(filename, "w") as f:
            f.write("\n".join(to_write))

        return filename

    def ready(self, method):
        """Return True when the endpoint ('followers' or 'lookup') still has
        quota remaining or its rate-limit window has already reset."""
        now = int(time())
        state = self.state[method]
        return int(state["remaining"]) > 0 or int(state["reset"]) < now


if __name__ == "__main__":
    # api_accounts.txt: one account per line; the last four fields are
    # consumer_key consumer_secret access_token access_secret.
    with open("api_accounts.txt") as fh:
        credentials = fh.readline().strip().split()
    handler = RequestHandler(*credentials[2:])
    # if handler.ready("lookup"):
    #     handler.lookup(["304224106"])
    # if handler.ready("followers"):
    #     handler.lookup("304224106")
    # Known brand account ids, kept for reference:
    # starbucks 30973, bestbuy 17475575, sears 19464428, macys 50687788,
    # target 89084561, gap 18462157, mountain 9409552, coachella 688583

    user_id = "688583"  # renamed from `id`, which shadowed the builtin
    print(handler.get_followers(user_id))
    # Hydrate the cached follower ids in batches of 100 (the /users/lookup
    # maximum) and concatenate the lookup output into <id>_followers.txt.
    # All handles are now closed via `with` (they previously leaked).
    with open("data/users/{0}.txt".format(user_id)) as f, \
            open("data/users/{0}_followers.txt".format(user_id), "w") as g:
        batch = []
        for line in f:
            batch.append(line.strip())
            if len(batch) == 100:
                with open(handler.lookup(batch)) as chunk:
                    for row in chunk:
                        g.write(row)
                batch = []
        # Bug fix: the original loop dropped the final partial batch, so up
        # to 99 trailing followers were never hydrated.
        if batch:
            with open(handler.lookup(batch)) as chunk:
                for row in chunk:
                    g.write(row)