"""Stream tweets that mention a set of concepts into per-concept text files.

Provenance: this is ``stream.py`` as recorded in the deletion hunk of commit
7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b ("Facebook scraping", Thibaut Horel,
2014-02-02), reflowed into regular source layout.

Usage::

    stream.py CONCEPTS_FILE CREDENTIALS_FILE
"""
from tweepy import StreamListener, OAuthHandler, Stream

from itertools import chain
from datetime import datetime
import sys
import os


class Listener(StreamListener):
    """Stream listener that appends one line per matching tweet to a file.

    One output file is opened per concept, named ``<concept>_<timestamp>.txt``
    in the current working directory, where the timestamp is the ISO-8601
    time (second resolution) at which the listener was created.
    """

    def __init__(self, *args, **kwargs):
        """Open one output file per concept.

        Args:
            *args: Forwarded unchanged to ``StreamListener.__init__``.
            **kwargs: Must contain ``concepts``, an iterable of concept
                strings; every other keyword is forwarded to
                ``StreamListener.__init__``.

        Raises:
            KeyError: If the ``concepts`` keyword is missing.
        """
        concepts = kwargs.pop("concepts")
        # Assigned before anything else can raise so that __del__ always
        # finds the attribute, even on a half-constructed instance.
        self.fhandlers = {}
        super(Listener, self).__init__(*args, **kwargs)
        date = datetime.now().replace(microsecond=0).isoformat()
        for concept in concepts:
            # get_concepts() lowercases what it extracts from a tweet, so the
            # lookup key must be lowercase too or the concept never matches.
            key = concept.lower()
            if key in self.fhandlers:
                # Re-opening the same file in "w" mode would leak a handle.
                continue
            self.fhandlers[key] = open(concept + "_{0}.txt".format(date), "w")

    def __del__(self, *args, **kwargs):
        """Close every output file; extra arguments are accepted and ignored."""
        for fh in getattr(self, "fhandlers", {}).values():
            fh.close()

    def get_concepts(self, entities):
        """Return the lowercased hashtags and mentioned screen names.

        Args:
            entities: Mapping with a ``"hashtags"`` list (items carrying a
                ``"text"`` key) and a ``"user_mentions"`` list (items
                carrying a ``"screen_name"`` key).

        Returns:
            A set of lowercase strings without ``#`` / ``@`` prefixes.
        """
        hashtags = (hashtag["text"].lower()
                    for hashtag in entities["hashtags"])
        users = (user["screen_name"].lower()
                 for user in entities["user_mentions"])
        return set(chain(hashtags, users))

    def on_status(self, tweet):
        """Write the tweet author's details to each matching concept file.

        The line holds, space separated: user id, screen name, followers
        count, friends count, verified flag and the tweet creation time in
        ISO format. Each file is flushed after every write.
        """
        concepts = self.get_concepts(tweet.entities)
        output = " ".join([str(tweet.user.id), tweet.user.screen_name,
                           str(tweet.user.followers_count),
                           str(tweet.user.friends_count),
                           str(tweet.user.verified),
                           tweet.created_at.isoformat()])
        for concept in concepts:
            if concept in self.fhandlers:
                fh = self.fhandlers[concept]
                fh.write(output + "\n")
                fh.flush()


def process(filename, cred_file):
    """Track every concept listed in *filename* and log matching tweets.

    Args:
        filename: Path of a text file with one concept per line; blank lines
            are skipped.
        cred_file: Path of a text file whose first line holds
            whitespace-separated credentials. Fields 3 and 4 are passed to
            ``OAuthHandler`` and the remaining fields to
            ``set_access_token``. NOTE(review): the meaning of the first two
            fields is not visible from this module.

    Side effects:
        Changes the working directory to ``data`` (which must already exist)
        before the output files are created.
    """
    with open(filename) as f:
        concepts = [line.strip() for line in f if line.strip()]
    # Both input paths are read before os.chdir so relative paths resolve
    # against the directory the script was started from.
    with open(cred_file) as f:
        credentials = f.readline().split()
    os.chdir("data")
    # Each concept is tracked both as a hashtag and as a user mention.
    track = list(chain.from_iterable(
        ("#" + concept, "@" + concept) for concept in concepts))
    auth = OAuthHandler(*credentials[2:4])
    auth.set_access_token(*credentials[4:])
    listener = Listener(concepts=concepts)
    stream = Stream(auth, listener)
    stream.filter(track=track)


if __name__ == '__main__':
    # Check the argument count up front instead of catching IndexError around
    # the whole run, which would misreport unrelated IndexErrors as misuse.
    if len(sys.argv) < 3:
        print("usage: {0} CONCEPTS_FILE CREDENTIALS_FILE".format(sys.argv[0]))
    else:
        process(sys.argv[1], sys.argv[2])