summary | refs | log | tree | commit | diff | stats
path: root/stream.py
diff options
context:
space:
mode:
author: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-02 16:53:22 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2014-02-02 16:53:22 -0500
commit: 7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (patch)
tree: 323d6a9a4423b51fbebb37c115fddeab1c7a9641 /stream.py
parent: a0e95b0843d4e366e4b979685f7c821954afebc6 (diff)
download: fast-seeding-7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b.tar.gz
Facebook scraping
Diffstat (limited to 'stream.py')
-rw-r--r-- stream.py 62
1 files changed, 0 insertions, 62 deletions
diff --git a/stream.py b/stream.py
deleted file mode 100644
index 71cf615..0000000
--- a/stream.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from tweepy import StreamListener, OAuthHandler, Stream
-
-from itertools import chain
-from datetime import datetime
-import sys
-import os
-
-
-class Listener(StreamListener):
-
- def __init__(self, *args, **kwargs):
- copy = kwargs.copy()
- del copy["concepts"]
- super(Listener, self).__init__(*args, **copy)
- date = datetime.now().replace(microsecond=0).isoformat()
- self.fhandlers = {concept: open(concept + "_{0}.txt".format(date), "w")
- for concept in kwargs["concepts"]}
-
- def __del__(self, *args, **kwargs):
- super(Listener, self).__init__(*args, **kwargs)
- for fh in self.fhandlers.itervalues():
- fh.close()
-
- def get_concepts(self, entities):
- hashtags = (hashtag["text"].lower()
- for hashtag in entities["hashtags"])
- users = (user["screen_name"].lower()
- for user in entities["user_mentions"])
- return set(chain(hashtags, users))
-
- def on_status(self, tweet):
- concepts = self.get_concepts(tweet.entities)
- output = " ".join([str(tweet.user.id), tweet.user.screen_name,
- str(tweet.user.followers_count),
- str(tweet.user.friends_count),
- str(tweet.user.verified),
- tweet.created_at.isoformat()])
- for concept in concepts:
- if concept in self.fhandlers:
- fh = self.fhandlers[concept]
- fh.write(output + "\n")
- fh.flush()
-
-
-def process(filename, cred_file):
- with open(filename) as f:
- concepts = [line.strip() for line in f]
- credentials = open(cred_file).readline().strip().split()
- os.chdir("data")
- entities = [("#" + concept, "@" + concept) for concept in concepts]
- track = chain.from_iterable(entities)
- auth = OAuthHandler(*credentials[2:4])
- auth.set_access_token(*credentials[4:])
- listener = Listener(concepts=concepts)
- stream = Stream(auth, listener)
- stream.filter(track=track)
-
-if __name__ == '__main__':
- try:
- process(sys.argv[1], sys.argv[2])
- except IndexError:
- print "{0} <concept_file> <credentials_file>".format(sys.argv[0])