summaryrefslogtreecommitdiffstats
path: root/twitter/main.py
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2014-02-02 16:53:22 -0500
committerThibaut Horel <thibaut.horel@gmail.com>2014-02-02 16:53:22 -0500
commit7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (patch)
tree323d6a9a4423b51fbebb37c115fddeab1c7a9641 /twitter/main.py
parenta0e95b0843d4e366e4b979685f7c821954afebc6 (diff)
downloadfast-seeding-7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b.tar.gz
Facebook scraping
Diffstat (limited to 'twitter/main.py')
-rw-r--r--twitter/main.py182
1 file changed, 182 insertions, 0 deletions
diff --git a/twitter/main.py b/twitter/main.py
new file mode 100644
index 0000000..be565a1
--- /dev/null
+++ b/twitter/main.py
@@ -0,0 +1,182 @@
+from bottle import route, run, request, static_file
+
+from scraper import Driver
+from api import RequestHandler
+
+from multiprocessing import Process, Queue
+from Queue import Empty
+from time import sleep
+from json import dumps
+from uuid import uuid1
+import tarfile
+import os.path
+import os
+from glob import glob
+
+
+long_queue = Queue()
+short_queue = Queue()
+lookup_queue = Queue()
+short_lookup_queue = Queue()
+processes = []
+done_queue = Queue()
+
+
+def start():
+ global long_queue, short_queue, lookup_queue, done_queue, processes,\
+ short_lookup_queue
+ processes = []
+ long_queue = Queue()
+ short_queue = Queue()
+ short_lookup_queue = Queue()
+ lookup_queue = Queue()
+ with open("api_accounts.txt") as f:
+ for line in f:
+ credentials = line.strip().split()[2:]
+ handler = RequestHandler(*credentials)
+ p = Process(target=api_target, args=(handler, long_queue,
+ short_queue,
+ lookup_queue, done_queue))
+ processes.append(p)
+ p.daemon = True
+ p.start()
+
+ with open("scraping_accounts.txt") as f:
+ for line in f:
+ credentials = line.strip().split()[:2]
+ driver = Driver(*credentials)
+ p = Process(target=scraper_target, args=(driver, short_queue,
+ done_queue))
+ processes.append(p)
+ p.daemon = True
+ p.start()
+
+
+@route('/short_lookup', method='POST')
+def short_lookup():
+ query_list = request.forms.list.split(",")
+ user_list = zip(*[iter(query_list)] * 2) # this is dark magic
+ short_lookup_queue.put(user_list)
+
+
+@route('/restart')
+def restart():
+ global processes
+ for p in processes:
+ p.terminate()
+ start()
+
+
+@route('/long')
+def long():
+ user_id = request.query.id
+ long_queue.put(user_id)
+
+
+@route('/short')
+def short():
+ user_id = request.query.id
+ user_name = request.query.user_name
+ short_queue.put((user_id, user_name))
+
+
+@route('/lookup', method='POST')
+def lookup():
+ id_list = request.forms.list.split(",")
+ lookup_queue.put(id_list)
+
+
+@route('/status')
+def status():
+ answer_dict = {
+ "long": long_queue.qsize(),
+ "short": short_queue.qsize(),
+ "lookup": lookup_queue.qsize(),
+ "short_lookup": short_lookup_queue.qsize(),
+ "done": done_queue.qsize(),
+ "processes": len([p for p in processes if p.is_alive()]),
+ "users": len(glob("data/users/[0-9]*.txt")),
+ "lookups": len(glob("data/users/lookup*.txt"))
+ }
+ return dumps(answer_dict)
+
+
+@route('/fetch')
+def fetch():
+ for filename in glob("data/users/*.tar.gz"):
+ os.remove(filename)
+
+ def get_filenames():
+ try:
+ while True:
+ yield done_queue.get(False)
+ except Empty:
+ pass
+
+ filename = os.path.join("data", "users", "archive-"
+ + str(uuid1()) + ".tar.gz")
+ with tarfile.open(filename, "w:gz") as tar:
+ for name in get_filenames():
+ tar.add(name)
+ return static_file(filename, root=".")
+
+
+def scraper_target(driver, short_queue, done_queue):
+ while True:
+ try:
+ user_id, user_name = short_queue.get(False)
+ except Empty:
+ pass
+ else:
+ filename = driver.get_followers(user_id, user_name)
+ done_queue.put(filename)
+ finally:
+ sleep(0.5)
+
+
+def api_target(handler, long_queue, short_queue, lookup_queue, done_queue):
+ while True:
+ if handler.ready("followers"):
+ try:
+ user_id = long_queue.get(False)
+ except Empty:
+ try:
+ user_id = short_queue.get(False)[0]
+ except Empty:
+ pass
+ else:
+ filename = handler.get_followers(user_id)
+ done_queue.put(filename)
+ continue
+ else:
+ filename = handler.get_followers(user_id)
+ done_queue.put(filename)
+ continue
+ if handler.ready("lookup"):
+ try:
+ users_list = lookup_queue.get(False)
+ except Empty:
+ try:
+ user_list = short_lookup_queue.get(False)
+ except Empty:
+ pass
+ else:
+ filename = handler.lookup(user[0] for user in user_list)
+ done_queue.put(filename)
+ else:
+ filename = handler.lookup(users_list)
+ done_queue.put(filename)
+ else:
+ try:
+ user_list = short_lookup_queue.get(False)
+ except Empty:
+ pass
+ else:
+ filename = handler.short_lookup(user_list)
+ done_queue.put(filename)
+
+
+if __name__ == "__main__":
+ import sys
+ start()
+ run(host="0.0.0.0", port=int(sys.argv[1]))