"""Bottle front end that feeds follower-collection jobs to a pool of worker
processes.  Jobs arrive over HTTP, are placed on multiprocessing queues, and
the file names produced by the workers are collected on ``done_queue`` until
``/fetch`` bundles them into a tarball."""

from bottle import route, run, request, static_file
from scraper import Driver
from api import RequestHandler
from multiprocessing import Process, Queue
from Queue import Empty
from time import sleep
from json import dumps
from uuid import uuid1
import tarfile
import os.path
import os
from glob import glob

# Work queues shared between the HTTP handlers and the worker processes.
long_queue = Queue()          # user ids for large accounts (API workers only)
short_queue = Queue()         # (user_id, user_name) pairs (API or scraper workers)
lookup_queue = Queue()        # lists of user ids to resolve via the lookup API
short_lookup_queue = Queue()  # lists of (user_id, user_name) pairs to resolve
done_queue = Queue()          # file names produced by the workers
processes = []


def start():
    """(Re)create the work queues and spawn one worker process per account."""
    global long_queue, short_queue, lookup_queue, done_queue, processes, \
        short_lookup_queue
    processes = []
    long_queue = Queue()
    short_queue = Queue()
    short_lookup_queue = Queue()
    lookup_queue = Queue()

    # One API worker per line in api_accounts.txt.
    with open("api_accounts.txt") as f:
        for line in f:
            credentials = line.strip().split()[2:]
            handler = RequestHandler(*credentials)
            p = Process(target=api_target,
                        args=(handler, long_queue, short_queue, lookup_queue,
                              short_lookup_queue, done_queue))
            processes.append(p)
            p.daemon = True
            p.start()

    # One browser-scraping worker per line in scraping_accounts.txt.
    with open("scraping_accounts.txt") as f:
        for line in f:
            credentials = line.strip().split()[:2]
            driver = Driver(*credentials)
            p = Process(target=scraper_target,
                        args=(driver, short_queue, done_queue))
            processes.append(p)
            p.daemon = True
            p.start()


@route('/short_lookup', method='POST')
def short_lookup():
    query_list = request.forms.list.split(",")
    # Pair up consecutive items: [id1, name1, id2, name2] -> [(id1, name1), (id2, name2)]
    user_list = zip(*[iter(query_list)] * 2)
    short_lookup_queue.put(user_list)


@route('/restart')
def restart():
    global processes
    for p in processes:
        p.terminate()
    start()


@route('/long')
def long():
    user_id = request.query.id
    long_queue.put(user_id)


@route('/short')
def short():
    user_id = request.query.id
    user_name = request.query.user_name
    short_queue.put((user_id, user_name))


@route('/lookup', method='POST')
def lookup():
    id_list = request.forms.list.split(",")
    lookup_queue.put(id_list)


@route('/status')
def status():
    answer_dict = {
        "long": long_queue.qsize(),
        "short": short_queue.qsize(),
        "lookup": lookup_queue.qsize(),
        "short_lookup": short_lookup_queue.qsize(),
        "done": done_queue.qsize(),
        "processes": len([p for p in processes if p.is_alive()]),
        "users": len(glob("data/users/[0-9]*.txt")),
        "lookups": len(glob("data/users/lookup*.txt")),
    }
    return dumps(answer_dict)


@route('/fetch')
def fetch():
    # Remove archives left over from previous fetches.
    for filename in glob("data/users/*.tar.gz"):
        os.remove(filename)

    def get_filenames():
        # Drain the done queue without blocking.
        try:
            while True:
                yield done_queue.get(False)
        except Empty:
            pass

    filename = os.path.join("data", "users",
                            "archive-" + str(uuid1()) + ".tar.gz")
    with tarfile.open(filename, "w:gz") as tar:
        for name in get_filenames():
            tar.add(name)
    return static_file(filename, root=".")


def scraper_target(driver, short_queue, done_queue):
    """Worker loop for a browser-scraping account."""
    while True:
        try:
            user_id, user_name = short_queue.get(False)
        except Empty:
            pass
        else:
            filename = driver.get_followers(user_id, user_name)
            done_queue.put(filename)
        finally:
            sleep(0.5)


def api_target(handler, long_queue, short_queue, lookup_queue,
               short_lookup_queue, done_queue):
    """Worker loop for an API account.

    Follower jobs are taken from long_queue first, falling back to
    short_queue; lookup jobs are taken from lookup_queue first, falling back
    to short_lookup_queue.
    """
    while True:
        if handler.ready("followers"):
            try:
                user_id = long_queue.get(False)
            except Empty:
                try:
                    user_id = short_queue.get(False)[0]
                except Empty:
                    pass
                else:
                    filename = handler.get_followers(user_id)
                    done_queue.put(filename)
                    continue
            else:
                filename = handler.get_followers(user_id)
                done_queue.put(filename)
                continue
        if handler.ready("lookup"):
            try:
                users_list = lookup_queue.get(False)
            except Empty:
                try:
                    user_list = short_lookup_queue.get(False)
                except Empty:
                    pass
                else:
                    # user_list holds (user_id, user_name) pairs; look up by id.
                    filename = handler.lookup(user[0] for user in user_list)
                    done_queue.put(filename)
            else:
                filename = handler.lookup(users_list)
                done_queue.put(filename)
        else:
            try:
                user_list = short_lookup_queue.get(False)
            except Empty:
                pass
            else:
                filename = handler.short_lookup(user_list)
                done_queue.put(filename)
        sleep(0.5)  # avoid busy-waiting when nothing is ready


if __name__ == "__main__":
    import sys
    start()
    run(host="0.0.0.0", port=int(sys.argv[1]))
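# Illustrative usage only (assumes this file is saved as server.py and port
# 8080; the file name, host, port, and the ids/names below are examples, not
# values from the project).  api_accounts.txt and scraping_accounts.txt must
# exist, one account per line:
#
#   python server.py 8080
#   curl "http://localhost:8080/short?id=12345&user_name=example_user"
#   curl -d "list=12345,67890" http://localhost:8080/lookup
#   curl -d "list=12345,example_user,67890,other_user" http://localhost:8080/short_lookup
#   curl "http://localhost:8080/status"
#   curl -o followers.tar.gz "http://localhost:8080/fetch"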