diff options
Diffstat (limited to 'main.py')
| -rw-r--r-- | main.py | 182 |
1 files changed, 182 insertions, 0 deletions
"""HTTP front end that feeds scraper and API worker processes.

Bottle routes put work items on multiprocessing queues.  Worker processes
(``scraper_target`` and ``api_target``) take items off those queues and put
the filenames they produce on ``done_queue``; ``/fetch`` bundles those files
into a tar.gz archive and serves it.
"""
from bottle import route, run, request, static_file

from scraper import Driver
from api import RequestHandler

from multiprocessing import Process, Queue
try:
    from Queue import Empty  # Python 2
except ImportError:  # Python 3 renamed the module
    from queue import Empty
from time import sleep
from json import dumps
from uuid import uuid1
import tarfile
import os.path
import os
from glob import glob


# Seconds a worker waits before polling again when it found nothing to do.
IDLE_SLEEP = 0.5

long_queue = Queue()
short_queue = Queue()
lookup_queue = Queue()
short_lookup_queue = Queue()
processes = []
done_queue = Queue()


def _read_credentials(path, fields):
    """Yield the selected whitespace-separated fields of each line of *path*.

    *fields* is a slice applied to the split line.  Lines that leave no
    fields after slicing (blank lines, for instance) are skipped instead of
    producing a worker constructed without credentials.
    """
    with open(path) as f:
        for line in f:
            credentials = line.split()[fields]
            if credentials:
                yield credentials


def _spawn(target, args):
    """Start a daemon process running *target* and record it in ``processes``."""
    p = Process(target=target, args=args)
    p.daemon = True
    processes.append(p)
    p.start()
    return p


def _poll(*queues):
    """Return ``(index, item)`` for the first queue with an item ready.

    The queues are tried in the order given, without blocking.
    ``(None, None)`` is returned when every queue is empty.
    """
    for index, queue in enumerate(queues):
        try:
            return index, queue.get(False)
        except Empty:
            continue
    return None, None


def _drain(queue):
    """Yield items from *queue* without blocking until it reports empty."""
    try:
        while True:
            yield queue.get(False)
    except Empty:
        pass


def start():
    """Create fresh work queues and launch one worker process per account.

    Each line of ``api_accounts.txt`` starts an ``api_target`` worker whose
    ``RequestHandler`` receives the line's fields from the third one onward.
    Each line of ``scraping_accounts.txt`` starts a ``scraper_target`` worker
    whose ``Driver`` receives the line's first two fields.

    ``done_queue`` is not replaced here, so filenames already reported by
    earlier workers stay available to ``/fetch``.
    """
    global long_queue, short_queue, lookup_queue, processes,\
        short_lookup_queue
    processes = []
    long_queue = Queue()
    short_queue = Queue()
    short_lookup_queue = Queue()
    lookup_queue = Queue()

    for credentials in _read_credentials("api_accounts.txt", slice(2, None)):
        handler = RequestHandler(*credentials)
        # short_lookup_queue is passed explicitly so the worker does not
        # depend on inheriting this module's globals.
        _spawn(api_target, (handler, long_queue, short_queue, lookup_queue,
                            done_queue, short_lookup_queue))

    for credentials in _read_credentials("scraping_accounts.txt",
                                         slice(None, 2)):
        driver = Driver(*credentials)
        _spawn(scraper_target, (driver, short_queue, done_queue))


@route('/short_lookup', method='POST')
def short_lookup():
    """Queue a short lookup from the comma-separated ``list`` form field.

    Consecutive values are grouped into pairs; a trailing unpaired value is
    dropped.
    """
    query_list = request.forms.list.split(",")
    # Zipping one iterator with itself consumes two values per tuple.
    # list() keeps the payload picklable on Python 3, where zip() is lazy.
    user_list = list(zip(*[iter(query_list)] * 2))
    short_lookup_queue.put(user_list)


@route('/restart')
def restart():
    """Terminate every worker process and start a fresh set.

    Items still waiting on the work queues are discarded, because ``start``
    replaces those queues.
    """
    for p in processes:
        p.terminate()
    # Reap the terminated children so they do not linger as zombies.
    for p in processes:
        p.join()
    start()


@route('/long')
def long():  # NOTE: shadows the Python 2 builtin; name kept for callers.
    """Queue the ``id`` query parameter on ``long_queue``."""
    user_id = request.query.id
    long_queue.put(user_id)


@route('/short')
def short():
    """Queue an ``(id, user_name)`` pair from the query on ``short_queue``."""
    user_id = request.query.id
    user_name = request.query.user_name
    short_queue.put((user_id, user_name))


@route('/lookup', method='POST')
def lookup():
    """Queue the comma-separated ``list`` form field on ``lookup_queue``."""
    id_list = request.forms.list.split(",")
    lookup_queue.put(id_list)


@route('/status')
def status():
    """Return queue sizes, live worker count and data file counts as JSON."""
    answer_dict = {
        "long": long_queue.qsize(),
        "short": short_queue.qsize(),
        "lookup": lookup_queue.qsize(),
        "short_lookup": short_lookup_queue.qsize(),
        "done": done_queue.qsize(),
        "processes": len([p for p in processes if p.is_alive()]),
        "users": len(glob("data/users/[0-9]*.txt")),
        "lookups": len(glob("data/users/lookup*.txt"))
    }
    return dumps(answer_dict)


@route('/fetch')
def fetch():
    """Bundle every finished file into a new tar.gz archive and serve it.

    Archives left over from earlier calls are deleted first.  The filenames
    are taken off ``done_queue``, so each finished file is archived once.
    """
    for stale in glob("data/users/*.tar.gz"):
        os.remove(stale)

    filename = os.path.join("data", "users", "archive-" +
                            str(uuid1()) + ".tar.gz")
    with tarfile.open(filename, "w:gz") as tar:
        for name in _drain(done_queue):
            # Skip entries that are empty or missing on disk: a single bad
            # entry would otherwise abort the archive after the remaining
            # names have already been taken off the queue.
            if name and os.path.exists(name):
                tar.add(name)
    return static_file(filename, root=".")


def scraper_target(driver, short_queue, done_queue):
    """Worker loop: fetch followers for ``short_queue`` items via *driver*.

    Each item is a ``(user_id, user_name)`` pair.  The value returned by
    ``driver.get_followers`` is put on *done_queue*.  The loop pauses for
    half a second after every iteration, whether or not work was found.
    """
    while True:
        try:
            user_id, user_name = short_queue.get(False)
        except Empty:
            pass
        else:
            filename = driver.get_followers(user_id, user_name)
            done_queue.put(filename)
        finally:
            sleep(0.5)


def api_target(handler, long_queue, short_queue, lookup_queue, done_queue,
               short_lookup_queue=None):
    """Worker loop: serve follower and lookup requests through *handler*.

    Priority within one iteration:

    1. If ``handler.ready("followers")``: take a user id from *long_queue*,
       falling back to the id part of a *short_queue* item.
    2. If ``handler.ready("lookup")``: take an id list from *lookup_queue*,
       falling back to the first element of each *short_lookup_queue* pair.
    3. Otherwise: pass a *short_lookup_queue* batch to
       ``handler.short_lookup``.

    Every result is put on *done_queue*.  When an iteration finds no work
    the loop sleeps for ``IDLE_SLEEP`` seconds instead of spinning.

    *short_lookup_queue* defaults to the module-level queue so that callers
    using the original five-argument form keep working.
    """
    if short_lookup_queue is None:
        short_lookup_queue = globals()["short_lookup_queue"]

    while True:
        if handler.ready("followers"):
            source, item = _poll(long_queue, short_queue)
            if source is not None:
                # short_queue items are (user_id, user_name) pairs.
                user_id = item if source == 0 else item[0]
                done_queue.put(handler.get_followers(user_id))
                continue

        if handler.ready("lookup"):
            source, item = _poll(lookup_queue, short_lookup_queue)
            if source is not None:
                # Pass a real list in both cases rather than a one-shot
                # generator for the short-lookup fallback.
                ids = item if source == 0 else [user[0] for user in item]
                done_queue.put(handler.lookup(ids))
                continue
        else:
            source, item = _poll(short_lookup_queue)
            if source is not None:
                done_queue.put(handler.short_lookup(item))
                continue

        # Nothing was ready or queued this round; avoid a busy loop.
        sleep(IDLE_SLEEP)


if __name__ == "__main__":
    import sys
    start()
    run(host="0.0.0.0", port=int(sys.argv[1]))
