summaryrefslogtreecommitdiffstats
path: root/main.py
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2014-02-02 16:53:22 -0500
committerThibaut Horel <thibaut.horel@gmail.com>2014-02-02 16:53:22 -0500
commit7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (patch)
tree323d6a9a4423b51fbebb37c115fddeab1c7a9641 /main.py
parenta0e95b0843d4e366e4b979685f7c821954afebc6 (diff)
downloadfast-seeding-7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b.tar.gz
Facebook scraping
Diffstat (limited to 'main.py')
-rw-r--r--main.py182
1 files changed, 0 insertions, 182 deletions
diff --git a/main.py b/main.py
deleted file mode 100644
index be565a1..0000000
--- a/main.py
+++ /dev/null
@@ -1,182 +0,0 @@
-from bottle import route, run, request, static_file
-
-from scraper import Driver
-from api import RequestHandler
-
-from multiprocessing import Process, Queue
-from Queue import Empty
-from time import sleep
-from json import dumps
-from uuid import uuid1
-import tarfile
-import os.path
-import os
-from glob import glob
-
-
-long_queue = Queue()
-short_queue = Queue()
-lookup_queue = Queue()
-short_lookup_queue = Queue()
-processes = []
-done_queue = Queue()
-
-
-def start():
- global long_queue, short_queue, lookup_queue, done_queue, processes,\
- short_lookup_queue
- processes = []
- long_queue = Queue()
- short_queue = Queue()
- short_lookup_queue = Queue()
- lookup_queue = Queue()
- with open("api_accounts.txt") as f:
- for line in f:
- credentials = line.strip().split()[2:]
- handler = RequestHandler(*credentials)
- p = Process(target=api_target, args=(handler, long_queue,
- short_queue,
- lookup_queue, done_queue))
- processes.append(p)
- p.daemon = True
- p.start()
-
- with open("scraping_accounts.txt") as f:
- for line in f:
- credentials = line.strip().split()[:2]
- driver = Driver(*credentials)
- p = Process(target=scraper_target, args=(driver, short_queue,
- done_queue))
- processes.append(p)
- p.daemon = True
- p.start()
-
-
-@route('/short_lookup', method='POST')
-def short_lookup():
- query_list = request.forms.list.split(",")
- user_list = zip(*[iter(query_list)] * 2) # this is dark magic
- short_lookup_queue.put(user_list)
-
-
-@route('/restart')
-def restart():
- global processes
- for p in processes:
- p.terminate()
- start()
-
-
-@route('/long')
-def long():
- user_id = request.query.id
- long_queue.put(user_id)
-
-
-@route('/short')
-def short():
- user_id = request.query.id
- user_name = request.query.user_name
- short_queue.put((user_id, user_name))
-
-
-@route('/lookup', method='POST')
-def lookup():
- id_list = request.forms.list.split(",")
- lookup_queue.put(id_list)
-
-
-@route('/status')
-def status():
- answer_dict = {
- "long": long_queue.qsize(),
- "short": short_queue.qsize(),
- "lookup": lookup_queue.qsize(),
- "short_lookup": short_lookup_queue.qsize(),
- "done": done_queue.qsize(),
- "processes": len([p for p in processes if p.is_alive()]),
- "users": len(glob("data/users/[0-9]*.txt")),
- "lookups": len(glob("data/users/lookup*.txt"))
- }
- return dumps(answer_dict)
-
-
-@route('/fetch')
-def fetch():
- for filename in glob("data/users/*.tar.gz"):
- os.remove(filename)
-
- def get_filenames():
- try:
- while True:
- yield done_queue.get(False)
- except Empty:
- pass
-
- filename = os.path.join("data", "users", "archive-"
- + str(uuid1()) + ".tar.gz")
- with tarfile.open(filename, "w:gz") as tar:
- for name in get_filenames():
- tar.add(name)
- return static_file(filename, root=".")
-
-
-def scraper_target(driver, short_queue, done_queue):
- while True:
- try:
- user_id, user_name = short_queue.get(False)
- except Empty:
- pass
- else:
- filename = driver.get_followers(user_id, user_name)
- done_queue.put(filename)
- finally:
- sleep(0.5)
-
-
-def api_target(handler, long_queue, short_queue, lookup_queue, done_queue):
- while True:
- if handler.ready("followers"):
- try:
- user_id = long_queue.get(False)
- except Empty:
- try:
- user_id = short_queue.get(False)[0]
- except Empty:
- pass
- else:
- filename = handler.get_followers(user_id)
- done_queue.put(filename)
- continue
- else:
- filename = handler.get_followers(user_id)
- done_queue.put(filename)
- continue
- if handler.ready("lookup"):
- try:
- users_list = lookup_queue.get(False)
- except Empty:
- try:
- user_list = short_lookup_queue.get(False)
- except Empty:
- pass
- else:
- filename = handler.lookup(user[0] for user in user_list)
- done_queue.put(filename)
- else:
- filename = handler.lookup(users_list)
- done_queue.put(filename)
- else:
- try:
- user_list = short_lookup_queue.get(False)
- except Empty:
- pass
- else:
- filename = handler.short_lookup(user_list)
- done_queue.put(filename)
-
-
-if __name__ == "__main__":
- import sys
- start()
- run(host="0.0.0.0", port=int(sys.argv[1]))