| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-10-24 12:16:51 -0400 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-10-24 12:16:51 -0400 |
| commit | ece1d828d53d6123fcecb5ea8bf9b126d1728ccc | |
| tree | b669382d0e5f1234556d1aeb7fa919891510b24d | /facebook_scraping |
| parent | 7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b | |
| download | fast-seeding-ece1d828d53d6123fcecb5ea8bf9b126d1728ccc.tar.gz | |
Add code
Diffstat (limited to 'facebook_scraping')
| mode | file | lines added |
|---|---|---|
| -rw-r--r-- | facebook_scraping/Makefile | 49 |
| -rw-r--r-- | facebook_scraping/client/Makefile | 15 |
| -rw-r--r-- | facebook_scraping/client/__init__.py | 0 |
| -rw-r--r-- | facebook_scraping/client/requirements.txt | 4 |
| -rw-r--r-- | facebook_scraping/client/tasks.py | 243 |
| -rw-r--r-- | facebook_scraping/limits.py | 6 |
| -rw-r--r-- | facebook_scraping/mturk.py | 16 |
| -rw-r--r-- | facebook_scraping/run.py | 91 |
| -rw-r--r-- | facebook_scraping/run2.py | 90 |
| -rw-r--r-- | facebook_scraping/seed.py | 7 |
| -rw-r--r-- | facebook_scraping/server.py | 16 |
11 files changed, 537 insertions, 0 deletions
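
For orientation: this commit wires up a small distributed scraper. The top-level Makefile provisions EC2 hosts and deploys over pssh; client/ holds the Celery worker code, which drives a logged-in Chrome session under Xvfb; server.py is a Bottle app handing each worker the Facebook credentials matching its IP (from credentials.txt); run.py and run2.py dispatch scraping tasks and collect results; limits.py throttles individual workers. A hedged dispatch sketch follows the diff below.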
```diff
diff --git a/facebook_scraping/Makefile b/facebook_scraping/Makefile
new file mode 100644
index 0000000..fced427
--- /dev/null
+++ b/facebook_scraping/Makefile
@@ -0,0 +1,49 @@
+SHELL=/bin/bash
+HOSTS=servers.txt
+USER=ubuntu
+OPTIONS=-x -"F ./ssh_config"
+FOPTIONS=$(OPTIONS) -h <(cut -f1 $(HOSTS))
+
+.PHONY: deploy servers
+
+servers_simple:
+	ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+
+servers:
+	ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+	paste <(cut -f2 servers.txt) <(cut -f28,29 survey8a.txt) > credentials.txt
+	rsync credentials.txt horel.org:kdd/
+
+servers2:
+	ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+	paste <(cut -f2 servers.txt) fb_accounts2.txt > credentials.txt
+	rsync credentials.txt horel.org:kdd/
+
+uptime:
+	pssh $(FOPTIONS) 'uptime'
+
+running:
+	pssh -i $(FOPTIONS) 'pgrep -f "celery worker"'
+
+deploy:
+	cd client; tar -czf facebook.tar.gz requirements.txt tasks.py
+	cd client; rsync facebook.tar.gz Makefile horel.org:public_html/facebook
+	pssh -i $(FOPTIONS) 'rm -rf tasks.py tasks.pyc kdd/; curl http://thibaut.horel.org/facebook/Makefile > Makefile; make boostrap'
+
+run:
+	pssh -i $(FOPTIONS) 'make run'
+
+stop:
+	pssh -i $(FOPTIONS) "make stop; killall chromedriver; killall chromium-browser; killall Xvfb; rm -f tasks.pyc"
+
+restart:
+	pssh $(FOPTIONS) "make restart"
+
+test:
+	pssh -i $(FOPTIONS) 'rm -f tasks.pyc; grep "replace" tasks.py'
+
+deploy_server:
+	rsync run.py run2.py server.py credentials.txt horel.org:kdd/
+
+
+
diff --git a/facebook_scraping/client/Makefile b/facebook_scraping/client/Makefile
new file mode 100644
index 0000000..3a07802
--- /dev/null
+++ b/facebook_scraping/client/Makefile
@@ -0,0 +1,15 @@
+all: boostrap run
+
+boostrap:
+	curl http://thibaut.horel.org/facebook/facebook.tar.gz > facebook.tar.gz
+	tar -xzf facebook.tar.gz
+
+run:
+	celery -A tasks --concurrency=2 worker --detach -l info
+
+stop:
+	rm -f celeryd.pid
+	pgrep -f "celery worker" | xargs kill -9
+
+restart:
+	pgrep -f "celery worker" | xargs kill -HUP
diff --git a/facebook_scraping/client/__init__.py b/facebook_scraping/client/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/facebook_scraping/client/__init__.py
diff --git a/facebook_scraping/client/requirements.txt b/facebook_scraping/client/requirements.txt
new file mode 100644
index 0000000..cba9c1f
--- /dev/null
+++ b/facebook_scraping/client/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+celery
+selenium
+xvfbwrapper
diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py
new file mode 100644
index 0000000..4557968
--- /dev/null
+++ b/facebook_scraping/client/tasks.py
@@ -0,0 +1,243 @@
+from xvfbwrapper import Xvfb
+from selenium import webdriver
+from selenium.common.exceptions import ElementNotVisibleException,\
+    NoSuchElementException, StaleElementReferenceException, WebDriverException
+from time import sleep
+from bs4 import BeautifulSoup, NavigableString
+from celery import Celery, Task
+from urllib2 import urlopen
+import socket
+
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+app.conf.CELERY_RESULT_BACKEND = 'rpc'
+app.conf.CELERY_ENABLE_UTC = True
+app.conf.CELERY_ACKS_LATE = True
+drivers = [None]
+ip = socket.gethostbyname(socket.gethostname())
+
+
+def strip(url):
+    if url.endswith("/friends"):
+        return url[:-8]
+    else:
+        return url.split("&")[0]
+
+
+def normalize(url):
+    if "profile.php" in url:
+        basename = url.split("&")[0]
+        fname = basename.split("=")[-1]
+        getname = basename + "&sk=friends"
+    else:
+        basename = url.split("?")[0]
+        fname = basename.split("/")[-1]
+        getname = basename + "/friends"
+    return basename, fname, getname
+
+
+class ListFollowers(Task):
+
+    @property
+    def driver(self):
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"friends": [], "for": url, "orig": ip}
+
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"friends": [], "for": url, "orig": ip}
+            else:
+                return {"friends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        return {"friends": [li.a["href"]
+                            for li in soup.findAll("li", class_="_698")],
+                "for": url,
+                "orig": ip}
+
+
+class NumFollowers(Task):
+
+    @property
+    def driver(self):
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"nfriends": 0, "for": url, "orig": ip}
+
+        for i in xrange(20):
+            try:
+                box = self.driver.find_element_by_class_name("_1f8g")
+            except (NoSuchElementException, ElementNotVisibleException):
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"nfriends": 0, "for": url, "orig": ip}
+            else:
+                return {"nfriends": None, "for": url, "orig": ip}
+
+        soup = BeautifulSoup(box.get_attribute("outerHTML"))
+        a = soup.find("a", class_="uiLinkSubtle")
+        try:
+            n_friends = int(a.string.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+        except ValueError:
+            n_friends = a.string
+        print n_friends
+        return {"nfriends": n_friends,
+                "for": url,
+                "orig": ip}
+
+
+class Likes(Task):
+
+    @property
+    def driver(self):
+        if drivers[0] is None:
+            uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+            vdisplay = Xvfb()
+            vdisplay.start()
+            driver = webdriver.Chrome()
+            driver.get("https://facebook.com")
+            driver.find_element_by_id("email").send_keys(uname)
+            elem = driver.find_element_by_id("pass")
+            elem.send_keys(passwd)
+            elem.submit()
+            drivers[0] = driver
+        return drivers[0]
+
+    def run(self, url):
+        try:
+            self.driver.get(url)
+        except WebDriverException:
+            return {"likes": [], "for": url, "orig": ip}
+
+        while True:
+            for _ in xrange(5):
+                try:
+                    footer = self.driver.find_element_by_class_name("_359")
+                except (NoSuchElementException, ElementNotVisibleException):
+                    sleep(0.1)
+                else:
+                    break
+            else:
+                break
+
+            try:
+                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                footer.click()
+            except StaleElementReferenceException:
+                sleep(0.1)
+            except WebDriverException:
+                for _ in xrange(5):
+                    try:
+                        footer.click()
+                    except (WebDriverException, StaleElementReferenceException):
+                        sleep(0.1)
+                    else:
+                        break
+                else:
+                    break
+
+        for _ in xrange(5):
+            try:
+                div = self.driver.find_element_by_class_name("_30f")
+            except NoSuchElementException:
+                sleep(0.1)
+            else:
+                break
+        else:
+            try:
+                self.driver.find_element_by_id("loginbutton")
+            except NoSuchElementException:
+                return {"likes": "", "for": url, "orig": ip}
+            else:
+                return {"likes": None, "for": url, "orig": ip}
+
+        def clean(a):
+            for child in a.children:
+                if type(child) == NavigableString:
+                    return child
+                else:
+                    return ""
+            return ""
+
+        soup = BeautifulSoup(div.get_attribute("outerHTML"))
+        likes = [clean(li.find("a", class_="_gx7"))
+                 for li in soup.findAll("li", class_="_5rz")]
+        return {"likes": u"\t".join(likes).encode("utf8"),
+                "for": url,
+                "orig": ip}
+
+if __name__ == "__main__":
+    nf = Likes()
+    with open("toto.txt", "w") as f:
+        f.write(u"\t".join(nf.run("https://www.facebook.com/grvgaba29"
+                "/video_tv_show_favorite")["likes"]).encode("utf8") + "\n")
diff --git a/facebook_scraping/limits.py b/facebook_scraping/limits.py
new file mode 100644
index 0000000..8ce38cf
--- /dev/null
+++ b/facebook_scraping/limits.py
@@ -0,0 +1,6 @@
+from celery import Celery
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+print app.control.rate_limit('tasks.NumFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True)
+print app.control.rate_limit('tasks.ListFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True)
+print app.control.rate_limit('tasks.NumFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True)
+print app.control.rate_limit('tasks.ListFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True)
diff --git a/facebook_scraping/mturk.py b/facebook_scraping/mturk.py
new file mode 100644
index 0000000..f6322da
--- /dev/null
+++ b/facebook_scraping/mturk.py
@@ -0,0 +1,16 @@
+import csv
+import os.path as op
+from glob import glob
+
+for fname in glob("*.csv"):
+    with open(fname) as f:
+        reader = csv.reader(f)
+        oname, _ = op.splitext(fname)
+        oname = oname + ".txt"
+        with open(oname, "w") as of:
+            for i, row in enumerate(reader):
+                if i == 0:
+                    continue
+                if row[-1] == "":
+                    row = row[:-1]
+                of.write("\t".join(row) + "\n")
diff --git a/facebook_scraping/run.py b/facebook_scraping/run.py
new file mode 100644
index 0000000..94eb1a4
--- /dev/null
+++ b/facebook_scraping/run.py
@@ -0,0 +1,91 @@
+from tasks import NumFollowers, ListFollowers, normalize, strip
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+
+users = {}
+try:
+    with open(sys.argv[1]) as f:
+        for line in f:
+            values = line.strip().split()
+            users[values[0]] = int(values[1].replace(",", "").replace(".",
+                                   "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+    pass
+
+output = open(sys.argv[1], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+    users[user] = degree
+    output.write(user + " " + str(degree) + "\n")
+
+
+def call_back(tid, value):
+    print datetime.now().isoformat() + " " + str(value)
+    if "nfriends" in value:
+        if value["nfriends"] is None:
+            bad.write(value["orig"] + "\n")
+            bad.flush()
+            return
+        basename, fname, getname = normalize(value["for"])
+        n_friends = int(str(value["nfriends"]).replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+        add_user(fname, n_friends)
+        return
+
+if sys.argv[4] == "True":
+    todo = ResultSet([])
+    soup = BeautifulSoup(open(sys.argv[2]))
+    links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+    chunk = []
+    for link in links:
+        basename, finame, getname = normalize(link)
+        if op.isfile("facebook/" + finame):
+            with open("facebook/" + finame) as f:
+                for line in f:
+                    basename, fname, getname = normalize(line.strip())
+                    if fname not in users:
+                        print finame
+                        todo.add(nf.delay(basename))
+    todo.join_native(callback=call_back)
+todo = []
+
+
+def call_back_fd(tid, value):
+    print datetime.now().isoformat() + " " + str(value)
+    if value["friends"] is None:
+        bad.write(value["orig"] + "\n")
+        bad.flush()
+        return
+    basename, fname, getname = normalize(strip(value["for"]))
+    add_user(fname, len(value["friends"]))
+    with open("facebook/" + fname, "w") as f:
+        for friend in value["friends"]:
+            basename, fname, getname = normalize(friend)
+            f.write(basename + "\n")
+            if fname not in users:
+                todo.append(basename)
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+    basename, fname, getname = normalize(link)
+    if not op.isfile("facebook/" + fname):
+        chunk.append(getname)
+    if len(chunk) == int(sys.argv[3]):
+        todofd = ResultSet([])
+        for name in chunk:
+            todofd.add(lf.delay(name))
+        chunk = []
+        todofd.join_native(callback=call_back_fd)
+        todos = ResultSet([])
+        for name in todo:
+            todos.add(nf.delay(name))
+        todo = []
+        todos.join_native(callback=call_back)
diff --git a/facebook_scraping/run2.py b/facebook_scraping/run2.py
new file mode 100644
index 0000000..a52a37b
--- /dev/null
+++ b/facebook_scraping/run2.py
@@ -0,0 +1,90 @@
+from tasks import NumFollowers, ListFollowers, normalize, Likes
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+likes = Likes()
+
+users = {}
+try:
+    with open(sys.argv[1]) as f:
+        for line in f:
+            values = line.strip().split()
+            users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+    pass
+
+users_likes = {}
+try:
+    with open(sys.argv[3]) as f:
+        for line in f:
+            values = line.strip().split()
+            users_likes[values[0]] = True
+except IOError:
+    pass
+
+output = open(sys.argv[3], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+    users[user] = degree
+    output.write(user + " " + str(degree) + "\n")
+
+
+def add_user2(user, likes):
+    output.write(user + "\t" + likes + "\n")
+
+
+def strip2(url):
+    l = "/video_tv_show_favorite"
+    if url.endswith(l):
+        return url[:-len(l)]
+    else:
+        return url.split("&")[0]
+
+
+def call_back(tid, value):
+    print datetime.now().isoformat() + " " + str(value)
+    if "likes" in value:
+        if value["likes"] is None:
+            bad.write(value["orig"] + "\n")
+            bad.flush()
+            return
+        basename, fname, getname = normalize(strip2(value["for"]))
+        add_user2(fname, value["likes"])
+        return
+
+
+def normalize2(url):
+    if "profile.php" in url:
+        basename = url.split("&")[0]
+        fname = basename.split("=")[-1]
+        getname = basename + "&sk=video_tv_show_favorite"
+    else:
+        basename = url.split("?")[0]
+        fname = basename.split("/")[-1]
+        getname = basename + "/video_tv_show_favorite"
+    return basename, fname, getname
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+    basename, finame, getname = normalize(link)
+    if op.isfile("facebook/" + finame):
+        with open("facebook/" + finame) as f:
+            for line in f:
+                basename, fname, getname = normalize2(line.strip())
+                if fname in users and users[fname] > 0 and fname not in users_likes:
+                    chunk.append(getname)
+                if len(chunk) == 100:
+                    todo = ResultSet([])
+                    for name in chunk:
+                        todo.add(likes.delay(name))
+                    chunk = []
+                    todo.join_native(callback=call_back)
diff --git a/facebook_scraping/seed.py b/facebook_scraping/seed.py
new file mode 100644
index 0000000..932c16b
--- /dev/null
+++ b/facebook_scraping/seed.py
@@ -0,0 +1,7 @@
+import sys
+from bs4 import BeautifulSoup
+
+soup = BeautifulSoup(open(sys.argv[1]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+for link in links:
+    print link
diff --git a/facebook_scraping/server.py b/facebook_scraping/server.py
new file mode 100644
index 0000000..6425c7b
--- /dev/null
+++ b/facebook_scraping/server.py
@@ -0,0 +1,16 @@
+from bottle import route, run, request
+
+
+@route('/')
+def index():
+    d = {}
+    with open("credentials.txt") as f:
+        for line in f:
+            values = line.strip().split()
+            d[values[0]] = values[1:3]
+
+    ip = request.environ.get('REMOTE_ADDR')
+    return " ".join(d[ip])
+
+
+run(host='0.0.0.0', port=8080)
```
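
For readers skimming the patch, here is a minimal sketch of the dispatch pattern that run.py and run2.py use, in the same Python 2 / Celery 3-era style as the code above. It is illustrative only: the seed URL is a hypothetical placeholder, and it assumes the AMQP broker configured in tasks.py is reachable and at least one worker has been started via the `run` target in client/Makefile.

```python
# Sketch of the run.py dispatch pattern (assumptions: the broker from
# tasks.py is reachable; a worker is running; the seed URL is made up).
from celery.result import ResultSet
from tasks import NumFollowers, normalize

nf = NumFollowers()
seed = "https://www.facebook.com/some.profile"  # hypothetical seed profile

# normalize() splits a profile URL into (canonical URL, short name,
# URL of the friends tab), exactly as defined in tasks.py.
basename, fname, getname = normalize(seed)

rs = ResultSet([])
rs.add(nf.delay(basename))  # queue a NumFollowers task on the broker


def on_result(task_id, value):
    # value is the dict built in NumFollowers.run:
    # {"nfriends": ..., "for": url, "orig": worker_ip}
    print value["for"], value["nfriends"], value["orig"]

# join_native blocks until every queued task has reported, invoking the
# callback once per result -- the same pattern as call_back in run.py.
rs.join_native(callback=on_result)
```

run.py layers crawl bookkeeping on top of this loop: ListFollowers results are written to per-profile files under facebook/, and newly discovered profiles are re-queued for NumFollowers.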
