author     Thibaut Horel <thibaut.horel@gmail.com>  2014-10-24 12:16:51 -0400
committer  Thibaut Horel <thibaut.horel@gmail.com>  2014-10-24 12:16:51 -0400
commit     ece1d828d53d6123fcecb5ea8bf9b126d1728ccc (patch)
tree       b669382d0e5f1234556d1aeb7fa919891510b24d /facebook_scraping
parent     7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (diff)
download   fast-seeding-ece1d828d53d6123fcecb5ea8bf9b126d1728ccc.tar.gz
Add code
Diffstat (limited to 'facebook_scraping')
-rw-r--r--  facebook_scraping/Makefile                   49
-rw-r--r--  facebook_scraping/client/Makefile            15
-rw-r--r--  facebook_scraping/client/__init__.py          0
-rw-r--r--  facebook_scraping/client/requirements.txt     4
-rw-r--r--  facebook_scraping/client/tasks.py           243
-rw-r--r--  facebook_scraping/limits.py                   6
-rw-r--r--  facebook_scraping/mturk.py                   16
-rw-r--r--  facebook_scraping/run.py                     91
-rw-r--r--  facebook_scraping/run2.py                    90
-rw-r--r--  facebook_scraping/seed.py                     7
-rw-r--r--  facebook_scraping/server.py                  16
11 files changed, 537 insertions, 0 deletions
diff --git a/facebook_scraping/Makefile b/facebook_scraping/Makefile
new file mode 100644
index 0000000..fced427
--- /dev/null
+++ b/facebook_scraping/Makefile
@@ -0,0 +1,49 @@
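+# Fleet-orchestration Makefile: discovers running EC2 workers with the
+# (legacy) ec2-describe-instances CLI and drives them in parallel via pssh.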
+SHELL=/bin/bash
+HOSTS=servers.txt
+USER=ubuntu
+OPTIONS=-x "-F ./ssh_config"
+FOPTIONS=$(OPTIONS) -h <(cut -f1 $(HOSTS))
+
+.PHONY: servers_simple servers servers2 uptime running deploy run stop restart test deploy_server
+
+servers_simple:
+ ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+
+servers:
+ ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+ paste <(cut -f2 servers.txt) <(cut -f28,29 survey8a.txt) > credentials.txt
+ rsync credentials.txt horel.org:kdd/
+
+servers2:
+ ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+ paste <(cut -f2 servers.txt) fb_accounts2.txt > credentials.txt
+ rsync credentials.txt horel.org:kdd/
+
+uptime:
+ pssh $(FOPTIONS) 'uptime'
+
+running:
+ pssh -i $(FOPTIONS) 'pgrep -f "celery worker"'
+
+deploy:
+ cd client; tar -czf facebook.tar.gz requirements.txt tasks.py
+ cd client; rsync facebook.tar.gz Makefile horel.org:public_html/facebook
+	pssh -i $(FOPTIONS) 'rm -rf tasks.py tasks.pyc kdd/; curl http://thibaut.horel.org/facebook/Makefile > Makefile; make bootstrap'
+
+run:
+ pssh -i $(FOPTIONS) 'make run'
+
+stop:
+ pssh -i $(FOPTIONS) "make stop; killall chromedriver; killall chromium-browser; killall Xvfb; rm -f tasks.pyc"
+
+restart:
+ pssh $(FOPTIONS) "make restart"
+
+test:
+ pssh -i $(FOPTIONS) 'rm -f tasks.pyc; grep "replace" tasks.py'
+
+deploy_server:
+ rsync run.py run2.py server.py credentials.txt horel.org:kdd/
+
+
+
diff --git a/facebook_scraping/client/Makefile b/facebook_scraping/client/Makefile
new file mode 100644
index 0000000..3a07802
--- /dev/null
+++ b/facebook_scraping/client/Makefile
@@ -0,0 +1,15 @@
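+# Per-worker Makefile: fetched onto each EC2 host, it pulls the task bundle
+# and manages the local Celery worker process.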
+all: bootstrap run
+
+bootstrap:
+ curl http://thibaut.horel.org/facebook/facebook.tar.gz > facebook.tar.gz
+ tar -xzf facebook.tar.gz
+
+run:
+	celery -A tasks worker --concurrency=2 --detach -l info
+
+stop:
+ rm -f celeryd.pid
+ pgrep -f "celery worker" | xargs kill -9
+
+restart:
+ pgrep -f "celery worker" | xargs kill -HUP
diff --git a/facebook_scraping/client/__init__.py b/facebook_scraping/client/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/facebook_scraping/client/__init__.py
diff --git a/facebook_scraping/client/requirements.txt b/facebook_scraping/client/requirements.txt
new file mode 100644
index 0000000..cba9c1f
--- /dev/null
+++ b/facebook_scraping/client/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+celery
+selenium
+xvfbwrapper
diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py
new file mode 100644
index 0000000..4557968
--- /dev/null
+++ b/facebook_scraping/client/tasks.py
@@ -0,0 +1,243 @@
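+# Celery tasks executed on the worker fleet. Each task drives a headless
+# Chrome session (inside Xvfb) that is logged into Facebook with credentials
+# fetched from the central credential server (server.py).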
+from xvfbwrapper import Xvfb
+from selenium import webdriver
+from selenium.common.exceptions import ElementNotVisibleException,\
+ NoSuchElementException, StaleElementReferenceException, WebDriverException
+from time import sleep
+from bs4 import BeautifulSoup, NavigableString
+from celery import Celery, Task
+from urllib2 import urlopen
+import socket
+
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+app.conf.CELERY_RESULT_BACKEND = 'rpc'
+app.conf.CELERY_ENABLE_UTC = True
+app.conf.CELERY_ACKS_LATE = True
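+# One logged-in WebDriver per worker process; kept in a one-element list so
+# the lazily-created instance is shared across task invocations.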
+drivers = [None]
+ip = socket.gethostbyname(socket.gethostname())
+
+
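+# Drop a trailing "/friends" suffix, or any query parameters, to recover the
+# bare profile URL.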
+def strip(url):
+ if url.endswith("/friends"):
+ return url[:-8]
+ else:
+ return url.split("&")[0]
+
+
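+# Split a profile URL into (basename, fname, getname): the canonical URL, a
+# short identifier usable as a filename, and the URL of the friends page.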
+def normalize(url):
+ if "profile.php" in url:
+ basename = url.split("&")[0]
+ fname = basename.split("=")[-1]
+ getname = basename + "&sk=friends"
+ else:
+ basename = url.split("?")[0]
+ fname = basename.split("/")[-1]
+ getname = basename + "/friends"
+ return basename, fname, getname
+
+
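+# Scrape the full friends list of a profile: scroll and click the "more"
+# footer until the page stops growing, then parse the list items.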
+class ListFollowers(Task):
+
+ @property
+ def driver(self):
+ if drivers[0] is None:
+ uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+ vdisplay = Xvfb()
+ vdisplay.start()
+ driver = webdriver.Chrome()
+ driver.get("https://facebook.com")
+ driver.find_element_by_id("email").send_keys(uname)
+ elem = driver.find_element_by_id("pass")
+ elem.send_keys(passwd)
+ elem.submit()
+ drivers[0] = driver
+ return drivers[0]
+
+ def run(self, url):
+ try:
+ self.driver.get(url)
+ except WebDriverException:
+ return {"friends": [], "for": url, "orig": ip}
+
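+        # Keep clicking the pagination footer until it stops appearing, i.e.
+        # the whole friends list has been expanded.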
+ while True:
+ for _ in xrange(5):
+ try:
+ footer = self.driver.find_element_by_class_name("_359")
+ except (NoSuchElementException, ElementNotVisibleException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ try:
+ self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ footer.click()
+ except StaleElementReferenceException:
+ sleep(0.1)
+ except WebDriverException:
+ for _ in xrange(5):
+ try:
+ footer.click()
+ except (WebDriverException, StaleElementReferenceException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ for _ in xrange(5):
+ try:
+ div = self.driver.find_element_by_class_name("_30f")
+ except NoSuchElementException:
+ sleep(0.1)
+ else:
+ break
+ else:
+ try:
+ self.driver.find_element_by_id("loginbutton")
+ except NoSuchElementException:
+ return {"friends": [], "for": url, "orig": ip}
+ else:
+ return {"friends": None, "for": url, "orig": ip}
+
+ soup = BeautifulSoup(div.get_attribute("outerHTML"))
+ return {"friends": [li.a["href"]
+ for li in soup.findAll("li", class_="_698")],
+ "for": url,
+ "orig": ip}
+
+
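+# Scrape only the friend count displayed in the profile's friends box.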
+class NumFollowers(Task):
+
+ @property
+ def driver(self):
+ if drivers[0] is None:
+ uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+ vdisplay = Xvfb()
+ vdisplay.start()
+ driver = webdriver.Chrome()
+ driver.get("https://facebook.com")
+ driver.find_element_by_id("email").send_keys(uname)
+ elem = driver.find_element_by_id("pass")
+ elem.send_keys(passwd)
+ elem.submit()
+ drivers[0] = driver
+ return drivers[0]
+
+ def run(self, url):
+ try:
+ self.driver.get(url)
+ except WebDriverException:
+ return {"nfriends": 0, "for": url, "orig": ip}
+
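+        # Poll up to ~2 s for the friends-count box; on failure, distinguish a
+        # logged-out session (login button present) from a missing box.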
+ for i in xrange(20):
+ try:
+ box = self.driver.find_element_by_class_name("_1f8g")
+ except (NoSuchElementException, ElementNotVisibleException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ try:
+ self.driver.find_element_by_id("loginbutton")
+ except NoSuchElementException:
+ return {"nfriends": 0, "for": url, "orig": ip}
+ else:
+ return {"nfriends": None, "for": url, "orig": ip}
+
+ soup = BeautifulSoup(box.get_attribute("outerHTML"))
+ a = soup.find("a", class_="uiLinkSubtle")
+ try:
+ n_friends = int(a.string.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+ except ValueError:
+ n_friends = a.string
+ print n_friends
+ return {"nfriends": n_friends,
+ "for": url,
+ "orig": ip}
+
+
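+# Scrape the "TV shows" likes of a profile, returned as one tab-joined string.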
+class Likes(Task):
+
+ @property
+ def driver(self):
+ if drivers[0] is None:
+ uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+ vdisplay = Xvfb()
+ vdisplay.start()
+ driver = webdriver.Chrome()
+ driver.get("https://facebook.com")
+ driver.find_element_by_id("email").send_keys(uname)
+ elem = driver.find_element_by_id("pass")
+ elem.send_keys(passwd)
+ elem.submit()
+ drivers[0] = driver
+ return drivers[0]
+
+ def run(self, url):
+ try:
+ self.driver.get(url)
+ except WebDriverException:
+ return {"likes": [], "for": url, "orig": ip}
+
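+        # Keep clicking the pagination footer until the likes list is fully
+        # expanded (same pattern as ListFollowers).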
+ while True:
+ for _ in xrange(5):
+ try:
+ footer = self.driver.find_element_by_class_name("_359")
+ except (NoSuchElementException, ElementNotVisibleException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ try:
+ self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ footer.click()
+ except StaleElementReferenceException:
+ sleep(0.1)
+ except WebDriverException:
+ for _ in xrange(5):
+ try:
+ footer.click()
+ except (WebDriverException, StaleElementReferenceException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ for _ in xrange(5):
+ try:
+ div = self.driver.find_element_by_class_name("_30f")
+ except NoSuchElementException:
+ sleep(0.1)
+ else:
+ break
+ else:
+ try:
+ self.driver.find_element_by_id("loginbutton")
+ except NoSuchElementException:
+ return {"likes": "", "for": url, "orig": ip}
+ else:
+ return {"likes": None, "for": url, "orig": ip}
+
+        def clean(a):
+            # a is None when the expected anchor is missing from the item
+            if a is None:
+                return ""
+            for child in a.children:
+                if type(child) == NavigableString:
+                    return child
+                else:
+                    return ""
+            return ""
+
+ soup = BeautifulSoup(div.get_attribute("outerHTML"))
+ likes = [clean(li.find("a", class_="_gx7"))
+ for li in soup.findAll("li", class_="_5rz")]
+ return {"likes": u"\t".join(likes).encode("utf8"),
+ "for": url,
+ "orig": ip}
+
+if __name__ == "__main__":
+    # Manual smoke test: run() already returns a tab-joined, utf-8 encoded
+    # string, so it can be written out directly (joining it again would
+    # tab-separate individual characters).
+    nf = Likes()
+    with open("toto.txt", "w") as f:
+        f.write(nf.run("https://www.facebook.com/grvgaba29" + "/video_tv_show_favorite")["likes"] + "\n")
diff --git a/facebook_scraping/limits.py b/facebook_scraping/limits.py
new file mode 100644
index 0000000..8ce38cf
--- /dev/null
+++ b/facebook_scraping/limits.py
@@ -0,0 +1,6 @@
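+# Broadcast per-task rate limits to individual running Celery workers.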
+from celery import Celery
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+print app.control.rate_limit('tasks.NumFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True)
+print app.control.rate_limit('tasks.ListFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True)
+print app.control.rate_limit('tasks.NumFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True)
+print app.control.rate_limit('tasks.ListFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True)
diff --git a/facebook_scraping/mturk.py b/facebook_scraping/mturk.py
new file mode 100644
index 0000000..f6322da
--- /dev/null
+++ b/facebook_scraping/mturk.py
@@ -0,0 +1,16 @@
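+# Convert every MTurk result .csv in the current directory to a .txt TSV,
+# skipping the header row and dropping a trailing empty column.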
+import csv
+import os.path as op
+from glob import glob
+
+for fname in glob("*.csv"):
+ with open(fname) as f:
+ reader = csv.reader(f)
+ oname, _ = op.splitext(fname)
+ oname = oname + ".txt"
+ with open(oname, "w") as of:
+ for i, row in enumerate(reader):
+ if i == 0:
+ continue
+ if row[-1] == "":
+ row = row[:-1]
+ of.write("\t".join(row) + "\n")
diff --git a/facebook_scraping/run.py b/facebook_scraping/run.py
new file mode 100644
index 0000000..94eb1a4
--- /dev/null
+++ b/facebook_scraping/run.py
@@ -0,0 +1,91 @@
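+# Crawl driver (inferred usage):
+#   python run.py <degrees_file> <seed_html> <chunk_size> <fetch_counts: True|False>
+# Reads seed profiles from a saved Facebook page, fetches friends lists in
+# chunks, and records each user's friend count in <degrees_file>.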
+from tasks import NumFollowers, ListFollowers, normalize, strip
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+
+users = {}
+try:
+ with open(sys.argv[1]) as f:
+ for line in f:
+ values = line.strip().split()
+ users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+ pass
+
+output = open(sys.argv[1], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+ users[user] = degree
+ output.write(user + " " + str(degree) + "\n")
+
+
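+# Record the friend count reported by a NumFollowers task; None means the
+# worker's session was logged out, so its IP is logged to bad.txt.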
+def call_back(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if "nfriends" in value:
+ if value["nfriends"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(value["for"])
+ n_friends = int(str(value["nfriends"]).replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+ add_user(fname, n_friends)
+ return
+
+if sys.argv[4] == "True":
+ todo = ResultSet([])
+ soup = BeautifulSoup(open(sys.argv[2]))
+ links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+ chunk = []
+ for link in links:
+ basename, finame, getname = normalize(link)
+ if op.isfile("facebook/" + finame):
+ with open("facebook/" + finame) as f:
+ for line in f:
+ basename, fname, getname = normalize(line.strip())
+ if fname not in users:
+ print finame
+ todo.add(nf.delay(basename))
+ todo.join_native(callback=call_back)
+todo = []
+
+
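+# Persist a fetched friends list and queue NumFollowers lookups for any
+# newly discovered users.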
+def call_back_fd(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if value["friends"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(strip(value["for"]))
+ add_user(fname, len(value["friends"]))
+ with open("facebook/" + fname, "w") as f:
+ for friend in value["friends"]:
+ basename, fname, getname = normalize(friend)
+ f.write(basename + "\n")
+ if fname not in users:
+ todo.append(basename)
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+ basename, fname, getname = normalize(link)
+ if not op.isfile("facebook/" + fname):
+ chunk.append(getname)
+ if len(chunk) == int(sys.argv[3]):
+ todofd = ResultSet([])
+ for name in chunk:
+ todofd.add(lf.delay(name))
+ chunk = []
+ todofd.join_native(callback=call_back_fd)
+ todos = ResultSet([])
+ for name in todo:
+ todos.add(nf.delay(name))
+ todo = []
+            todos.join_native(callback=call_back)
+
+# Flush the final partial chunk, which the equality check above never fires
+# for (assumed to be an oversight).
+if chunk:
+    todofd = ResultSet([])
+    for name in chunk:
+        todofd.add(lf.delay(name))
+    todofd.join_native(callback=call_back_fd)
diff --git a/facebook_scraping/run2.py b/facebook_scraping/run2.py
new file mode 100644
index 0000000..a52a37b
--- /dev/null
+++ b/facebook_scraping/run2.py
@@ -0,0 +1,90 @@
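+# Likes-crawl driver (inferred usage):
+#   python run2.py <degrees_file> <seed_html> <likes_file>
+# For users already crawled by run.py, fetches their favourite TV shows in
+# chunks of 100 and appends them to <likes_file>.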
+from tasks import NumFollowers, ListFollowers, normalize, Likes
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+likes = Likes()
+
+users = {}
+try:
+ with open(sys.argv[1]) as f:
+ for line in f:
+ values = line.strip().split()
+ users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+ pass
+
+users_likes = {}
+try:
+ with open(sys.argv[3]) as f:
+ for line in f:
+ values = line.strip().split()
+ users_likes[values[0]] = True
+except IOError:
+ pass
+
+output = open(sys.argv[3], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+ users[user] = degree
+ output.write(user + " " + str(degree) + "\n")
+
+
+def add_user2(user, likes):
+ output.write(user + "\t" + likes + "\n")
+
+
+def strip2(url):
+ l = "/video_tv_show_favorite"
+ if url.endswith(l):
+ return url[:-len(l)]
+ else:
+ return url.split("&")[0]
+
+
+def call_back(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if "likes" in value:
+ if value["likes"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(strip2(value["for"]))
+ add_user2(fname, value["likes"])
+ return
+
+
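+# Like normalize(), but targets the favourite-TV-shows page instead of the
+# friends page.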
+def normalize2(url):
+ if "profile.php" in url:
+ basename = url.split("&")[0]
+ fname = basename.split("=")[-1]
+ getname = basename + "&sk=video_tv_show_favorite"
+ else:
+ basename = url.split("?")[0]
+ fname = basename.split("/")[-1]
+ getname = basename + "/video_tv_show_favorite"
+ return basename, fname, getname
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+ basename, finame, getname = normalize(link)
+ if op.isfile("facebook/" + finame):
+ with open("facebook/" + finame) as f:
+ for line in f:
+ basename, fname, getname = normalize2(line.strip())
+ if fname in users and users[fname] > 0 and fname not in users_likes:
+ chunk.append(getname)
+ if len(chunk) == 100:
+ todo = ResultSet([])
+ for name in chunk:
+ todo.add(likes.delay(name))
+ chunk = []
+                        todo.join_native(callback=call_back)
+
+# Flush the final partial chunk, which the len(chunk) == 100 check above
+# never fires for (assumed to be an oversight).
+if chunk:
+    todo = ResultSet([])
+    for name in chunk:
+        todo.add(likes.delay(name))
+    todo.join_native(callback=call_back)
diff --git a/facebook_scraping/seed.py b/facebook_scraping/seed.py
new file mode 100644
index 0000000..932c16b
--- /dev/null
+++ b/facebook_scraping/seed.py
@@ -0,0 +1,7 @@
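+# Print the profile links found in a saved Facebook page (the crawl seeds).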
+import sys
+from bs4 import BeautifulSoup
+
+soup = BeautifulSoup(open(sys.argv[1]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+for link in links:
+ print link
diff --git a/facebook_scraping/server.py b/facebook_scraping/server.py
new file mode 100644
index 0000000..6425c7b
--- /dev/null
+++ b/facebook_scraping/server.py
@@ -0,0 +1,16 @@
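+# Minimal credential server: maps each worker's IP address to a Facebook
+# username/password pair read from credentials.txt (unknown IPs raise a
+# KeyError).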
+from bottle import route, run, request
+
+
+@route('/')
+def index():
+ d = {}
+ with open("credentials.txt") as f:
+ for line in f:
+ values = line.strip().split()
+ d[values[0]] = values[1:3]
+
+ ip = request.environ.get('REMOTE_ADDR')
+ return " ".join(d[ip])
+
+
+run(host='0.0.0.0', port=8080)