author     Thibaut Horel <thibaut.horel@gmail.com>  2014-10-24 12:16:51 -0400
committer  Thibaut Horel <thibaut.horel@gmail.com>  2014-10-24 12:16:51 -0400
commit     ece1d828d53d6123fcecb5ea8bf9b126d1728ccc (patch)
tree       b669382d0e5f1234556d1aeb7fa919891510b24d /facebook_scraping
parent     7426d8ff0e7969eb1a86bdb5bec8a0c971309e2b (diff)
download   fast-seeding-ece1d828d53d6123fcecb5ea8bf9b126d1728ccc.tar.gz
Add code
Diffstat (limited to 'facebook_scraping')
-rw-r--r--  facebook_scraping/Makefile                   49
-rw-r--r--  facebook_scraping/client/Makefile            15
-rw-r--r--  facebook_scraping/client/__init__.py          0
-rw-r--r--  facebook_scraping/client/requirements.txt     4
-rw-r--r--  facebook_scraping/client/tasks.py           243
-rw-r--r--  facebook_scraping/limits.py                   6
-rw-r--r--  facebook_scraping/mturk.py                   16
-rw-r--r--  facebook_scraping/run.py                     91
-rw-r--r--  facebook_scraping/run2.py                    90
-rw-r--r--  facebook_scraping/seed.py                     7
-rw-r--r--  facebook_scraping/server.py                  16
11 files changed, 537 insertions, 0 deletions
diff --git a/facebook_scraping/Makefile b/facebook_scraping/Makefile
new file mode 100644
index 0000000..fced427
--- /dev/null
+++ b/facebook_scraping/Makefile
@@ -0,0 +1,49 @@
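+# Fleet-orchestration Makefile: discovers running EC2 workers with the
+# (legacy) ec2-describe-instances CLI and drives them in parallel via pssh.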
+SHELL=/bin/bash
+HOSTS=servers.txt
+USER=ubuntu
+OPTIONS=-x "-F ./ssh_config"
+FOPTIONS=$(OPTIONS) -h <(cut -f1 $(HOSTS))
+
+.PHONY: servers_simple servers servers2 uptime running deploy run stop restart test deploy_server
+
+servers_simple:
+ ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+
+servers:
+ ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+ paste <(cut -f2 servers.txt) <(cut -f28,29 survey8a.txt) > credentials.txt
+ rsync credentials.txt horel.org:kdd/
+
+servers2:
+ ec2-describe-instances --region us-west-2 | grep running | cut -f4,17,18 > servers.txt
+ paste <(cut -f2 servers.txt) fb_accounts2.txt > credentials.txt
+ rsync credentials.txt horel.org:kdd/
+
+uptime:
+ pssh $(FOPTIONS) 'uptime'
+
+running:
+ pssh -i $(FOPTIONS) 'pgrep -f "celery worker"'
+
+deploy:
+ cd client; tar -czf facebook.tar.gz requirements.txt tasks.py
+ cd client; rsync facebook.tar.gz Makefile horel.org:public_html/facebook
+	pssh -i $(FOPTIONS) 'rm -rf tasks.py tasks.pyc kdd/; curl http://thibaut.horel.org/facebook/Makefile > Makefile; make bootstrap'
+
+run:
+ pssh -i $(FOPTIONS) 'make run'
+
+stop:
+ pssh -i $(FOPTIONS) "make stop; killall chromedriver; killall chromium-browser; killall Xvfb; rm -f tasks.pyc"
+
+restart:
+ pssh $(FOPTIONS) "make restart"
+
+test:
+ pssh -i $(FOPTIONS) 'rm -f tasks.pyc; grep "replace" tasks.py'
+
+deploy_server:
+ rsync run.py run2.py server.py credentials.txt horel.org:kdd/
+
+
+
diff --git a/facebook_scraping/client/Makefile b/facebook_scraping/client/Makefile
new file mode 100644
index 0000000..3a07802
--- /dev/null
+++ b/facebook_scraping/client/Makefile
@@ -0,0 +1,15 @@
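+# Per-worker Makefile: fetched onto each EC2 host, it pulls the task bundle
+# and manages the local Celery worker process.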
+all: bootstrap run
+
+bootstrap:
+ curl http://thibaut.horel.org/facebook/facebook.tar.gz > facebook.tar.gz
+ tar -xzf facebook.tar.gz
+
+run:
+	celery -A tasks worker --concurrency=2 --detach -l info
+
+stop:
+ rm -f celeryd.pid
+ pgrep -f "celery worker" | xargs kill -9
+
+restart:
+ pgrep -f "celery worker" | xargs kill -HUP
diff --git a/facebook_scraping/client/__init__.py b/facebook_scraping/client/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/facebook_scraping/client/__init__.py
diff --git a/facebook_scraping/client/requirements.txt b/facebook_scraping/client/requirements.txt
new file mode 100644
index 0000000..cba9c1f
--- /dev/null
+++ b/facebook_scraping/client/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+celery
+selenium
+xvfbwrapper
diff --git a/facebook_scraping/client/tasks.py b/facebook_scraping/client/tasks.py
new file mode 100644
index 0000000..4557968
--- /dev/null
+++ b/facebook_scraping/client/tasks.py
@@ -0,0 +1,243 @@
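+# Celery tasks executed on the worker fleet. Each task drives a headless
+# Chrome session (inside Xvfb) that is logged into Facebook with credentials
+# fetched from the central credential server (server.py).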
+from xvfbwrapper import Xvfb
+from selenium import webdriver
+from selenium.common.exceptions import ElementNotVisibleException,\
+ NoSuchElementException, StaleElementReferenceException, WebDriverException
+from time import sleep
+from bs4 import BeautifulSoup, NavigableString
+from celery import Celery, Task
+from urllib2 import urlopen
+import socket
+
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+app.conf.CELERY_RESULT_BACKEND = 'rpc'
+app.conf.CELERY_ENABLE_UTC = True
+app.conf.CELERY_ACKS_LATE = True
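+# One logged-in WebDriver per worker process; kept in a one-element list so
+# the lazily-created instance is shared across task invocations.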
+drivers = [None]
+ip = socket.gethostbyname(socket.gethostname())
+
+
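+# Drop a trailing "/friends" suffix, or any query parameters, to recover the
+# bare profile URL.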
+def strip(url):
+ if url.endswith("/friends"):
+ return url[:-8]
+ else:
+ return url.split("&")[0]
+
+
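+# Split a profile URL into (basename, fname, getname): the canonical URL, a
+# short identifier usable as a filename, and the URL of the friends page.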
+def normalize(url):
+ if "profile.php" in url:
+ basename = url.split("&")[0]
+ fname = basename.split("=")[-1]
+ getname = basename + "&sk=friends"
+ else:
+ basename = url.split("?")[0]
+ fname = basename.split("/")[-1]
+ getname = basename + "/friends"
+ return basename, fname, getname
+
+
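+# Scrape the full friends list of a profile: scroll and click the "more"
+# footer until the page stops growing, then parse the list items.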
+class ListFollowers(Task):
+
+ @property
+ def driver(self):
+ if drivers[0] is None:
+ uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+ vdisplay = Xvfb()
+ vdisplay.start()
+ driver = webdriver.Chrome()
+ driver.get("https://facebook.com")
+ driver.find_element_by_id("email").send_keys(uname)
+ elem = driver.find_element_by_id("pass")
+ elem.send_keys(passwd)
+ elem.submit()
+ drivers[0] = driver
+ return drivers[0]
+
+ def run(self, url):
+ try:
+ self.driver.get(url)
+ except WebDriverException:
+ return {"friends": [], "for": url, "orig": ip}
+
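+        # Keep clicking the pagination footer until it stops appearing, i.e.
+        # the whole friends list has been expanded.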
+ while True:
+ for _ in xrange(5):
+ try:
+ footer = self.driver.find_element_by_class_name("_359")
+ except (NoSuchElementException, ElementNotVisibleException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ try:
+ self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ footer.click()
+ except StaleElementReferenceException:
+ sleep(0.1)
+ except WebDriverException:
+ for _ in xrange(5):
+ try:
+ footer.click()
+ except (WebDriverException, StaleElementReferenceException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ for _ in xrange(5):
+ try:
+ div = self.driver.find_element_by_class_name("_30f")
+ except NoSuchElementException:
+ sleep(0.1)
+ else:
+ break
+ else:
+ try:
+ self.driver.find_element_by_id("loginbutton")
+ except NoSuchElementException:
+ return {"friends": [], "for": url, "orig": ip}
+ else:
+ return {"friends": None, "for": url, "orig": ip}
+
+ soup = BeautifulSoup(div.get_attribute("outerHTML"))
+ return {"friends": [li.a["href"]
+ for li in soup.findAll("li", class_="_698")],
+ "for": url,
+ "orig": ip}
+
+
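+# Scrape only the friend count displayed in the profile's friends box.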
+class NumFollowers(Task):
+
+ @property
+ def driver(self):
+ if drivers[0] is None:
+ uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+ vdisplay = Xvfb()
+ vdisplay.start()
+ driver = webdriver.Chrome()
+ driver.get("https://facebook.com")
+ driver.find_element_by_id("email").send_keys(uname)
+ elem = driver.find_element_by_id("pass")
+ elem.send_keys(passwd)
+ elem.submit()
+ drivers[0] = driver
+ return drivers[0]
+
+ def run(self, url):
+ try:
+ self.driver.get(url)
+ except WebDriverException:
+ return {"nfriends": 0, "for": url, "orig": ip}
+
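+        # Poll up to ~2 s for the friends-count box; on failure, distinguish a
+        # logged-out session (login button present) from a missing box.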
+ for i in xrange(20):
+ try:
+ box = self.driver.find_element_by_class_name("_1f8g")
+ except (NoSuchElementException, ElementNotVisibleException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ try:
+ self.driver.find_element_by_id("loginbutton")
+ except NoSuchElementException:
+ return {"nfriends": 0, "for": url, "orig": ip}
+ else:
+ return {"nfriends": None, "for": url, "orig": ip}
+
+ soup = BeautifulSoup(box.get_attribute("outerHTML"))
+ a = soup.find("a", class_="uiLinkSubtle")
+ try:
+ n_friends = int(a.string.replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+ except ValueError:
+ n_friends = a.string
+ print n_friends
+ return {"nfriends": n_friends,
+ "for": url,
+ "orig": ip}
+
+
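+# Scrape the "TV shows" likes of a profile, returned as one tab-joined string.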
+class Likes(Task):
+
+ @property
+ def driver(self):
+ if drivers[0] is None:
+ uname, passwd = urlopen("http://horel.org:8080/").readline().strip().split()
+ vdisplay = Xvfb()
+ vdisplay.start()
+ driver = webdriver.Chrome()
+ driver.get("https://facebook.com")
+ driver.find_element_by_id("email").send_keys(uname)
+ elem = driver.find_element_by_id("pass")
+ elem.send_keys(passwd)
+ elem.submit()
+ drivers[0] = driver
+ return drivers[0]
+
+ def run(self, url):
+ try:
+ self.driver.get(url)
+ except WebDriverException:
+ return {"likes": [], "for": url, "orig": ip}
+
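+        # Keep clicking the pagination footer until the likes list is fully
+        # expanded (same pattern as ListFollowers).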
+ while True:
+ for _ in xrange(5):
+ try:
+ footer = self.driver.find_element_by_class_name("_359")
+ except (NoSuchElementException, ElementNotVisibleException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ try:
+ self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ footer.click()
+ except StaleElementReferenceException:
+ sleep(0.1)
+ except WebDriverException:
+ for _ in xrange(5):
+ try:
+ footer.click()
+ except (WebDriverException, StaleElementReferenceException):
+ sleep(0.1)
+ else:
+ break
+ else:
+ break
+
+ for _ in xrange(5):
+ try:
+ div = self.driver.find_element_by_class_name("_30f")
+ except NoSuchElementException:
+ sleep(0.1)
+ else:
+ break
+ else:
+ try:
+ self.driver.find_element_by_id("loginbutton")
+ except NoSuchElementException:
+ return {"likes": "", "for": url, "orig": ip}
+ else:
+ return {"likes": None, "for": url, "orig": ip}
+
+        def clean(a):
+            # a is None when the expected anchor is missing from the item
+            if a is None:
+                return ""
+            for child in a.children:
+                if type(child) == NavigableString:
+                    return child
+                else:
+                    return ""
+            return ""
+
+ soup = BeautifulSoup(div.get_attribute("outerHTML"))
+ likes = [clean(li.find("a", class_="_gx7"))
+ for li in soup.findAll("li", class_="_5rz")]
+ return {"likes": u"\t".join(likes).encode("utf8"),
+ "for": url,
+ "orig": ip}
+
+if __name__ == "__main__":
+    # Manual smoke test: run() already returns a tab-joined, utf-8 encoded
+    # string, so it can be written out directly (joining it again would
+    # tab-separate individual characters).
+    nf = Likes()
+    with open("toto.txt", "w") as f:
+        f.write(nf.run("https://www.facebook.com/grvgaba29" + "/video_tv_show_favorite")["likes"] + "\n")
diff --git a/facebook_scraping/limits.py b/facebook_scraping/limits.py
new file mode 100644
index 0000000..8ce38cf
--- /dev/null
+++ b/facebook_scraping/limits.py
@@ -0,0 +1,6 @@
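+# Broadcast per-task rate limits to individual running Celery workers.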
+from celery import Celery
+app = Celery('tasks', broker='amqp://guest@horel.org//')
+print app.control.rate_limit('tasks.NumFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True)
+print app.control.rate_limit('tasks.ListFollowers', '4/m', destination=['celery@ip-172-31-42-158'], reply=True)
+print app.control.rate_limit('tasks.NumFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True)
+print app.control.rate_limit('tasks.ListFollowers', '1/h', destination=['celery@ip-172-31-42-159'], reply=True)
diff --git a/facebook_scraping/mturk.py b/facebook_scraping/mturk.py
new file mode 100644
index 0000000..f6322da
--- /dev/null
+++ b/facebook_scraping/mturk.py
@@ -0,0 +1,16 @@
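+# Convert every MTurk result .csv in the current directory to a .txt TSV,
+# skipping the header row and dropping a trailing empty column.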
+import csv
+import os.path as op
+from glob import glob
+
+for fname in glob("*.csv"):
+ with open(fname) as f:
+ reader = csv.reader(f)
+ oname, _ = op.splitext(fname)
+ oname = oname + ".txt"
+ with open(oname, "w") as of:
+ for i, row in enumerate(reader):
+ if i == 0:
+ continue
+ if row[-1] == "":
+ row = row[:-1]
+ of.write("\t".join(row) + "\n")
diff --git a/facebook_scraping/run.py b/facebook_scraping/run.py
new file mode 100644
index 0000000..94eb1a4
--- /dev/null
+++ b/facebook_scraping/run.py
@@ -0,0 +1,91 @@
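+# Crawl driver (inferred usage):
+#   python run.py <degrees_file> <seed_html> <chunk_size> <fetch_counts: True|False>
+# Reads seed profiles from a saved Facebook page, fetches friends lists in
+# chunks, and records each user's friend count in <degrees_file>.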
+from tasks import NumFollowers, ListFollowers, normalize, strip
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+
+users = {}
+try:
+ with open(sys.argv[1]) as f:
+ for line in f:
+ values = line.strip().split()
+ users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+ pass
+
+output = open(sys.argv[1], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+ users[user] = degree
+ output.write(user + " " + str(degree) + "\n")
+
+
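+# Record the friend count reported by a NumFollowers task; None means the
+# worker's session was logged out, so its IP is logged to bad.txt.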
+def call_back(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if "nfriends" in value:
+ if value["nfriends"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(value["for"])
+ n_friends = int(str(value["nfriends"]).replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+ add_user(fname, n_friends)
+ return
+
+if sys.argv[4] == "True":
+ todo = ResultSet([])
+ soup = BeautifulSoup(open(sys.argv[2]))
+ links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+ chunk = []
+ for link in links:
+ basename, finame, getname = normalize(link)
+ if op.isfile("facebook/" + finame):
+ with open("facebook/" + finame) as f:
+ for line in f:
+ basename, fname, getname = normalize(line.strip())
+ if fname not in users:
+ print finame
+ todo.add(nf.delay(basename))
+ todo.join_native(callback=call_back)
+todo = []
+
+
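+# Persist a fetched friends list and queue NumFollowers lookups for any
+# newly discovered users.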
+def call_back_fd(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if value["friends"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(strip(value["for"]))
+ add_user(fname, len(value["friends"]))
+ with open("facebook/" + fname, "w") as f:
+ for friend in value["friends"]:
+ basename, fname, getname = normalize(friend)
+ f.write(basename + "\n")
+ if fname not in users:
+ todo.append(basename)
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+ basename, fname, getname = normalize(link)
+ if not op.isfile("facebook/" + fname):
+ chunk.append(getname)
+ if len(chunk) == int(sys.argv[3]):
+ todofd = ResultSet([])
+ for name in chunk:
+ todofd.add(lf.delay(name))
+ chunk = []
+ todofd.join_native(callback=call_back_fd)
+ todos = ResultSet([])
+ for name in todo:
+ todos.add(nf.delay(name))
+ todo = []
+            todos.join_native(callback=call_back)
+
+# Flush the final partial chunk, which the equality check above never fires
+# for (assumed to be an oversight).
+if chunk:
+    todofd = ResultSet([])
+    for name in chunk:
+        todofd.add(lf.delay(name))
+    todofd.join_native(callback=call_back_fd)
diff --git a/facebook_scraping/run2.py b/facebook_scraping/run2.py
new file mode 100644
index 0000000..a52a37b
--- /dev/null
+++ b/facebook_scraping/run2.py
@@ -0,0 +1,90 @@
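+# Likes-crawl driver (inferred usage):
+#   python run2.py <degrees_file> <seed_html> <likes_file>
+# For users already crawled by run.py, fetches their favourite TV shows in
+# chunks of 100 and appends them to <likes_file>.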
+from tasks import NumFollowers, ListFollowers, normalize, Likes
+from bs4 import BeautifulSoup
+from celery.result import ResultSet
+import os.path as op
+from datetime import datetime
+import sys
+
+nf = NumFollowers()
+lf = ListFollowers()
+likes = Likes()
+
+users = {}
+try:
+ with open(sys.argv[1]) as f:
+ for line in f:
+ values = line.strip().split()
+ users[values[0]] = int(values[1].replace(",", "").replace(".", "").replace(" ", "").encode("ascii", "ignore"))
+except IOError:
+ pass
+
+users_likes = {}
+try:
+ with open(sys.argv[3]) as f:
+ for line in f:
+ values = line.strip().split()
+ users_likes[values[0]] = True
+except IOError:
+ pass
+
+output = open(sys.argv[3], "a")
+bad = open("bad.txt", "a")
+
+
+def add_user(user, degree):
+ users[user] = degree
+ output.write(user + " " + str(degree) + "\n")
+
+
+def add_user2(user, likes):
+ output.write(user + "\t" + likes + "\n")
+
+
+def strip2(url):
+ l = "/video_tv_show_favorite"
+ if url.endswith(l):
+ return url[:-len(l)]
+ else:
+ return url.split("&")[0]
+
+
+def call_back(tid, value):
+ print datetime.now().isoformat() + " " + str(value)
+ if "likes" in value:
+ if value["likes"] is None:
+ bad.write(value["orig"] + "\n")
+ bad.flush()
+ return
+ basename, fname, getname = normalize(strip2(value["for"]))
+ add_user2(fname, value["likes"])
+ return
+
+
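+# Like normalize(), but targets the favourite-TV-shows page instead of the
+# friends page.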
+def normalize2(url):
+ if "profile.php" in url:
+ basename = url.split("&")[0]
+ fname = basename.split("=")[-1]
+ getname = basename + "&sk=video_tv_show_favorite"
+ else:
+ basename = url.split("?")[0]
+ fname = basename.split("/")[-1]
+ getname = basename + "/video_tv_show_favorite"
+ return basename, fname, getname
+
+soup = BeautifulSoup(open(sys.argv[2]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+chunk = []
+for link in links:
+ basename, finame, getname = normalize(link)
+ if op.isfile("facebook/" + finame):
+ with open("facebook/" + finame) as f:
+ for line in f:
+ basename, fname, getname = normalize2(line.strip())
+ if fname in users and users[fname] > 0 and fname not in users_likes:
+ chunk.append(getname)
+ if len(chunk) == 100:
+ todo = ResultSet([])
+ for name in chunk:
+ todo.add(likes.delay(name))
+ chunk = []
+                        todo.join_native(callback=call_back)
+
+# Flush the final partial chunk, which the len(chunk) == 100 check above
+# never fires for (assumed to be an oversight).
+if chunk:
+    todo = ResultSet([])
+    for name in chunk:
+        todo.add(likes.delay(name))
+    todo.join_native(callback=call_back)
diff --git a/facebook_scraping/seed.py b/facebook_scraping/seed.py
new file mode 100644
index 0000000..932c16b
--- /dev/null
+++ b/facebook_scraping/seed.py
@@ -0,0 +1,7 @@
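+# Print the profile links found in a saved Facebook page (the crawl seeds).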
+import sys
+from bs4 import BeautifulSoup
+
+soup = BeautifulSoup(open(sys.argv[1]))
+links = [div.a["href"] for div in soup.findAll("div", class_="fsl")]
+for link in links:
+ print link
diff --git a/facebook_scraping/server.py b/facebook_scraping/server.py
new file mode 100644
index 0000000..6425c7b
--- /dev/null
+++ b/facebook_scraping/server.py
@@ -0,0 +1,16 @@
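+# Minimal credential server: maps each worker's IP address to a Facebook
+# username/password pair read from credentials.txt (unknown IPs raise a
+# KeyError).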
+from bottle import route, run, request
+
+
+@route('/')
+def index():
+ d = {}
+ with open("credentials.txt") as f:
+ for line in f:
+ values = line.strip().split()
+ d[values[0]] = values[1:3]
+
+ ip = request.environ.get('REMOTE_ADDR')
+ return " ".join(d[ip])
+
+
+run(host='0.0.0.0', port=8080)