6 files changed, 319 insertions, 0 deletions
diff --git a/ps1/centrality.pdf b/ps1/centrality.pdf
new file mode 100644
index 0000000..e00c9de
--- /dev/null
+++ b/ps1/centrality.pdf
diff --git a/ps1/degr.pdf b/ps1/degr.pdf
new file mode 100644
index 0000000..f14dbb8
--- /dev/null
+++ b/ps1/degr.pdf
diff --git a/ps1/degr_log.pdf b/ps1/degr_log.pdf
new file mode 100644
index 0000000..0ca04d8
--- /dev/null
+++ b/ps1/degr_log.pdf
diff --git a/ps1/main.tex b/ps1/main.tex
new file mode 100644
index 0000000..55b24e4
--- /dev/null
+++ b/ps1/main.tex
@@ -0,0 +1,319 @@
+\documentclass[10pt]{article}
+\usepackage{fullpage}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amsfonts,amsthm}
+\usepackage[english]{babel}
+\usepackage[capitalize, noabbrev]{cleveref}
+\usepackage{algorithm}
+\usepackage{algpseudocode}
+\usepackage{graphicx}
+
+\newenvironment{enumerate*}%
+  {\vspace{-2ex} \begin{enumerate} %
+     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+  {\end{enumerate}}
+ 
+\newenvironment{itemize*}%
+  {\vspace{-2ex} \begin{itemize} %
+     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+  {\end{itemize}}
+ 
+\newenvironment{description*}%
+  {\vspace{-2ex} \begin{description} %
+     \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}%
+  {\end{description}}
+
+\DeclareMathOperator{\E}{\mathbb{E}}
+\let\P\relax
+\DeclareMathOperator{\P}{\mathbb{P}}
+\newcommand{\ex}[1]{\E\left[#1\right]}
+\newcommand{\prob}[1]{\P\left[#1\right]}
+\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
+\newcommand{\R}{\mathbf{R}}
+\newcommand{\N}{\mathbf{N}}
+\newcommand{\C}{\mathcal{C}}
+\newcommand{\eps}{\varepsilon}
+\newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}}
+\newcommand{\llbracket}{[\![}
+
+\newtheorem{theorem}{Theorem}
+\newtheorem{lemma}{Lemma}
+
+\author{Thibaut Horel}
+\title{CS 28r Problem Set 1 -- Solutions}
+\date{October 29, 2014}
+
+\begin{document}
+\maketitle
+
+\section*{Problem 1}
+
+\paragraph{(a)} We first prove that submodularity implies the following property:
+\begin{equation}\label{eq:sub}
+    \forall S,T\subseteq N,\; f(S\cup T) + f(S\cap T) \leq f(S) + f(T).
+\end{equation}
+For this consider $A = S\cap T$, and $B = T\setminus S$. For an arbitrary
+ordering $B = \{b_1,\ldots, b_m\}$ of the elements in $B$, let us write $B_i
+= \{b_1,\ldots, b_i\}$ with the convention that $B_0 = \emptyset$.
+
+Then one can write:
+\begin{displaymath}
+    f(T) - f(A) = \sum_{i=1}^m f(A\cup B_i) - f(A\cup B_{i-1})
+    \leq \sum_{i=1}^m f(S\cup B_i) - f(S\cup B_{i-1}) = f(S\cup T) - f(S)
+\end{displaymath}
+where the first equality is simply a telescopic sum, the inequality directly
+follows from submodularity since $A\cup B_{i-1}\subseteq S\cup B_{i-1}$, and
+the last equality is a telescopic sum again. Reordering the terms in the
+previous equation and using the definition of $A$ we obtain
+Equation~\ref{eq:sub}.
+
+Subadditivity directly follows from Equation~\ref{eq:sub} by observing that by
+positivity of the function $f$, $f(S\cup T) + f(S\cap T) \geq f(S\cup T)$.
+
+\paragraph{(b)} We first show that if $f$ is submodular, then the marginal
+contribution function is subadditive. If $f$ is submodular, it is easy to see
+that $f_S$ is also submodular. Indeed, for $T\subseteq T'$ and $u\in N\setminus
+T'$:
+\begin{displaymath}
+    f_S(T\cup\{u\}) - f_S(T) = f(S\cup T\cup\{u\}) - f(S\cup T)
+\geq f(S\cup T'\cup\{u\}) - f(S\cup T')
+= f_S(T'\cup\{u\}) - f_S(T')
+\end{displaymath}
+where the inequality follows from the submodularity of $f$. We saw in
+\textbf{(a)} that a submodular function is subadditive, hence $f_S$ is
+subadditive.
+
+Conversely, let us assume that $f_S$ is subadditive for all $S$ and let us
+consider two sets $S\subseteq T$ and $u\in N\setminus T$. Then:
+\begin{displaymath}
+    f_S(T\cup\{u\})\leq f_S(T) + f_S(\{u\})
+\end{displaymath}
+using the definition of $f_S$, this is equivalent to:
+\begin{displaymath}
+    f(S\cup T\cup\{u\}) - f(S)\leq f(S\cup T) - f(S) + f(S\cup\{u\}) - f(S)
+\end{displaymath}
+observing that $S\cup T = T$ and reordering the terms we obtain:
+\begin{displaymath}
+    f(T\cup\{u\}) - f(T)\leq f(S\cup\{u\}) - f(S)
+\end{displaymath}
+which is exactly the submodularity of $f$.
+
+\section*{Problem 2}
+We adapt the proof of the approximation ratio of the greedy algorithm from
+Lecture 5. Using the same notations, if $o_{max}$ is the element of highest
+marginal contribution in $O$ at stage $i+1$ and $a_{i+1}$ is the element added
+by the greedy algorithm (with approximate oracle) at step $i$, we have:
+\begin{displaymath}
+    \tilde{f}_{S_i}(a_{i+1}) \geq \tilde{f}_{S_i}(o_{max})
+\end{displaymath}
+now using that $\tilde{f}$ is an $\alpha$ approximate oracle, we can upper
+bound $\tilde{f}_{S_i}(a_{i+1})$ by $f_{S_i}(a_{i+1})$ and lower bound
+$\tilde{f}_{S_i}(o_{max})$ by $\alpha f_{S_i}(o_{max})$ and obtain:
+\begin{displaymath}
+    f_{S_i}(o_{max})\leq \frac{1}{\alpha}f_{S_i}(a_{i+1}) 
+\end{displaymath}
+plugging this inequality into equation (3) in the proof of Lemma 8 directly
+gives the ``modified'' lemma for $\alpha$-approximate oracle:
+\begin{displaymath}
+    f(S_{i+1}) - f(S_i) \geq \frac{\alpha}{k} (f(O) - f(S_i)),\; i\in[k]
+\end{displaymath}
+From there, applying verbatim the proof of Lemma 9, we obtain by induction:
+\begin{equation}\label{eq:ind}
+    f(S_i)\geq \left(1-\left(1-\frac{\alpha}{k}\right)^i\right)f(O),\; i\in[k]
+\end{equation}
+
+We then conclude by observing that:
+\begin{displaymath}
+    \left(1-\frac{\alpha}{k}\right)^k = e^{k\log (1-\frac{\alpha}{k})}\leq
+    e^{-\alpha}
+\end{displaymath}
+where we used the inequality $\log(1+x)\leq x$. Plugging this into
+\eqref{eq:ind}  gives:
+\begin{displaymath}
+    f(S_k)\geq \left(1-\frac{1}{e^{\alpha}}\right)f(O)
+\end{displaymath}
+which is exactly what we wanted.
+
+\section*{Problem 3}
+In this exercise we denote by $I(S)$ the influence of a set $S$ in the
+independent cascade model.
+
+\paragraph{(a)} As we saw in class, the influence of a node $v$ in the
+independent cascade model is exactly the expectation of the number of nodes
+reachable from $v$ over every possible realizations of subgraphs $G'$ of $G$
+where each edge $(u,v)$ realizes independently with probability $p_{u,v}$. Let
+us writhe $I(v)$ the influence of $v$.
+
+The algorithm SampleInfluence is simply considering $m$ independent realizations of
+$G'$ and averaging the number of nodes reachable from $v$ over these $m$
+realizations. It suffices to find $m$ such that:
+\begin{displaymath}
+    \Pr\left[\left|\frac{1}{m}\sum_{i=1}r_i - I(v)\right| > \eps
+    I(v)\right]\leq \gamma
+\end{displaymath}
+indeed, this event implies that $\frac{1}{m}\sum_i r_i$ is
+a $1-\eps$-approximation of $I(v)$ with probability at least $1-\gamma$.
+
+Applying the Chernoff bound with $\delta = \eps m I(v)$, we obtain:
+\begin{displaymath}
+    \Pr\left[\left|\frac{1}{m}\sum_{i=1}r_i - I(v)\right| > \eps
+    I(v)\right]\leq 2e^{\frac{-m\eps^2I(v)^2}{n^2}}
+\end{displaymath}
+since the variables $r_i$ clearly live in the interval $[0, n]$ (we take $b
+= n$).
+
+Now we assume that $I(v)\geq 1$\footnote{This depends on how you formulate the
+    independent cascade model, do we always consider that a node influences
+    himself? If the assumption doesn't hold, we can lower bound $I(v)$ by
+    $p_{min}$ where $p_{min}$ is the minimum probability that appears on any
+edge in $G$ and all the following results go through by replacing $1$ by
+$p_{min}$}. Taking $m = \lceil\frac{n^2}{\eps^2}\log(2/\gamma)\rceil$, we
+obtain:
+\begin{displaymath}
+    \Pr\left[\left|\frac{1}{m}\sum_{i=1}r_i - I(v)\right| > \eps
+    I(v)\right]\leq \gamma
+\end{displaymath}
+and $\frac{1}{m}\sum_i r_i$ is a $1-\eps$-approximation of $I(v)$ with
+probability at least $1-\gamma$ as we wanted.
+
+\textbf{(b)} We first note that we can adapt question \textbf{(a)} to obtain
+$\tilde{I}_S(x)$ an approximate oracle of $I_S(x)$ for any $S$ and $x$. Let us
+denote by $R(S, G)$ the set of nodes reachable from any node in $S$ in graph
+$G$. We modify the SampleInfluence algorithm as follows: we generate $m$
+realizations $G_i$ ($i\in[m]$) of the random graph and for each realization
+$i\in[m]$, we define $r_i$ to be $|R(S\cup\{x\}, G_i)| - |R(S, G_i)|$, and
+$\tilde{I}_S(x)$ to be $\frac{1}{m}\sum_i r_i$. By applying the same Chernoff
+bound as in question \textbf{(a)}, we can guarantee that choosing $m
+= \lceil\frac{n^2}{\eps^2}\log(2/\gamma)\rceil$ implies that:
+\begin{equation}\label{eq:oracle}
+    \Pr\left[\left|\tilde{I}_S(x) - I_S(x)\right| > \eps
+    I_S(x)\right]\leq \gamma
+\end{equation}
+
+Now we want to use this approximate oracle $\tilde{I}_S(x)$ the same way as in
+Problem 2. We note that the greedy algorithm is going to make at most $nk$
+calls to the approximate oracle. Hence, by choosing $\gamma
+= \frac{\delta}{nk}$ and applying a union bound on \eqref{eq:oracle}, we get
+that with probability at least $1-\delta$, each time we call the oracle we will
+have:
+\begin{displaymath}
+    (1-\eps)I_S(x)\leq\tilde{I}_S(x) \leq (1+\eps)I_S(x)
+\end{displaymath}
+Dividing this previous inequality by $1+\eps$, we obtain
+\begin{displaymath}
+    \frac{1-\eps}{1+\eps}I_S(x)\leq\frac{1}{1+\eps}\tilde{I}_S(x) \leq I_S(x)
+\end{displaymath}
+Applying the result of Problem 2 with $\alpha = \frac{1+\eps}{1-\eps}$, we
+obtain that using $\tilde{I}_S(x)$ as an approximate oracle in the greedy
+algorithm yields an approximation ratio of:
+\begin{displaymath}
+    \rho = (1+\eps)\left(1-\frac{1}{e^{\frac{1-\eps}{1+\eps}}}\right)
+\end{displaymath}
+But a Taylor expansion of $\rho$ shows that:
+\begin{displaymath}
+    \rho \geq  1-\frac{1}{e} - C\eps
+\end{displaymath}
+for some constant $C$. Choosing $\eps' = \eps/C$, we get that by using $m
+= \lceil\frac{n^2C^2}{\eps^{2}}\log(2nk/\delta)\rceil$ samples each time we invoke
+the oracle, the greedy algorithm will return a $1-1/e-\eps$ approximation with
+probability at least $1-\delta$.
+
+We are doing $nk$ calls to the oracle, each call can be
+computed in time $O(mn^3)$ (generating a sample takes time $n^2$ at most and
+a BFS traversal takes linear time). Given the expression we have for $m$, the
+overall complexity is $O(\frac{kn^6}{\eps^2}\log\frac{nk}{\delta})$ which is
+polynomial in the number of nodes.
+
+\section*{Problem 4}
+We assume that $x_0\leq \min_i x_i$, otherwise the power-law distribution
+cannot explain the data and the maximum likelihood problem is ill-defined.
+Maximizing the likelihood is equivalent to maximizing the log-likelihood,
+hence, we want to find:
+\begin{displaymath}
+    \hat{\alpha} \in \arg\max_\alpha \sum_{i=1}^n
+    \log\left(\frac{\alpha-1}{x_0}\left(\frac{x_i}{x_0}\right)^{-\alpha}\right)
+\end{displaymath}
+removing constant terms, this is equivalent to finding:
+\begin{displaymath}
+    \hat{\alpha} \in \arg\max_\alpha n\log(\alpha-1)
+    - \alpha\sum_{i=1}^n\log\frac{x_i}{x_0}
+\end{displaymath}
+The function that we are maximizing converges to $-\infty$ as $\alpha$
+converges to $1$ and to $+\infty$, hence it has a global maximum in
+$(1,\infty)$. We find this maximum by solving the critical point equation
+(finding points where the derivate is zero):
+\begin{displaymath}
+    \frac{n}{\alpha-1} = \sum_{i=1}^n\log\frac{x_i}{x_0}
+\end{displaymath}
+This equation has a unique solution which is the maximum likelihood estimator:
+\begin{displaymath}
+    \boxed{
+        \hat{a} = 1 +\frac{n}{\sum_{i=1}^n\log\frac{x_i}{x_0}}
+}
+\end{displaymath}
+
+\section*{Problem 5}
+
+In addition to the three provided datasets, I used the largest strongly
+connected component of the PGP graph.  PGP is data encryption software using
+asymmetric encryption. Asymmetric encryption relies critically on being able to
+associate a public key to an owner (user or institution) in a tamper-proof way.
+This is usually done by using certificate authorities (for example in SSL). In
+contrast, PGP relies on building a \emph{web of trust}: users certificate the
+ownership of the keys of their friends, which in turn certificate the keys of
+their friends, thus propagating the trust and building a web of trust. As
+predicted by the random graph theory, it is assumed that the PGP web of trust
+has a single large strongly connected component, which is called the \emph{PGP
+strong-set}. It is easy to discover this strong-set by doing a breadth first
+traversal of the trust links, starting from famous privacy activists.
+
+\paragraph{(a)} The degree distributions for the four networks can be found in
+Figure~\ref{fig:deg}. Because of the quick decay, these plots are hard to read
+(all the points are concentrated in the bottom left corner). See the next
+question for easier to read graphs.
+
+\begin{figure}
+    \includegraphics[width=\textwidth]{degr.pdf}
+    \caption{Degree distributions.}
+    \label{fig:deg}
+\end{figure}
+
+\paragraph{(b)} The degree distributions in log-log scale can be found in
+Figure~\ref{fig:deg-log}. We observe that these distributions approximately
+look like lines. This is characteristic of a power-law degree distribution.
+Indeed, if $f(d) = kd^{-\alpha}$, then $\log f(d) = \log  k - \alpha \log d$.
+Hence if the degree distribution follows a power law, the distribution is
+a line in log-log scale. The slope of the line is the negative of power-law
+exponent.
+\begin{figure}
+    \includegraphics[width=\textwidth]{degr_log.pdf}
+    \caption{Degree distributions in log-log scale.}
+    \label{fig:deg-log}
+\end{figure}
+
+\paragraph{(c)} I computed the harmonic centrality (see code) and plotted its
+distribution across nodes. This can be found in Figure~\ref{fig:centrality}.
+\begin{figure}
+    \includegraphics[width=\textwidth]{centrality.pdf}
+    \caption{Harmonic centrality distributions.}
+    \label{fig:centrality}
+\end{figure}
+
+\paragraph{(e)} I computed the voter model influence (see code) for $t=100$ and
+plotted its distribution across nodes in log-log scale. This can be found in
+Figure~\ref{fig:voter}.
+\begin{figure}
+    \includegraphics[width=\textwidth]{voter.pdf}
+    \caption{Voter model influence distributions in log-log scale}
+    \label{fig:voter}
+\end{figure}
+
+\paragraph{(e)} I computed the pagerank for $t=100$ (see code) and plotted its
+distribution across nodes in log-log scale. This can be found in
+Figure~\ref{fig:pr}.
+\begin{figure}
+    \includegraphics[width=\textwidth]{pr.pdf}
+    \caption{Page ranks distributions in log-log scale}
+    \label{fig:pr}
+\end{figure}
+\end{document}
diff --git a/ps1/pr.pdf b/ps1/pr.pdf
new file mode 100644
index 0000000..dff6d18
--- /dev/null
+++ b/ps1/pr.pdf
diff --git a/ps1/voter.pdf b/ps1/voter.pdf
new file mode 100644
index 0000000..5ccf46f
--- /dev/null
+++ b/ps1/voter.pdf