\documentclass[10pt]{article} \usepackage{fullpage} \usepackage[utf8x]{inputenc} \usepackage{amsmath,amsfonts,amsthm} \usepackage[english]{babel} \usepackage[capitalize, noabbrev]{cleveref} \usepackage{algorithm} \usepackage{algpseudocode} \usepackage{graphicx} \newenvironment{enumerate*}% {\vspace{-2ex} \begin{enumerate} % \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}% {\end{enumerate}} \newenvironment{itemize*}% {\vspace{-2ex} \begin{itemize} % \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}% {\end{itemize}} \newenvironment{description*}% {\vspace{-2ex} \begin{description} % \setlength{\itemsep}{-1ex} \setlength{\parsep}{0pt}}% {\end{description}} \DeclareMathOperator{\E}{\mathbb{E}} \let\P\relax \DeclareMathOperator{\P}{\mathbb{P}} \newcommand{\ex}[1]{\E\left[#1\right]} \newcommand{\prob}[1]{\P\left[#1\right]} \newcommand{\inprod}[1]{\left\langle #1 \right\rangle} \newcommand{\R}{\mathbf{R}} \newcommand{\N}{\mathbf{N}} \newcommand{\C}{\mathcal{C}} \newcommand{\eps}{\varepsilon} \newcommand{\eqdef}{\mathbin{\stackrel{\rm def}{=}}} \newcommand{\llbracket}{[\![} \newtheorem{theorem}{Theorem} \newtheorem{lemma}{Lemma} \author{Thibaut Horel} \title{CS 28r Problem Set 1 -- Solutions} \date{October 29, 2014} \begin{document} \maketitle \section*{Problem 1} \paragraph{(a)} We first prove that submodularity implies the following property: \begin{equation}\label{eq:sub} \forall S,T\subseteq N,\; f(S\cup T) + f(S\cap T) \leq f(S) + f(T). \end{equation} For this consider $A = S\cap T$, and $B = T\setminus S$. For an arbitrary ordering $B = \{b_1,\ldots, b_m\}$ of the elements in $B$, let us write $B_i = \{b_1,\ldots, b_i\}$ with the convention that $B_0 = \emptyset$. 
Then one can write: \begin{displaymath} f(T) - f(A) = \sum_{i=1}^m f(A\cup B_i) - f(A\cup B_{i-1}) \leq \sum_{i=1}^m f(S\cup B_i) - f(S\cup B_{i-1}) = f(S\cup T) - f(S) \end{displaymath} where the first equality is simply a telescopic sum, the inequality directly follows from submodularity since $A\cup B_{i-1}\subseteq S\cup B_{i-1}$, and the last equality is a telescopic sum again. Reordering the terms in the previous equation and using the definition of $A$ we obtain Equation~\ref{eq:sub}. Subadditivity directly follows from Equation~\ref{eq:sub} by observing that by positivity of the function $f$, $f(S\cup T) + f(S\cap T) \geq f(S\cup T)$. \paragraph{(b)} We first show that if $f$ is submodular, then the marginal contribution function is subadditive. If $f$ is submodular, it is easy to see that $f_S$ is also submodular. Indeed, for $T\subseteq T'$ and $u\in N\setminus T'$: \begin{displaymath} f_S(T\cup\{u\}) - f_S(T) = f(S\cup T\cup\{u\}) - f(S\cup T) \geq f(S\cup T'\cup\{u\}) - f(S\cup T') = f_S(T'\cup\{u\}) - f_S(T') \end{displaymath} where the inequality follows from the submodularity of $f$. We saw in \textbf{(a)} that a submodular function is subadditive, hence $f_S$ is subadditive. Conversely, let us assume that $f_S$ is subadditive for all $S$ and let us consider two sets $S\subseteq T$ and $u\in N\setminus T$. Then: \begin{displaymath} f_S(T\cup\{u\})\leq f_S(T) + f_S(\{u\}) \end{displaymath} using the definition of $f_S$, this is equivalent to: \begin{displaymath} f(S\cup T\cup\{u\}) - f(S)\leq f(S\cup T) - f(S) + f(S\cup\{u\}) - f(S) \end{displaymath} observing that $S\cup T = T$ and reordering the terms we obtain: \begin{displaymath} f(T\cup\{u\}) - f(T)\leq f(S\cup\{u\}) - f(S) \end{displaymath} which is exactly the submodularity of $f$. \section*{Problem 2} We adapt the proof of the approximation ratio of the greedy algorithm from Lecture 5. 
Using the same notation, if $o_{max}$ is the element of highest marginal contribution in $O$ at stage $i+1$ and $a_{i+1}$ is the element added by the greedy algorithm (with approximate oracle) at step $i+1$, we have: \begin{displaymath} \tilde{f}_{S_i}(a_{i+1}) \geq \tilde{f}_{S_i}(o_{max}) \end{displaymath} now using that $\tilde{f}$ is an $\alpha$-approximate oracle, we can upper bound $\tilde{f}_{S_i}(a_{i+1})$ by $f_{S_i}(a_{i+1})$ and lower bound $\tilde{f}_{S_i}(o_{max})$ by $\alpha f_{S_i}(o_{max})$ and obtain: \begin{displaymath} f_{S_i}(o_{max})\leq \frac{1}{\alpha}f_{S_i}(a_{i+1}) \end{displaymath} plugging this inequality into equation (3) in the proof of Lemma 8 directly gives the ``modified'' lemma for an $\alpha$-approximate oracle: \begin{displaymath} f(S_{i+1}) - f(S_i) \geq \frac{\alpha}{k} (f(O) - f(S_i)),\; i\in[k] \end{displaymath} From there, applying verbatim the proof of Lemma 9, we obtain by induction: \begin{equation}\label{eq:ind} f(S_i)\geq \left(1-\left(1-\frac{\alpha}{k}\right)^i\right)f(O),\; i\in[k] \end{equation} We then conclude by observing that: \begin{displaymath} \left(1-\frac{\alpha}{k}\right)^k = e^{k\log (1-\frac{\alpha}{k})}\leq e^{-\alpha} \end{displaymath} where we used the inequality $\log(1+x)\leq x$. Plugging this into \eqref{eq:ind} gives: \begin{displaymath} f(S_k)\geq \left(1-\frac{1}{e^{\alpha}}\right)f(O) \end{displaymath} which is exactly what we wanted. \section*{Problem 3} In this exercise we denote by $I(S)$ the influence of a set $S$ in the independent cascade model. \paragraph{(a)} As we saw in class, the influence of a node $v$ in the independent cascade model is exactly the expectation of the number of nodes reachable from $v$ over all possible realizations of subgraphs $G'$ of $G$ where each edge $(u,v)$ realizes independently with probability $p_{u,v}$. Let us write $I(v)$ for the influence of $v$. 
The algorithm SampleInfluence is simply considering $m$ independent realizations of $G'$ and averaging the number of nodes reachable from $v$ over these $m$ realizations. It suffices to find $m$ such that: \begin{displaymath} \Pr\left[\left|\frac{1}{m}\sum_{i=1}^m r_i - I(v)\right| > \eps I(v)\right]\leq \gamma \end{displaymath} indeed, the complement of this event implies that $\frac{1}{m}\sum_i r_i$ is a $(1-\eps)$-approximation of $I(v)$, and it occurs with probability at least $1-\gamma$. Applying the Chernoff bound with $\delta = \eps m I(v)$, we obtain: \begin{displaymath} \Pr\left[\left|\frac{1}{m}\sum_{i=1}^m r_i - I(v)\right| > \eps I(v)\right]\leq 2e^{\frac{-m\eps^2I(v)^2}{n^2}} \end{displaymath} since the variables $r_i$ clearly live in the interval $[0, n]$ (we take $b = n$). Now we assume that $I(v)\geq 1$\footnote{This depends on how you formulate the independent cascade model: do we always consider that a node influences itself? If the assumption doesn't hold, we can lower bound $I(v)$ by $p_{min}$ where $p_{min}$ is the minimum probability that appears on any edge in $G$, and all the following results go through by replacing $1$ by $p_{min}$.}. Taking $m = \lceil\frac{n^2}{\eps^2}\log(2/\gamma)\rceil$, we obtain: \begin{displaymath} \Pr\left[\left|\frac{1}{m}\sum_{i=1}^m r_i - I(v)\right| > \eps I(v)\right]\leq \gamma \end{displaymath} and $\frac{1}{m}\sum_i r_i$ is a $(1-\eps)$-approximation of $I(v)$ with probability at least $1-\gamma$ as we wanted. \paragraph{(b)} We first note that we can adapt question \textbf{(a)} to obtain $\tilde{I}_S(x)$ an approximate oracle of $I_S(x)$ for any $S$ and $x$. Let us denote by $R(S, G)$ the set of nodes reachable from any node in $S$ in graph $G$. We modify the SampleInfluence algorithm as follows: we generate $m$ realizations $G_i$ ($i\in[m]$) of the random graph and for each realization $i\in[m]$, we define $r_i$ to be $|R(S\cup\{x\}, G_i)| - |R(S, G_i)|$, and $\tilde{I}_S(x)$ to be $\frac{1}{m}\sum_i r_i$. 
By applying the same Chernoff bound as in question \textbf{(a)}, we can guarantee that choosing $m = \lceil\frac{n^2}{\eps^2}\log(2/\gamma)\rceil$ implies that: \begin{equation}\label{eq:oracle} \Pr\left[\left|\tilde{I}_S(x) - I_S(x)\right| > \eps I_S(x)\right]\leq \gamma \end{equation} Now we want to use this approximate oracle $\tilde{I}_S(x)$ the same way as in Problem 2. We note that the greedy algorithm is going to make at most $nk$ calls to the approximate oracle. Hence, by choosing $\gamma = \frac{\delta}{nk}$ and applying a union bound on \eqref{eq:oracle}, we get that with probability at least $1-\delta$, each time we call the oracle we will have: \begin{displaymath} (1-\eps)I_S(x)\leq\tilde{I}_S(x) \leq (1+\eps)I_S(x) \end{displaymath} Dividing this previous inequality by $1+\eps$, we obtain \begin{displaymath} \frac{1-\eps}{1+\eps}I_S(x)\leq\frac{1}{1+\eps}\tilde{I}_S(x) \leq I_S(x) \end{displaymath} Since the choices of the greedy algorithm are unchanged when the oracle is scaled by the positive constant $\frac{1}{1+\eps}$, applying the result of Problem 2 with $\alpha = \frac{1-\eps}{1+\eps}$, we obtain that using $\tilde{I}_S(x)$ as an approximate oracle in the greedy algorithm yields an approximation ratio of: \begin{displaymath} \rho = 1-\frac{1}{e^{\frac{1-\eps}{1+\eps}}} \end{displaymath} But a Taylor expansion of $\rho$ shows that: \begin{displaymath} \rho \geq 1-\frac{1}{e} - C\eps \end{displaymath} for some constant $C$. Choosing $\eps' = \eps/C$, we get that by using $m = \lceil\frac{n^2C^2}{\eps^{2}}\log(2nk/\delta)\rceil$ samples each time we invoke the oracle, the greedy algorithm will return a $1-1/e-\eps$ approximation with probability at least $1-\delta$. We are doing $nk$ calls to the oracle, each call can be computed in time $O(mn^3)$ (generating a sample takes time $n^2$ at most and a BFS traversal takes linear time). Given the expression we have for $m$, the overall complexity is $O(\frac{kn^6}{\eps^2}\log\frac{nk}{\delta})$ which is polynomial in the number of nodes. 
\section*{Problem 4} We assume that $x_0\leq \min_i x_i$, otherwise the power-law distribution cannot explain the data and the maximum likelihood problem is ill-defined. Maximizing the likelihood is equivalent to maximizing the log-likelihood, hence, we want to find: \begin{displaymath} \hat{\alpha} \in \arg\max_\alpha \sum_{i=1}^n \log\left(\frac{\alpha-1}{x_0}\left(\frac{x_i}{x_0}\right)^{-\alpha}\right) \end{displaymath} removing constant terms, this is equivalent to finding: \begin{displaymath} \hat{\alpha} \in \arg\max_\alpha n\log(\alpha-1) - \alpha\sum_{i=1}^n\log\frac{x_i}{x_0} \end{displaymath} The function that we are maximizing converges to $-\infty$ both as $\alpha$ tends to $1$ and as $\alpha$ tends to $+\infty$, hence it has a global maximum in $(1,\infty)$. We find this maximum by solving the critical point equation (finding points where the derivative is zero): \begin{displaymath} \frac{n}{\alpha-1} = \sum_{i=1}^n\log\frac{x_i}{x_0} \end{displaymath} This equation has a unique solution which is the maximum likelihood estimator: \begin{displaymath} \boxed{ \hat{\alpha} = 1 +\frac{n}{\sum_{i=1}^n\log\frac{x_i}{x_0}} } \end{displaymath} \section*{Problem 5} In addition to the three provided datasets, I used the largest strongly connected component of the PGP graph. PGP is data encryption software using asymmetric encryption. Asymmetric encryption relies critically on being able to associate a public key to an owner (user or institution) in a tamper-proof way. This is usually done by using certificate authorities (for example in SSL). In contrast, PGP relies on building a \emph{web of trust}: users certify the ownership of the keys of their friends, which in turn certify the keys of their friends, thus propagating the trust and building a web of trust. As predicted by random graph theory, it is assumed that the PGP web of trust has a single large strongly connected component, which is called the \emph{PGP strong-set}. 
It is easy to discover this strong-set by doing a breadth-first traversal of the trust links, starting from famous privacy activists. \paragraph{(a)} The degree distributions for the four networks can be found in Figure~\ref{fig:deg}. Because of the quick decay, these plots are hard to read (all the points are concentrated in the bottom left corner). See the next question for easier-to-read graphs. \begin{figure} \includegraphics[width=\textwidth]{degr.pdf} \caption{Degree distributions.} \label{fig:deg} \end{figure} \paragraph{(b)} The degree distributions in log-log scale can be found in Figure~\ref{fig:deg-log}. We observe that these distributions approximately look like lines. This is characteristic of a power-law degree distribution. Indeed, if $f(d) = kd^{-\alpha}$, then $\log f(d) = \log k - \alpha \log d$. Hence if the degree distribution follows a power law, the distribution is a line in log-log scale. The slope of the line is the negative of the power-law exponent. \begin{figure} \includegraphics[width=\textwidth]{degr_log.pdf} \caption{Degree distributions in log-log scale.} \label{fig:deg-log} \end{figure} \paragraph{(c)} I computed the harmonic centrality (see code) and plotted its distribution across nodes. This can be found in Figure~\ref{fig:centrality}. \begin{figure} \includegraphics[width=\textwidth]{centrality.pdf} \caption{Harmonic centrality distributions.} \label{fig:centrality} \end{figure} \paragraph{(d)} I computed the voter model influence (see code) for $t=100$ and plotted its distribution across nodes in log-log scale. This can be found in Figure~\ref{fig:voter}. \begin{figure} \includegraphics[width=\textwidth]{voter.pdf} \caption{Voter model influence distributions in log-log scale.} \label{fig:voter} \end{figure} \paragraph{(e)} I computed the PageRank for $t=100$ (see code) and plotted its distribution across nodes in log-log scale. This can be found in Figure~\ref{fig:pr}. 
\begin{figure} \includegraphics[width=\textwidth]{pr.pdf} \caption{PageRank distributions in log-log scale.} \label{fig:pr} \end{figure} \end{document}