author     Thibaut Horel <thibaut.horel@gmail.com>    2012-10-30 12:37:58 +0100
committer  Thibaut Horel <thibaut.horel@gmail.com>    2012-10-30 12:37:58 +0100
commit     fe8aa3a0134a9493a2443a0965392254a7125094 (patch)
tree       f99d8e971597bac36ba35ba0cf7ddae529e6b5f0
parent     0d80fbea985c73831e9e20a97e259adf864f41be (diff)
download   recommendation-fe8aa3a0134a9493a2443a0965392254a7125094.tar.gz
Making notations more consistent, typos
-rw-r--r--  definitions.tex |  2
-rw-r--r--  main.tex        | 76
-rw-r--r--  problem.tex     | 57
3 files changed, 68 insertions, 67 deletions
diff --git a/definitions.tex b/definitions.tex
index 378781d..e603e5a 100644
--- a/definitions.tex
+++ b/definitions.tex
@@ -19,3 +19,5 @@
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\reals}{\ensuremath{\mathbbm{R}}}
\newcommand{\stratis}[1]{\textcolor{red}{Stratis: #1}}
+\newcommand{\thibaut}[1]{\textcolor{blue}{Thibaut: #1}}
+\newcommand{\T}[1]{#1^T}
diff --git a/main.tex b/main.tex
index 189156c..148aedc 100644
--- a/main.tex
+++ b/main.tex
@@ -37,7 +37,7 @@ Here, we use a relaxation of the objective function which is tailored to the
problem of ridge regression. We define:
\begin{displaymath}
\forall\lambda\in[0,1]^{|\mathcal{N}|}\,\quad L_{\mathcal{N}}(\lambda) \defeq
- \log\det\left(I_d + \mu\sum_{i\in\mathcal{N}} \lambda_i x_i x_i^*\right)
+ \log\det\left(I_d + \mu\sum_{i\in\mathcal{N}} \lambda_i x_i \T{x_i}\right)
\end{displaymath}
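As a quick illustration of the relaxation defined above, here is a minimal numerical sketch, assuming numpy; the name concave_relaxation and the convention that X stacks the vectors x_i as rows are illustrative choices, not notation from the paper.

import numpy as np

def concave_relaxation(lam, X, mu):
    """L_N(lambda) = log det(I_d + mu * sum_i lambda_i x_i x_i^T)."""
    d = X.shape[1]
    A_tilde = np.eye(d) + mu * (X.T * lam) @ X   # sum_i lambda_i x_i x_i^T
    return np.linalg.slogdet(A_tilde)[1]         # slogdet for numerical stability

# At a 0/1 vector lam this equals the (unnormalized) value of the corresponding subset.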
We can now present the mechanism we use, which has the same flavor as Chen's and
@@ -47,7 +47,7 @@ Singer's
\caption{Mechanism for ridge regression}
\begin{algorithmic}[1]
\State $i^* \gets \argmax_{j\in\mathcal{N}}V(j)$
- \State $x^* \gets \argmax_{x\in[0,1]^n} \{L_{\mathcal{N}\setminus\{i^*\}}(x)
+ \State $x^* \gets \argmax_{x\in[0,1]^{|\mathcal{N}|}} \{L_{\mathcal{N}\setminus\{i^*\}}(x)
\,|\, c(x)\leq B\}$
\Statex
\If{$L(x^*) < CV(i^*)$}
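The two steps visible in this hunk can be sketched roughly as follows, assuming numpy/scipy; packaging V(j) as an array values, the reported costs as costs, the budget as B and the mechanism's constant as C is an assumption of the sketch, and since the branches of the test are elided in this hunk only the comparison is returned.

import numpy as np
from scipy.optimize import minimize

def mechanism_sketch(X, costs, values, B, C, mu):
    n, d = X.shape
    i_star = int(np.argmax(values))                     # i* = argmax_j V(j)
    others = np.array([j for j in range(n) if j != i_star])

    def neg_L(lam):                                     # -L_{N \ {i*}}(lam)
        A = np.eye(d) + mu * (X[others].T * lam) @ X[others]
        return -np.linalg.slogdet(A)[1]

    res = minimize(neg_L, np.full(len(others), 0.5),
                   bounds=[(0.0, 1.0)] * len(others),
                   constraints=[{"type": "ineq",        # c(x) <= B
                                 "fun": lambda lam: B - costs[others] @ lam}],
                   method="SLSQP")
    L_star = -res.fun
    return i_star, res.x, L_star < C * values[i_star]   # the \If test; outcomes not shown in this hunk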
@@ -118,7 +118,7 @@ The mechanism is monotone.
\end{displaymath}
in an earlier iteration of the greedy heuristic. Let us denote by $S_i$
(resp. $S_i'$) the set to which $i$ is added when reporting cost $c_i$
- (resp. $c_i'$). We have $S_i'\subset S_i$. Moreover:
+ (resp. $c_i'$). We have $S_i'\subseteq S_i$. Moreover:
\begin{align*}
c_i' & \leq c_i \leq
\frac{B}{2}\frac{V(S_i\cup\{i\})-V(S_i)}{V(S_i\cup\{i\})}\\
@@ -245,30 +245,30 @@ of the objective function and thus ensures a constant approximation of the
value function. The difficulty resides in showing that the ratio of the two
relaxations is bounded.
-We say that $R_\mathcal{N}:[0,1]^n\rightarrow\mathbf{R}$ is a relaxation of the
+We say that $R_\mathcal{N}:[0,1]^{|\mathcal{N}|}\rightarrow\reals$ is a relaxation of the
value function $V$ over $\mathcal{N}$ if it coincides with $V$ at binary
-points. Formally, for any $S\subset\mathcal{N}$, let $\mathbf{1}_S$ denote the
+points. Formally, for any $S\subseteq\mathcal{N}$, let $\mathbf{1}_S$ denote the
indicator vector of $S$. $R_\mathcal{N}$ is a relaxation of $V$ over
$\mathcal{N}$ iff:
\begin{displaymath}
- \forall S\subset\mathcal{N},\; R_\mathcal{N}(\mathbf{1}_S) = V(S)
+ \forall S\subseteq\mathcal{N},\; R_\mathcal{N}(\mathbf{1}_S) = V(S)
\end{displaymath}
We can extend the optimisation problem defined above to a relaxation by
extending the cost function:
\begin{displaymath}
- \forall \lambda\in[0,1]^n,\; c(\lambda)
+ \forall \lambda\in[0,1]^{|\mathcal{N}|},\; c(\lambda)
= \sum_{i\in\mathcal{N}}\lambda_ic_i
\end{displaymath}
The optimisation problem becomes:
\begin{displaymath}
OPT(R_\mathcal{N}, B) =
- \max_{\lambda\in[0,1]^n}\left\{R_\mathcal{N}(\lambda)\,|\, c(\lambda)\leq B\right\}
+ \max_{\lambda\in[0,1]^{|\mathcal{N}|}}\left\{R_\mathcal{N}(\lambda)\,|\, c(\lambda)\leq B\right\}
\end{displaymath}
The relaxations we will consider here rely on defining a probability
distribution over subsets of $\mathcal{N}$.
-Let $\lambda\in[0,1]^n$, let us define:
+Let $\lambda\in[0,1]^{|\mathcal{N}|}$, let us define:
\begin{displaymath}
P_\mathcal{N}^\lambda(S) = \prod_{i\in S}\lambda_i
\prod_{i\in\mathcal{N}\setminus S}(1-\lambda_i)
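A small sketch of this product distribution, assuming numpy; sampling S ~ P_N^lambda amounts to keeping each i independently with probability lambda_i (the function names are illustrative).

import numpy as np

def sample_subset(lam, rng):
    """Draw S ~ P_N^lambda: keep each i independently with probability lambda_i."""
    return np.flatnonzero(rng.random(len(lam)) < lam)

def subset_probability(lam, S):
    """P_N^lambda(S) = prod_{i in S} lambda_i * prod_{i not in S} (1 - lambda_i)."""
    keep = np.zeros(len(lam), dtype=bool)
    keep[list(S)] = True
    return float(np.prod(np.where(keep, lam, 1.0 - lam)))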
@@ -284,17 +284,17 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
\begin{align*}
F_\mathcal{N}(\lambda)
& = \mathbb{E}_{S\sim P_\mathcal{N}^\lambda}\big[\log\det A(S)\big]\\
- & = \sum_{S\subset\mathcal{N}} P_\mathcal{N}^\lambda(S) V(S)\\
- & = \sum_{S\subset\mathcal{N}} P_\mathcal{N}^\lambda(S) \log\det A(S)\\
+ & = \sum_{S\subseteq\mathcal{N}} P_\mathcal{N}^\lambda(S) V(S)\\
+ & = \sum_{S\subseteq\mathcal{N}} P_\mathcal{N}^\lambda(S) \log\det A(S)\\
\end{align*}
\item the \emph{concave relaxation} of $V$:
\begin{align*}
L_{\mathcal{N}}(\lambda)
& = \log\det \mathbb{E}_{S\sim P_\mathcal{N}^\lambda}\big[A(S)\big]\\
- & = \log\det\left(\sum_{S\subset N}
+ & = \log\det\left(\sum_{S\subseteq N}
P_\mathcal{N}^\lambda(S)A(S)\right)\\
& = \log\det\left(I_d + \mu\sum_{i\in\mathcal{N}}
- \lambda_ix_ix_i^*\right)\\
+ \lambda_ix_i\T{x_i}\right)\\
& \defeq \log\det \tilde{A}(\lambda)
\end{align*}
\end{itemize}
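To make the contrast between the two relaxations concrete, here is a sketch of a Monte Carlo estimate of the multilinear relaxation F_N, assuming numpy; by Jensen's inequality (log det is concave on positive definite matrices) it lies below the concave relaxation sketched earlier, up to sampling noise.

import numpy as np

def multilinear_relaxation_mc(lam, X, mu, n_samples=2000, rng=None):
    """F_N(lambda) = E_{S ~ P_N^lambda}[log det A(S)], estimated by sampling."""
    rng = np.random.default_rng() if rng is None else rng
    n, d = X.shape
    total = 0.0
    for _ in range(n_samples):
        keep = rng.random(n) < lam                 # S ~ P_N^lambda
        A = np.eye(d) + mu * X[keep].T @ X[keep]   # A(S)
        total += np.linalg.slogdet(A)[1]
    return total / n_samples

# multilinear_relaxation_mc(lam, X, mu) <= concave_relaxation(lam, X, mu), up to noise.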
@@ -315,8 +315,8 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
\end{proof}
\begin{lemma}[Rounding]\label{lemma:rounding}
- For any feasible $\lambda\in[0,1]^n$, there exists a feasible
- $\bar{\lambda}\in[0,1]^n$ such that at most one of its component is
+ For any feasible $\lambda\in[0,1]^{|\mathcal{N}|}$, there exists a feasible
+ $\bar{\lambda}\in[0,1]^{|\mathcal{N}|}$ such that at most one of its components is
fractional, that is, lies in $(0,1)$ and:
\begin{displaymath}
F_{\mathcal{N}}(\lambda)\leq F_{\mathcal{N}}(\bar{\lambda})
@@ -373,7 +373,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
\begin{lemma}\label{lemma:relaxation-ratio}
The following inequality holds:
\begin{displaymath}
- \forall\lambda\in[0,1]^n,\;
+ \forall\lambda\in[0,1]^{|\mathcal{N}|},\;
\frac{\log\big(1+\mu\big)}{2\mu}
\,L_\mathcal{N}(\lambda)\leq
F_\mathcal{N}(\lambda)\leq L_{\mathcal{N}}(\lambda)
@@ -422,9 +422,9 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
Hence:
\begin{multline*}
\partial_i F_\mathcal{N} =
- \sum_{\substack{S\subset\mathcal{N}\\ i\in S}}
+ \sum_{\substack{S\subseteq\mathcal{N}\\ i\in S}}
P_{\mathcal{N}\setminus\{i\}}^\lambda(S\setminus\{i\})V(S)\\
- - \sum_{\substack{S\subset\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
+ - \sum_{\substack{S\subseteq\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
P_{\mathcal{N}\setminus\{i\}}^\lambda(S)V(S)\\
\end{multline*}
@@ -432,9 +432,9 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
$S'\cup\{i\}$, we can write:
\begin{multline*}
\partial_i F_\mathcal{N} =
- \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+ \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
P_{\mathcal{N}\setminus\{i\}}^\lambda(S)V(S\cup\{i\})\\
- - \sum_{\substack{S\subset\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
+ - \sum_{\substack{S\subseteq\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
P_{\mathcal{N}\setminus\{i\}}^\lambda(S)V(S)\\
\end{multline*}
@@ -442,50 +442,50 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
$S$:
\begin{displaymath}
\partial_i F_\mathcal{N}(\lambda) =
- \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+ \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
P_{\mathcal{N}\setminus\{i\}}^\lambda(S)
- \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)
+ \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)
\end{displaymath}
The computation of the derivative of $L_\mathcal{N}$ uses standard matrix
calculus and gives:
\begin{displaymath}
\partial_i L_\mathcal{N}(\lambda)
- = \mu x_i^* \tilde{A}(\lambda)^{-1}x_i
+ = \mu \T{x_i}\tilde{A}(\lambda)^{-1}x_i
\end{displaymath}
Using the following inequalities:
\begin{gather*}
- \forall S\subset\mathcal{N}\setminus\{i\},\quad
+ \forall S\subseteq\mathcal{N}\setminus\{i\},\quad
P_{\mathcal{N}\setminus\{i\}}^\lambda(S)\geq
P_{\mathcal{N}\setminus\{i\}}^\lambda(S\cup\{i\})\\
- \forall S\subset\mathcal{N},\quad P_{\mathcal{N}\setminus\{i\}}^\lambda(S)
+ \forall S\subseteq\mathcal{N},\quad P_{\mathcal{N}\setminus\{i\}}^\lambda(S)
\geq P_\mathcal{N}^\lambda(S)\\
- \forall S\subset\mathcal{N},\quad A(S)^{-1} \geq A(S\cup\{i\})^{-1}\\
+ \forall S\subseteq\mathcal{N},\quad A(S)^{-1} \geq A(S\cup\{i\})^{-1}\\
\end{gather*}
we get:
\begin{align*}
\partial_i F_\mathcal{N}(\lambda)
- & \geq \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+ & \geq \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
P_\mathcal{N}^\lambda(S)
- \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)\\
+ \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)\\
& \geq \frac{1}{2}
- \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+ \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
P_\mathcal{N}^\lambda(S)
- \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)\\
+ \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)\\
&\hspace{-3.5em}+\frac{1}{2}
- \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+ \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
P_\mathcal{N}^\lambda(S\cup\{i\})
- \log\Big(1 + \mu x_i^*A(S\cup\{i\})^{-1}x_i\Big)\\
+ \log\Big(1 + \mu \T{x_i}A(S\cup\{i\})^{-1}x_i\Big)\\
&\geq \frac{1}{2}
- \sum_{S\subset\mathcal{N}}
+ \sum_{S\subseteq\mathcal{N}}
P_\mathcal{N}^\lambda(S)
- \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)\\
+ \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)\\
\end{align*}
Using that $A(S)\geq I_d$ we get that:
\begin{displaymath}
- \mu x_i^*A(S)^{-1}x_i \leq \mu
+ \mu \T{x_i}A(S)^{-1}x_i \leq \mu
\end{displaymath}
Moreover:
@@ -498,7 +498,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
\begin{displaymath}
\partial_i F_\mathcal{N}(\lambda) \geq
\frac{\log\big(1+\mu\big)}{2\mu}
- x_i^*\bigg(\sum_{S\subset\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)^{-1}\bigg)x_i
+ \T{x_i}\bigg(\sum_{S\subseteq\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)^{-1}\bigg)x_i
\end{displaymath}
Finally, using that the inverse is a matrix convex function over symmetric
@@ -506,7 +506,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
\begin{align*}
\partial_i F_\mathcal{N}(\lambda) &\geq
\frac{\log\big(1+\mu\big)}{2\mu}
- x_i^*\bigg(\sum_{S\subset\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)\bigg)^{-1}x_i\\
+ \T{x_i}\bigg(\sum_{S\subseteq\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)\bigg)^{-1}x_i\\
& \geq \frac{\log\big(1+\mu\big)}{2\mu}
\partial_i L_\mathcal{N}(\lambda)
\end{align*}
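The closed form for the derivative of L_N used in this proof is easy to check numerically; a sketch, assuming numpy and the concave_relaxation function sketched earlier in this diff.

import numpy as np

def grad_L_closed_form(lam, X, mu):
    """partial_i L_N(lambda) = mu * x_i^T Atilde(lambda)^{-1} x_i."""
    d = X.shape[1]
    A_inv = np.linalg.inv(np.eye(d) + mu * (X.T * lam) @ X)
    return mu * np.einsum('ij,jk,ik->i', X, A_inv, X)   # x_i^T A_inv x_i for each i

def grad_L_finite_diff(lam, X, mu, eps=1e-6):
    g = np.zeros(len(lam))
    for i in range(len(lam)):
        e = np.zeros(len(lam)); e[i] = eps
        g[i] = (concave_relaxation(lam + e, X, mu)
                - concave_relaxation(lam - e, X, mu)) / (2 * eps)
    return g

# np.allclose(grad_L_closed_form(lam, X, mu), grad_L_finite_diff(lam, X, mu), atol=1e-4)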
@@ -515,7 +515,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
We can now prove lemma TODO from the previous section.
\begin{proof}
- Let us consider a feasible point $\lambda^*\in[0,1]^n$ such that $L_\mathcal{N}(\lambda^*)
+ Let us consider a feasible point $\lambda^*\in[0,1]^{|\mathcal{N}|}$ such that $L_\mathcal{N}(\lambda^*)
= OPT(L_\mathcal{N}, B)$. By applying lemma~\ref{lemma:relaxation-ratio}
and lemma~\ref{lemma:rounding} we get a feasible point $\bar{\lambda}$ with at most
one fractional component such that:
diff --git a/problem.tex b/problem.tex
index 3faef3b..75529ff 100644
--- a/problem.tex
+++ b/problem.tex
@@ -15,7 +15,7 @@ For example, the features could be the age, weight, or height of user $i$, while
The variables $x_i$ and $y_i$ are related through the following relationship:
\begin{align}
- y_i = \beta^T x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},\label{model}
+ y_i = \T{\beta} x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},\label{model}
\end{align}
where $\beta\in\reals^d$ is the unknown parameter and $\varepsilon_i\in\reals$ are independent, normally distributed random variables, with zero mean and variance $\sigma^2_0$.
The experimenter wishes to learn $\beta$ by observing the undisclosed $y_i$ of a subset of users $i\in S$. Hence, the set of possible experiments comprises all random variables $y_{S}\in\reals^{|S|}$, $S\subseteq \mathcal{N}$.
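For concreteness, the linear model \eqref{model} can be simulated as follows; a minimal sketch assuming numpy, with arbitrary dimensions and seed.

import numpy as np

rng = np.random.default_rng(0)
n, d = 50, 5
X = rng.standard_normal((n, d))                  # rows are the public features x_i
beta = rng.standard_normal(d)                    # unknown parameter beta
sigma0 = 1.0                                     # noise standard deviation sigma_0
y = X @ beta + sigma0 * rng.standard_normal(n)   # y_i = beta^T x_i + eps_i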
@@ -27,11 +27,11 @@ Learning $\beta$ has many interesting applications, that make linear regression
In the Bayesian setting,
it is commonly assumed that $\beta$ follows a
multivariate normal distribution of mean zero and covariance matrix $\sigma_1^2
-I_d$. Under this prior and the linear model \eqref{model}, the value of information \eqref{voi} of an experiment $Y_S$ is given by \cite{...}
+I_d$. Under this prior and the linear model \eqref{model}, the value of information \eqref{voi} of an experiment $y_S$ is given by \cite{...}
\begin{align}\label{vs}
V(S)
& \defeq I(\beta;y_S) = \frac{1}{2}\log\det\left(I_d
- + \mu\sum_{i\in S} x_ix_i^T\right),
+ + \mu\sum_{i\in S} x_i\T{x_i}\right),
% & \defeq \frac{1}{2}\log\det A(S)
\end{align}
where $\mu = \sigma_1^2/\sigma_0^2$.
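A direct transcription of \eqref{vs}, assuming numpy; it is the same log-determinant as in the main.tex sketches above, with the extra factor 1/2.

import numpy as np

def value_of_information(S, X, mu):
    """V(S) = 1/2 * log det(I_d + mu * sum_{i in S} x_i x_i^T)."""
    d = X.shape[1]
    A_S = np.eye(d) + mu * X[list(S)].T @ X[list(S)]
    return 0.5 * np.linalg.slogdet(A_S)[1]

# e.g. with mu = sigma1**2 / sigma0**2 and X from the sketch above:
# value_of_information([0, 3, 7], X, mu)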
@@ -40,7 +40,7 @@ Some intuition behind the Gaussian prior on $\beta$ and the role that parameter
It is easy to show that, under the linearity assumption \eqref{model} and the Gaussian prior on $\beta$, maximum a posteriori estimation leads to the following minimization
problem:
\begin{displaymath}
- \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+ \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
+ \frac{1}{\mu}\sum_i \norm{\beta}_2^2
\end{displaymath}
This optimization, commonly known as \emph{ridge regression}, reduces to a least squares fit for $\mu=\infty$. For finite $\mu$, ridge regression
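Since the objective above is quadratic in beta, the estimate has a standard closed form; a sketch assuming numpy and a penalty of (1/mu)*||beta||_2^2.

import numpy as np

def ridge_map_estimate(X, y, mu):
    """beta_hat = argmin ||y - X beta||^2 + (1/mu) ||beta||^2
                = (X^T X + (1/mu) I_d)^{-1} X^T y."""
    d = X.shape[1]
    return np.linalg.solve(X.T @ X + np.eye(d) / mu, X.T @ y)

# As mu grows the penalty vanishes and the estimate approaches the least-squares fit.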
@@ -55,18 +55,17 @@ problem. Explain the goals:
\item individually rational
\item budget feasible
\item has a good approximation ratio
-
-TODO Explain what is already known: it is ok when the function is submodular. When
-should we introduce the notion of submodularity?
\end{itemize}
+\thibaut{TODO Explain what is already known: it is ok when the function is submodular. When
+should we introduce the notion of submodularity?}
\begin{comment}
\subsection{Linear and Ridge Regression}
Linear regression, a true workhorse of statistical analysis, fits a linear function to a set of observable input and output variables. In particular, consider a set of $n$ value pairs $(x_i,y_i)$, $i\in \mathcal{N} =\{1,\ldots,n\}$. The variables $x_i\in \reals^d$ are usually referred to as \emph{input} variables or \emph{covariates} and $y_i\in \reals$ are referred to as \emph{output} or \emph{response} variables. For example, the former could be the age, weight, or height of a person, while the latter can be their propensity to contract a disease. Assume that input and output variables are related through the following relationship:
\begin{displaymath}
- y_i = \beta^* x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
+ y_i = \T{\beta} x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
\end{displaymath}
where $\beta\in\reals^d$, is called the \emph{model parameter vector} and $\varepsilon_i\in\reals$ are independent, normally distributed random variables, with zero mean and variance $\sigma^2$.
@@ -85,7 +84,7 @@ multivariate normal distribution of mean zero and covariance matrix $\kappa
I_d$. Under this assumption, maximum a posteriori estimation leads to the following maximization
problem, commonly known as \emph{ridge regression}:
\begin{displaymath}
- \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+ \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
+ \frac{1}{\mu}\sum_i \norm{\beta}_2^2
\end{displaymath}
where
@@ -102,16 +101,16 @@ acts as a sort of ``Occam's razor'', favoring a \emph{parsimonious} model for $\
Throughout the paper, we will make use of the following notations: if $x$ is
-a (column) vector in $\reals^d$, $x^*$ denotes its transposed (line)
+a (column) vector in $\reals^d$, $\T{x}$ denotes its transpose, a row
vector. Thus, the standard inner product between two vectors $x$ and $y$ is
-simply $x^* y$. $\norm{x}_2 = x^*x$ will denote the $L_2$ norm of $x$.
+simply $\T{x}y$. $\norm{x}_2 = \sqrt{\T{x}x}$ will denote the $L_2$ norm of $x$.
We will also often use the following order over symmetric matrices: if $A$ and
$B$ are two $d\times d$ real symmetric matrices, we
write that $A\leq B$ iff:
\begin{displaymath}
\forall x\in\reals^d,\quad
- x^*Ax \leq x^*Bx
+ \T{x}Ax \leq \T{x}Bx
\end{displaymath}
That is, iff $B-A$ is symmetric positive semi-definite.
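Numerically, the order just defined can be tested through the spectrum of B - A; a small sketch assuming numpy.

import numpy as np

def loewner_leq(A, B, tol=1e-10):
    """A <= B iff x^T A x <= x^T B x for all x, i.e. B - A is positive semi-definite."""
    return bool(np.linalg.eigvalsh(B - A).min() >= -tol)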
@@ -127,7 +126,7 @@ definite positive matrices.
Linear regression, a true workhorse of statistical analysis, fits a linear function to a set of observable input and output variables. In particular, consider a set of $n$ value pairs $(x_i,y_i)$, $i\in \mathcal{N} =\{1,\ldots,n\}$. The variables $x_i\in \reals^d$ are usually referred to as \emph{input} variables or \emph{covariates} and $y_i\in \reals$ are referred to as \emph{output} or \emph{response} variables. For example, the former could be the age, weight, or height of a person, while the latter can be their propensity to contract a disease. Assume that input and output variables are related through the following relationship:
\begin{displaymath}
- y_i = \beta^* x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
+ y_i = \T{\beta} x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
\end{displaymath}
where $\beta\in\reals^d$, is called the \emph{model parameter vector} and $\varepsilon_i\in\reals$ are independent, normally distributed random variables, with zero mean and variance $\sigma^2$.
@@ -136,9 +135,9 @@ The purpose of linear regression is to recover the vector $\beta$ upon the obser
-In the absense of any additional information, a natural method for estimating $\beta$ is through a least squares fit. However, in a more general setup,
+In the absence of any additional information, a natural method for estimating $\beta$ is through a least squares fit. However, in a more general setup,
a prior knowledge about $\beta$, captured by a distribution over
-$\reals^d$, can also be taken into account. In this setup, after observing the data, $\beta$ can be retreived through \emph{maximum a posteriori estimation}: computing the parameters which maximize the
+$\reals^d$, can also be taken into account. In this setup, after observing the data, $\beta$ can be retrieved through \emph{maximum a posteriori estimation}: computing the parameters which maximize the
posterior distribution of $\beta$ given the observations.
A common prior assumption for linear regression is that $\beta$ follows a
@@ -146,7 +145,7 @@ multivariate normal distribution of mean zero and covariance matrix $\kappa
I_d$. Under this assumption, maximum a posteriori estimation leads to the following maximization
problem, commonly known as \emph{ridge regression}:
\begin{displaymath}
- \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+ \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
+ \frac{1}{\mu}\sum_i \norm{\beta}_2^2
\end{displaymath}
where
@@ -184,7 +183,7 @@ experimenter has not seen the data yet, but he wants to know which users he
should be selecting. His goal is to learn the model underlying the data. Here,
we assume a linear model:
\begin{displaymath}
- \forall i\in\mathcal{N},\quad y_i = \beta^* x_i + \varepsilon_i
+ \forall i\in\mathcal{N},\quad y_i = \T{\beta}x_i + \varepsilon_i
\end{displaymath}
where $\beta\in\reals^d$ and $\varepsilon_i\in\reals$ follows a normal
distribution of mean $0$ and variance $\sigma^2$. Furthermore, we assume the
@@ -203,7 +202,7 @@ a multivariate normal distribution of mean zero and covariance matrix $\kappa
I_d$. Maximum a posteriori estimation leads to the following maximization
problem:
\begin{displaymath}
- \beta_{\text{max}} = \argmax_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+ \beta_{\text{max}} = \argmax_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
+ \frac{1}{\mu}\sum_i \norm{\beta}_2^2
\end{displaymath}
which is the well-known \emph{ridge regression}. $\mu
@@ -241,17 +240,17 @@ where $Y_S = \{y_i,\,i\in S\}$ is the set of observed data.
\begin{align*}
\forall S\subset\mathcal{N},\; V(S)
& = \frac{1}{2}\log\det\left(I_d
- + \mu\sum_{i\in S} x_ix_i^*\right)\\
+ + \mu\sum_{i\in S} x_i\T{x_i}\right)\\
& \defeq \frac{1}{2}\log\det A(S)
\end{align*}
\end{theorem}
\begin{proof}
-Let us denote by $X_S$ the matrix whose rows are the vectors $(x_i^*)_{i\in
+Let us denote by $X_S$ the matrix whose rows are the vectors $(\T{x_i})_{i\in
S}$. Observe that $A_S$ can simply be written as:
\begin{displaymath}
- A_S = I_d + \mu X_S^* X_S
+ A_S = I_d + \mu \T{X_S} X_S
\end{displaymath}
Let us recall that the entropy of a multivariate normal variable $B$ over
@@ -275,19 +274,19 @@ distribution of mean $X\beta$ and of covariance matrix $\sigma^2 I_n$. Hence:
$(Y_S)$ also follows a multivariate normal distribution of mean zero. Let us
compute its covariance matrix, $\Sigma_Y$:
\begin{align*}
- \Sigma_Y & = \expt{YY^*} = \expt{(X_S\beta + E)(X_S\beta + E)^*}\\
- & = \kappa X_S X_S^* + \sigma^2I_n
+ \Sigma_Y & = \expt{Y\T{Y}} = \expt{(X_S\beta + E)\T{(X_S\beta + E)}}\\
+ & = \kappa X_S \T{X_S} + \sigma^2I_n
\end{align*}
Thus, we get that:
\begin{equation}\label{eq:h2}
\mathbb{H}(Y_S)
- = \frac{1}{2}\log\left((2\pi e)^n \det(\kappa X_S X_S^* + \sigma^2 I_n)\right)
+ = \frac{1}{2}\log\left((2\pi e)^n \det(\kappa X_S \T{X_S} + \sigma^2 I_n)\right)
\end{equation}
Combining \eqref{eq:h1} and \eqref{eq:h2} we get:
\begin{displaymath}
V(S) = \frac{1}{2}\log\det\left(I_n+\frac{\kappa}{\sigma^2}X_S
- X_S^*\right)
+ \T{X_S}\right)
\end{displaymath}
Finally, we can use Sylvester's determinant theorem to get the result.
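The appeal to Sylvester's determinant theorem, det(I_n + c X_S X_S^T) = det(I_d + c X_S^T X_S), is easy to check numerically; a sketch with arbitrary dimensions, assuming numpy.

import numpy as np

rng = np.random.default_rng(1)
X_S = rng.standard_normal((8, 3))        # |S| = 8 rows x_i^T, d = 3
c = 0.7                                  # plays the role of kappa / sigma^2
lhs = np.linalg.slogdet(np.eye(8) + c * X_S @ X_S.T)[1]
rhs = np.linalg.slogdet(np.eye(3) + c * X_S.T @ X_S)[1]
assert np.isclose(lhs, rhs)              # both sides agree, as the theorem states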
@@ -300,7 +299,7 @@ We have the following lemma.
\begin{lemma}[Marginal contribution]
\begin{displaymath}
\Delta_i V(S)\defeq V(S\cup\{i\}) - V(S)
- = \frac{1}{2}\log\left(1 + \mu x_i^*A(S)^{-1}x_i\right)
+ = \frac{1}{2}\log\left(1 + \mu \T{x_i} A(S)^{-1}x_i\right)
\end{displaymath}
\end{lemma}
@@ -308,10 +307,10 @@ We have the following lemma.
We have:
\begin{align*}
V(S\cup\{i\}) & = \frac{1}{2}\log\det A(S\cup\{i\})\\
- & = \frac{1}{2}\log\det\left(A(S) + \mu x_i x_i^*\right)\\
+ & = \frac{1}{2}\log\det\left(A(S) + \mu x_i \T{x_i}\right)\\
& = V(S) + \frac{1}{2}\log\det\left(I_d + \mu A(S)^{-1}x_i
- x_i^*\right)\\
- & = V(S) + \frac{1}{2}\log\left(1 + \mu x_i^* A(S)^{-1}x_i\right)
+ \T{x_i}\right)\\
+ & = V(S) + \frac{1}{2}\log\left(1 + \mu \T{x_i} A(S)^{-1}x_i\right)
\end{align*}
where the last equality comes from Sylvester's determinant formula.
\end{proof}
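The marginal-contribution formula can likewise be checked against the definition of V; a sketch assuming numpy and the value_of_information function sketched above.

import numpy as np

def marginal_contribution(i, S, X, mu):
    """Delta_i V(S) = 1/2 * log(1 + mu * x_i^T A(S)^{-1} x_i)."""
    d = X.shape[1]
    A_S = np.eye(d) + mu * X[list(S)].T @ X[list(S)]
    return 0.5 * np.log1p(mu * X[i] @ np.linalg.solve(A_S, X[i]))

# For i not in S this should agree with
# value_of_information(list(S) + [i], X, mu) - value_of_information(S, X, mu).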