-rw-r--r--  definitions.tex |  2
-rw-r--r--  main.tex        | 76
-rw-r--r--  problem.tex     | 57
3 files changed, 68 insertions, 67 deletions
diff --git a/definitions.tex b/definitions.tex
index 378781d..e603e5a 100644
--- a/definitions.tex
+++ b/definitions.tex
@@ -19,3 +19,5 @@
 \DeclareMathOperator*{\argmin}{arg\,min}
 \newcommand{\reals}{\ensuremath{\mathbbm{R}}}
 \newcommand{\stratis}[1]{\textcolor{red}{Stratis: #1}}
+\newcommand{\thibaut}[1]{\textcolor{blue}{Thibaut: #1}}
+\newcommand{\T}[1]{#1^T}
diff --git a/main.tex b/main.tex
--- a/main.tex
+++ b/main.tex
@@ -37,7 +37,7 @@ Here, we use a relaxation of the objective function which is tailored to the
 problem of ridge regression. We define:
 \begin{displaymath}
   \forall\lambda\in[0,1]^{|\mathcal{N}|},\quad L_{\mathcal{N}}(\lambda) \defeq
-  \log\det\left(I_d + \mu\sum_{i\in\mathcal{N}} \lambda_i x_i x_i^*\right)
+  \log\det\left(I_d + \mu\sum_{i\in\mathcal{N}} \lambda_i x_i \T{x_i}\right)
 \end{displaymath}
 
 We can now present the mechanism we use, which has the same flavor as Chen's and
@@ -47,7 +47,7 @@ Singer's
 \caption{Mechanism for ridge regression}
 \begin{algorithmic}[1]
   \State $i^* \gets \argmax_{j\in\mathcal{N}}V(j)$
-  \State $x^* \gets \argmax_{x\in[0,1]^n} \{L_{\mathcal{N}\setminus\{i^*\}}(x)
+  \State $x^* \gets \argmax_{x\in[0,1]^{|\mathcal{N}|}} \{L_{\mathcal{N}\setminus\{i^*\}}(x)
   \,|\, c(x)\leq B\}$
   \Statex
   \If{$L(x^*) < CV(i^*)$}
@@ -118,7 +118,7 @@ The mechanism is monotone.
   \end{displaymath}
   in an earlier iteration of the greedy heuristic. Let us denote by $S_i$
   (resp. $S_i'$) the set to which $i$ is added when reporting cost $c_i$
-  (resp. $c_i'$). We have $S_i'\subset S_i$. Moreover:
+  (resp. $c_i'$). We have $S_i'\subseteq S_i$. Moreover:
   \begin{align*}
     c_i' & \leq c_i
     \leq \frac{B}{2}\frac{V(S_i\cup\{i\})-V(S_i)}{V(S_i\cup\{i\})}\\
@@ -245,30 +245,30 @@ of the objective function and thus ensures a constant approximation of the
 value function. The difficulty resides in showing that the ratio of the two
 relaxations is bounded.
 
-We say that $R_\mathcal{N}:[0,1]^n\rightarrow\mathbf{R}$ is a relaxation of the
+We say that $R_\mathcal{N}:[0,1]^{|\mathcal{N}|}\rightarrow\reals$ is a relaxation of the
 value function $V$ over $\mathcal{N}$ if it coincides with $V$ at binary
-points. Formally, for any $S\subset\mathcal{N}$, let $\mathbf{1}_S$ denote the
+points. Formally, for any $S\subseteq\mathcal{N}$, let $\mathbf{1}_S$ denote the
 indicator vector of $S$. $R_\mathcal{N}$ is a relaxation of $V$ over
 $\mathcal{N}$ iff:
 \begin{displaymath}
-  \forall S\subset\mathcal{N},\; R_\mathcal{N}(\mathbf{1}_S) = V(S)
+  \forall S\subseteq\mathcal{N},\; R_\mathcal{N}(\mathbf{1}_S) = V(S)
 \end{displaymath}
 We can extend the optimisation problem defined above to a relaxation by
 extending the cost function:
 \begin{displaymath}
-  \forall \lambda\in[0,1]^n,\; c(\lambda)
+  \forall \lambda\in[0,1]^{|\mathcal{N}|},\; c(\lambda)
   = \sum_{i\in\mathcal{N}}\lambda_ic_i
 \end{displaymath}
 The optimisation problem becomes:
 \begin{displaymath}
   OPT(R_\mathcal{N}, B) =
-  \max_{\lambda\in[0,1]^n}\left\{R_\mathcal{N}(\lambda)\,|\, c(\lambda)\leq B\right\}
+  \max_{\lambda\in[0,1]^{|\mathcal{N}|}}\left\{R_\mathcal{N}(\lambda)\,|\, c(\lambda)\leq B\right\}
 \end{displaymath}
 
 The relaxations we will consider here rely on defining a probability
 distribution over subsets of $\mathcal{N}$.
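As a reading aid for the hunk above: the mechanism's second step maximises the concave relaxation $L_{\mathcal{N}}(\lambda)$ subject to the relaxed budget constraint $c(\lambda)\leq B$. Below is a minimal numpy sketch of that objective and constraint; it is editorial and not part of the patch, and the names (X, costs, B, mu) and sample data are purely illustrative.

import numpy as np

def concave_relaxation(lam, X, mu):
    """L_N(lambda) = log det(I_d + mu * sum_i lambda_i * x_i x_i^T)."""
    d = X.shape[1]
    A_tilde = np.eye(d) + mu * (X.T * lam) @ X  # columns of X.T are the x_i
    return np.linalg.slogdet(A_tilde)[1]        # A_tilde is positive definite

def within_budget(lam, costs, B):
    """Relaxed budget constraint c(lambda) = sum_i lambda_i c_i <= B."""
    return float(costs @ lam) <= B

rng = np.random.default_rng(0)
n, d, mu, B = 6, 3, 2.0, 10.0             # illustrative sizes, not from the paper
X = rng.normal(size=(n, d))               # row i is the feature vector x_i
costs = rng.uniform(1.0, 4.0, size=n)     # reported costs c_i
lam = rng.uniform(size=n)                 # a fractional point of the relaxed domain
print(concave_relaxation(lam, X, mu), within_budget(lam, costs, B))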
-Let $\lambda\in[0,1]^n$, let us define:
+Let $\lambda\in[0,1]^{|\mathcal{N}|}$, let us define:
 \begin{displaymath}
   P_\mathcal{N}^\lambda(S) = \prod_{i\in S}\lambda_i
   \prod_{i\in\mathcal{N}\setminus S}(1-\lambda_i)
@@ -284,17 +284,17 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
     \begin{align*}
       F_\mathcal{N}(\lambda) & =
       \mathbb{E}_{S\sim P_\mathcal{N}^\lambda}\big[\log\det A(S)\big]\\
-      & = \sum_{S\subset\mathcal{N}} P_\mathcal{N}^\lambda(S) V(S)\\
-      & = \sum_{S\subset\mathcal{N}} P_\mathcal{N}^\lambda(S) \log\det A(S)\\
+      & = \sum_{S\subseteq\mathcal{N}} P_\mathcal{N}^\lambda(S) V(S)\\
+      & = \sum_{S\subseteq\mathcal{N}} P_\mathcal{N}^\lambda(S) \log\det A(S)\\
     \end{align*}
   \item the \emph{concave relaxation} of $V$:
     \begin{align*}
       L_{\mathcal{N}}(\lambda) & =
       \log\det \mathbb{E}_{S\sim P_\mathcal{N}^\lambda}\big[A(S)\big]\\
-      & = \log\det\left(\sum_{S\subset N}
+      & = \log\det\left(\sum_{S\subseteq\mathcal{N}}
       P_\mathcal{N}^\lambda(S)A(S)\right)\\
       & = \log\det\left(I_d + \mu\sum_{i\in\mathcal{N}}
-      \lambda_ix_ix_i^*\right)\\
+      \lambda_ix_i\T{x_i}\right)\\
       & \defeq \log\det \tilde{A}(\lambda)
     \end{align*}
 \end{itemize}
@@ -315,8 +315,8 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
 \end{proof}
 
 \begin{lemma}[Rounding]\label{lemma:rounding}
-  For any feasible $\lambda\in[0,1]^n$, there exists a feasible
-  $\bar{\lambda}\in[0,1]^n$ such that at most one of its component is
+  For any feasible $\lambda\in[0,1]^{|\mathcal{N}|}$, there exists a feasible
+  $\bar{\lambda}\in[0,1]^{|\mathcal{N}|}$ such that at most one of its components is
   fractional, that is, lies in $(0,1)$, and:
   \begin{displaymath}
     F_{\mathcal{N}}(\lambda)\leq F_{\mathcal{N}}(\bar{\lambda})
@@ -373,7 +373,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
 \begin{lemma}\label{lemma:relaxation-ratio}
   The following inequality holds:
   \begin{displaymath}
-    \forall\lambda\in[0,1]^n,\;
+    \forall\lambda\in[0,1]^{|\mathcal{N}|},\;
     \frac{\log\big(1+\mu\big)}{2\mu}
     \,L_\mathcal{N}(\lambda)\leq F_\mathcal{N}(\lambda)\leq
     L_{\mathcal{N}}(\lambda)
@@ -422,9 +422,9 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
   Hence:
   \begin{multline*}
     \partial_i F_\mathcal{N} =
-    \sum_{\substack{S\subset\mathcal{N}\\ i\in S}}
+    \sum_{\substack{S\subseteq\mathcal{N}\\ i\in S}}
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S\setminus\{i\})V(S)\\
-    - \sum_{\substack{S\subset\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
+    - \sum_{\substack{S\subseteq\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S)V(S)\\
   \end{multline*}
@@ -432,9 +432,9 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
   $S'\cup\{i\}$, we can write:
   \begin{multline*}
     \partial_i F_\mathcal{N} =
-    \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+    \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S)V(S\cup\{i\})\\
-    - \sum_{\substack{S\subset\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
+    - \sum_{\substack{S\subseteq\mathcal{N}\\ i\in \mathcal{N}\setminus S}}
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S)V(S)\\
   \end{multline*}
@@ -442,50 +442,50 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
   $S$:
   \begin{displaymath}
     \partial_i F_\mathcal{N}(\lambda) =
-    \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+    \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S)
-    \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)
+    \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)
   \end{displaymath}
 
   The computation of the derivative of $L_\mathcal{N}$ uses standard matrix
   calculus and gives:
   \begin{displaymath}
     \partial_i L_\mathcal{N}(\lambda)
-    = \mu x_i^* \tilde{A}(\lambda)^{-1}x_i
+    = \mu \T{x_i}\tilde{A}(\lambda)^{-1}x_i
   \end{displaymath}
 
   Using the following inequalities:
   \begin{gather*}
-    \forall S\subset\mathcal{N}\setminus\{i\},\quad
+    \forall S\subseteq\mathcal{N}\setminus\{i\},\quad
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S)\geq
    P_{\mathcal{N}\setminus\{i\}}^\lambda(S\cup\{i\})\\
-    \forall S\subset\mathcal{N},\quad P_{\mathcal{N}\setminus\{i\}}^\lambda(S)
+    \forall S\subseteq\mathcal{N},\quad P_{\mathcal{N}\setminus\{i\}}^\lambda(S)
    \geq P_\mathcal{N}^\lambda(S)\\
-    \forall S\subset\mathcal{N},\quad A(S)^{-1} \geq A(S\cup\{i\})^{-1}\\
+    \forall S\subseteq\mathcal{N},\quad A(S)^{-1} \geq A(S\cup\{i\})^{-1}\\
   \end{gather*}
   we get:
   \begin{align*}
     \partial_i F_\mathcal{N}(\lambda)
-    & \geq \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+    & \geq \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
    P_\mathcal{N}^\lambda(S)
-    \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)\\
+    \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)\\
    & \geq \frac{1}{2}
-    \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+    \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
    P_\mathcal{N}^\lambda(S)
-    \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)\\
+    \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)\\
    &\hspace{-3.5em}+\frac{1}{2}
-    \sum_{\substack{S\subset\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
+    \sum_{\substack{S\subseteq\mathcal{N}\\ i\in\mathcal{N}\setminus S}}
    P_\mathcal{N}^\lambda(S\cup\{i\})
-    \log\Big(1 + \mu x_i^*A(S\cup\{i\})^{-1}x_i\Big)\\
+    \log\Big(1 + \mu \T{x_i}A(S\cup\{i\})^{-1}x_i\Big)\\
    &\geq \frac{1}{2}
-    \sum_{S\subset\mathcal{N}}
+    \sum_{S\subseteq\mathcal{N}}
    P_\mathcal{N}^\lambda(S)
-    \log\Big(1 + \mu x_i^*A(S)^{-1}x_i\Big)\\
+    \log\Big(1 + \mu \T{x_i}A(S)^{-1}x_i\Big)\\
   \end{align*}
 
   Using that $A(S)\geq I_d$, we get that:
   \begin{displaymath}
-    \mu x_i^*A(S)^{-1}x_i \leq \mu
+    \mu \T{x_i}A(S)^{-1}x_i \leq \mu
   \end{displaymath}
 
   Moreover:
@@ -498,7 +498,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
   \begin{displaymath}
     \partial_i F_\mathcal{N}(\lambda) \geq \frac{\log\big(1+\mu\big)}{2\mu}
-    x_i^*\bigg(\sum_{S\subset\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)^{-1}\bigg)x_i
+    \T{x_i}\bigg(\sum_{S\subseteq\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)^{-1}\bigg)x_i
   \end{displaymath}
 
   Finally, using that the inverse is a matrix convex function over symmetric
@@ -506,7 +506,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
   \begin{align*}
     \partial_i F_\mathcal{N}(\lambda)
    &\geq \frac{\log\big(1+\mu\big)}{2\mu}
-    x_i^*\bigg(\sum_{S\subset\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)\bigg)^{-1}x_i\\
+    \T{x_i}\bigg(\sum_{S\subseteq\mathcal{N}}P_\mathcal{N}^\lambda(S)A(S)\bigg)^{-1}x_i\\
    & \geq \frac{\log\big(1+\mu\big)}{2\mu} \partial_i L_\mathcal{N}(\lambda)
   \end{align*}
@@ -515,7 +515,7 @@ We will consider two relaxations of the value function $V$ over $\mathcal{N}$:
 We can now prove lemma TODO from the previous section.
 
 \begin{proof}
-  Let us consider a feasible point $\lambda^*\in[0,1]^n$ such that $L_\mathcal{N}(\lambda^*)
+  Let us consider a feasible point $\lambda^*\in[0,1]^{|\mathcal{N}|}$ such that $L_\mathcal{N}(\lambda^*)
   = OPT(L_\mathcal{N}, B)$.
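The derivative comparison above is what yields the sandwich bound $\frac{\log(1+\mu)}{2\mu}\,L_\mathcal{N}(\lambda)\leq F_\mathcal{N}(\lambda)\leq L_\mathcal{N}(\lambda)$ of the relaxation-ratio lemma. The editorial sketch below (not part of the patch) spot-checks it on a tiny instance, computing $F_\mathcal{N}$ exactly by enumerating subsets; feature vectors are normalised to unit norm, matching the $A(S)\geq I_d$ step, and all names and data are illustrative.

import itertools
import numpy as np

def log_det_A(S, X, mu):
    """log det A(S) with A(S) = I_d + mu * sum_{i in S} x_i x_i^T."""
    d = X.shape[1]
    XS = X[list(S)]
    return np.linalg.slogdet(np.eye(d) + mu * XS.T @ XS)[1]

def multilinear_F(lam, X, mu):
    """F_N(lambda): exact expectation of log det A(S) under P_N^lambda."""
    n = len(lam)
    subsets = itertools.chain.from_iterable(
        itertools.combinations(range(n), k) for k in range(n + 1))
    return sum(
        np.prod([lam[i] if i in S else 1.0 - lam[i] for i in range(n)])
        * log_det_A(S, X, mu)
        for S in subsets)

def concave_L(lam, X, mu):
    """L_N(lambda) = log det(I_d + mu * sum_i lambda_i x_i x_i^T)."""
    d = X.shape[1]
    return np.linalg.slogdet(np.eye(d) + mu * (X.T * lam) @ X)[1]

rng = np.random.default_rng(1)
n, d, mu = 5, 3, 2.0                               # illustrative sizes
X = rng.normal(size=(n, d))
X /= np.linalg.norm(X, axis=1, keepdims=True)      # ||x_i|| <= 1, as assumed above
for _ in range(3):
    lam = rng.uniform(size=n)
    F, L = multilinear_F(lam, X, mu), concave_L(lam, X, mu)
    print(np.log(1 + mu) / (2 * mu) * L <= F <= L)  # expected: True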
  By applying lemma~\ref{lemma:relaxation-ratio} and lemma~\ref{lemma:rounding}
  we get a feasible point $\bar{\lambda}$ with at most one fractional component
  such that:
diff --git a/problem.tex b/problem.tex
index 3faef3b..75529ff 100644
--- a/problem.tex
+++ b/problem.tex
@@ -15,7 +15,7 @@ For example, the features could be the age, weight, or height of user $i$, while
 The variables $x_i$ and $y_i$ are related through the following relationship:
 \begin{align}
-  y_i = \beta^T x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},\label{model}
+  y_i = \T{\beta} x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},\label{model}
 \end{align}
 where $\beta\in\reals^d$ is the unknown parameter and $\varepsilon_i\in\reals$ are independent, normally distributed random variables, with zero mean and variance $\sigma^2_0$. The experimenter wishes to learn $\beta$ by observing the undisclosed $y_i$ of a subset of users $i\in S$. Hence, the set of possible experiments comprises all random variables $y_{S}\in\reals^{|S|}$, $S\subseteq \mathcal{N}$.
@@ -27,11 +27,11 @@ Learning $\beta$ has many interesting applications, that make linear regression
 In the Bayesian setting, it is commonly assumed that $\beta$ follows a
 multivariate normal distribution of mean zero and covariance matrix $\sigma_1^2
-I_d$. Under this prior and the linear model \eqref{model}, the value of information \eqref{voi} of an experiment $Y_S$ is given by \cite{...}
+I_d$. Under this prior and the linear model \eqref{model}, the value of information \eqref{voi} of an experiment $y_S$ is given by \cite{...}
 \begin{align}\label{vs}
   V(S) & \defeq I(\beta;y_S) = \frac{1}{2}\log\det\left(I_d
-  + \mu\sum_{i\in S} x_ix_i^T\right),
+  + \mu\sum_{i\in S} x_i\T{x_i}\right),
   % & \defeq \frac{1}{2}\log\det A(S)
 \end{align}
 where $\mu = \sigma_1^2/\sigma_0^2$.
@@ -40,7 +40,7 @@ Some intuition behind the Gaussian prior on $\beta$ and the role that parameter
 It is easy to show that, under the linearity assumption \eqref{model} and the Gaussian prior on $\beta$, maximum a posteriori estimation leads to the following minimization problem:
 \begin{displaymath}
-  \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+  \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
   + \frac{1}{\mu}\sum_i \norm{\beta}_2^2
 \end{displaymath}
 This optimization, commonly known as \emph{ridge regression}, reduces to a least squares fit for $\mu=\infty$. For finite $\mu$, ridge regression
@@ -55,18 +55,17 @@ problem. Explain the goals:
 \item individually rational
 \item budget feasible
 \item has a good approximation ratio
-
-TODO Explain what is already known: it is ok when the function is submodular. When
-should we introduce the notion of submodularity?
 \end{itemize}
+\thibaut{TODO Explain what is already known: it is ok when the function is submodular. When
+should we introduce the notion of submodularity?}
 
 \begin{comment}
 \subsection{Linear and Ridge Regression}
 
 Linear regression, a true workhorse of statistical analysis, fits a linear function to a set of observable input and output variables. In particular, consider a set of $n$ value pairs $(x_i,y_i)$, $i\in \mathcal{N} =\{1,\ldots,n\}$. The variables $x_i\in \reals^d$ are usually referred to as \emph{input} variables or \emph{covariates} and $y_i\in \reals$ are referred to as \emph{output} or \emph{response} variables. For example, the former could be the age, weight, or height of a person, while the latter can be their propensity to contract a disease.
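For reference, the $\argmin$ displayed in the hunk above (squared loss plus the ridge penalty, read here as $\frac{1}{\mu}\norm{\beta}_2^2$) has the standard closed form $\hat{\beta}=(X^TX+\tfrac{1}{\mu}I_d)^{-1}X^Ty$. The sketch below is editorial, not part of the patch, and uses synthetic, illustrative data.

import numpy as np

def ridge_map_estimate(X, y, mu):
    """Closed form of argmin_beta ||y - X beta||^2 + (1/mu) ||beta||^2."""
    d = X.shape[1]
    return np.linalg.solve(X.T @ X + np.eye(d) / mu, X.T @ y)

rng = np.random.default_rng(2)
n, d, mu = 50, 4, 2.0                        # illustrative sizes
beta_true = rng.normal(size=d)
X = rng.normal(size=(n, d))                  # row i is x_i
y = X @ beta_true + 0.1 * rng.normal(size=n)
print(ridge_map_estimate(X, y, mu))          # close to beta_true for mild noise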
 Assume that input and output variables are related through the following relationship:
 \begin{displaymath}
-  y_i = \beta^* x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
+  y_i = \T{\beta} x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
 \end{displaymath}
 where $\beta\in\reals^d$ is called the \emph{model parameter vector} and $\varepsilon_i\in\reals$ are independent, normally distributed random variables, with zero mean and variance $\sigma^2$.
@@ -85,7 +84,7 @@ multivariate normal distribution of mean zero and covariance matrix $\kappa
 I_d$. Under this assumption, maximum a posteriori estimation leads to the
 following minimization problem, commonly known as \emph{ridge regression}:
 \begin{displaymath}
-  \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+  \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
   + \frac{1}{\mu}\sum_i \norm{\beta}_2^2
 \end{displaymath}
 where
@@ -102,16 +101,16 @@ acts as a sort of ``Occam's razor'', favoring a \emph{parsimonious} model for $\
 Throughout the paper, we will make use of the following notations: if $x$ is
-a (column) vector in $\reals^d$, $x^*$ denotes its transposed (line)
+a (column) vector in $\reals^d$, $\T{x}$ denotes its transposed (row)
 vector. Thus, the standard inner product between two vectors $x$ and $y$ is
-simply $x^* y$. $\norm{x}_2 = x^*x$ will denote the $L_2$ norm of $x$.
+simply $\T{x}y$. $\norm{x}_2 = \sqrt{\T{x}x}$ will denote the $L_2$ norm of $x$.
 
 We will also often use the following order over symmetric matrices: if $A$
 and $B$ are two $d\times d$ real symmetric matrices, we write that $A\leq B$ iff:
 \begin{displaymath}
   \forall x\in\reals^d,\quad
-  x^*Ax \leq x^*Bx
+  \T{x}Ax \leq \T{x}Bx
 \end{displaymath}
 That is, iff $B-A$ is symmetric positive semi-definite.
@@ -127,7 +126,7 @@ definite positive matrices.
 Linear regression, a true workhorse of statistical analysis, fits a linear function to a set of observable input and output variables. In particular, consider a set of $n$ value pairs $(x_i,y_i)$, $i\in \mathcal{N} =\{1,\ldots,n\}$. The variables $x_i\in \reals^d$ are usually referred to as \emph{input} variables or \emph{covariates} and $y_i\in \reals$ are referred to as \emph{output} or \emph{response} variables. For example, the former could be the age, weight, or height of a person, while the latter can be their propensity to contract a disease.
 
 Assume that input and output variables are related through the following relationship:
 \begin{displaymath}
-  y_i = \beta^* x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
+  y_i = \T{\beta} x_i + \varepsilon_i,\quad\forall i\in\mathcal{N},
 \end{displaymath}
 where $\beta\in\reals^d$ is called the \emph{model parameter vector} and $\varepsilon_i\in\reals$ are independent, normally distributed random variables, with zero mean and variance $\sigma^2$.
@@ -136,9 +135,9 @@ The purpose of linear regression is to recover the vector $\beta$ upon the obser
-In the absense of any additional information, a natural method for estimating $\beta$ is through a least squares fit. However, in a more general setup,
+In the absence of any additional information, a natural method for estimating $\beta$ is through a least squares fit. However, in a more general setup,
 prior knowledge about $\beta$, captured by a distribution over
-$\reals^d$, can also be taken into account.
-In this setup, after observing the data, $\beta$ can be retreived through \emph{maximum a posteriori estimation}: computing the parameters which maximize the
+$\reals^d$, can also be taken into account. In this setup, after observing the data, $\beta$ can be retrieved through \emph{maximum a posteriori estimation}: computing the parameters which maximize the
 posterior distribution of $\beta$ given the observations.
 
 A common prior assumption for linear regression is that $\beta$ follows a
@@ -146,7 +145,7 @@ multivariate normal distribution of mean zero and covariance matrix $\kappa
 I_d$. Under this assumption, maximum a posteriori estimation leads to the
 following minimization problem, commonly known as \emph{ridge regression}:
 \begin{displaymath}
-  \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+  \hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
   + \frac{1}{\mu}\sum_i \norm{\beta}_2^2
 \end{displaymath}
 where
@@ -184,7 +183,7 @@ experimenter has not seen the data yet, but he wants to know which users he
 should be selecting. His goal is to learn the model underlying the data.
 Here, we assume a linear model:
 \begin{displaymath}
-  \forall i\in\mathcal{N},\quad y_i = \beta^* x_i + \varepsilon_i
+  \forall i\in\mathcal{N},\quad y_i = \T{\beta}x_i + \varepsilon_i
 \end{displaymath}
 where $\beta\in\reals^d$ and $\varepsilon_i\in\reals$ follows a normal
 distribution of mean $0$ and variance $\sigma^2$. Furthermore, we assume the
@@ -203,7 +202,7 @@ a multivariate normal distribution of mean zero and covariance matrix $\kappa
 I_d$. Maximum a posteriori estimation leads to the following maximization
 problem:
 \begin{displaymath}
-  \beta_{\text{max}} = \argmax_{\beta\in\reals^d} \sum_i (y_i - \beta^*x_i)^2
+  \beta_{\text{max}} = \argmax_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
   + \frac{1}{\mu}\sum_i \norm{\beta}_2^2
 \end{displaymath}
 which is the well-known \emph{ridge regression}. $\mu
@@ -241,17 +240,17 @@ where $Y_S = \{y_i,\,i\in S\}$ is the set of observed data.
   \begin{align*}
     \forall S\subset\mathcal{N},\; V(S) & = \frac{1}{2}\log\det\left(I_d
-    + \mu\sum_{i\in S} x_ix_i^*\right)\\
+    + \mu\sum_{i\in S} x_i\T{x_i}\right)\\
     & \defeq \frac{1}{2}\log\det A(S)
   \end{align*}
 \end{theorem}
 
 \begin{proof}
-Let us denote by $X_S$ the matrix whose rows are the vectors $(x_i^*)_{i\in
+Let us denote by $X_S$ the matrix whose rows are the vectors $(\T{x_i})_{i\in
 S}$. Observe that $A_S$ can simply be written as:
 \begin{displaymath}
-  A_S = I_d + \mu X_S^* X_S
+  A_S = I_d + \mu \T{X_S} X_S
 \end{displaymath}
 
 Let us recall that the entropy of a multivariate normal variable $B$ over
@@ -275,19 +274,19 @@ distribution of mean $X\beta$ and of covariance matrix $\sigma^2 I_n$. Hence:
 $(Y_S)$ also follows a multivariate normal distribution of mean zero.
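A quick numerical check of the two ingredients of this proof: the value function $V(S)=\frac{1}{2}\log\det(I_d+\mu\sum_{i\in S}x_i\T{x_i})$ stated in the theorem above, and Sylvester's determinant identity $\det(I_{|S|}+\mu X_SX_S^T)=\det(I_d+\mu X_S^TX_S)$, which converts the $|S|\times|S|$ determinant into the $d\times d$ one in the next hunk. The sketch is editorial and the data illustrative.

import numpy as np

def V(S, X, mu):
    """V(S) = 1/2 log det(I_d + mu * sum_{i in S} x_i x_i^T)."""
    d = X.shape[1]
    XS = X[list(S)]
    return 0.5 * np.linalg.slogdet(np.eye(d) + mu * XS.T @ XS)[1]

def V_via_sylvester(S, X, mu):
    """Same quantity through the |S| x |S| determinant det(I + mu X_S X_S^T)."""
    XS = X[list(S)]
    return 0.5 * np.linalg.slogdet(np.eye(XS.shape[0]) + mu * XS @ XS.T)[1]

rng = np.random.default_rng(3)
X, mu, S = rng.normal(size=(8, 3)), 2.0, (0, 2, 5)         # illustrative data
print(np.isclose(V(S, X, mu), V_via_sylvester(S, X, mu)))  # expected: True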
 Let us compute its covariance matrix, $\Sigma_Y$:
 \begin{align*}
-  \Sigma_Y & = \expt{YY^*} = \expt{(X_S\beta + E)(X_S\beta + E)^*}\\
-  & = \kappa X_S X_S^* + \sigma^2I_n
+  \Sigma_Y & = \expt{Y\T{Y}} = \expt{(X_S\beta + E)\T{(X_S\beta + E)}}\\
+  & = \kappa X_S \T{X_S} + \sigma^2I_n
 \end{align*}
 
 Thus, we get that:
 \begin{equation}\label{eq:h2}
   \mathbb{H}(Y_S)
-  = \frac{1}{2}\log\left((2\pi e)^n \det(\kappa X_S X_S^* + \sigma^2 I_n)\right)
+  = \frac{1}{2}\log\left((2\pi e)^n \det(\kappa X_S \T{X_S} + \sigma^2 I_n)\right)
 \end{equation}
 
 Combining \eqref{eq:h1} and \eqref{eq:h2} we get:
 \begin{displaymath}
   V(S) = \frac{1}{2}\log\det\left(I_n+\frac{\kappa}{\sigma^2}X_S
-  X_S^*\right)
+  \T{X_S}\right)
 \end{displaymath}
 
 Finally, we can use Sylvester's determinant theorem to get the result.
@@ -300,7 +299,7 @@ We have the following lemma.
 \begin{lemma}[Marginal contribution]
   \begin{displaymath}
     \Delta_i V(S)\defeq V(S\cup\{i\}) - V(S)
-    = \frac{1}{2}\log\left(1 + \mu x_i^*A(S)^{-1}x_i\right)
+    = \frac{1}{2}\log\left(1 + \mu \T{x_i} A(S)^{-1}x_i\right)
   \end{displaymath}
 \end{lemma}
@@ -308,10 +307,10 @@ We have the following lemma.
   We have:
   \begin{align*}
     V(S\cup\{i\}) & = \frac{1}{2}\log\det A(S\cup\{i\})\\
-    & = \frac{1}{2}\log\det\left(A(S) + \mu x_i x_i^*\right)\\
+    & = \frac{1}{2}\log\det\left(A(S) + \mu x_i \T{x_i}\right)\\
     & = V(S) + \frac{1}{2}\log\det\left(I_d + \mu A(S)^{-1}x_i
-    x_i^*\right)\\
+    \T{x_i}\right)\\
     & = V(S) + \frac{1}{2}\log\left(1 + \mu \T{x_i} A(S)^{-1}x_i\right)
   \end{align*}
   where the last equality comes from Sylvester's determinant formula.
 \end{proof}
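Finally, an editorial spot-check (not part of the patch) of the marginal-contribution lemma above, $\Delta_iV(S)=\frac{1}{2}\log\left(1+\mu\,\T{x_i}A(S)^{-1}x_i\right)$, against a direct recomputation of $V(S\cup\{i\})-V(S)$; all data are illustrative.

import numpy as np

def A(S, X, mu):
    """A(S) = I_d + mu * sum_{i in S} x_i x_i^T."""
    d = X.shape[1]
    XS = X[list(S)]
    return np.eye(d) + mu * XS.T @ XS

def V(S, X, mu):
    return 0.5 * np.linalg.slogdet(A(S, X, mu))[1]

def marginal(i, S, X, mu):
    """Delta_i V(S) = 1/2 log(1 + mu * x_i^T A(S)^{-1} x_i)."""
    x = X[i]
    return 0.5 * np.log(1.0 + mu * x @ np.linalg.solve(A(S, X, mu), x))

rng = np.random.default_rng(4)
X, mu, S, i = rng.normal(size=(8, 3)), 2.0, (1, 4), 6       # illustrative data
print(np.isclose(V(S + (i,), X, mu) - V(S, X, mu), marginal(i, S, X, mu)))  # expected: True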
