statnotes/linear_model.tex at master · kwichmann/statnotes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
\documentclass[12pt, a4paper]{article}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{MnSymbol}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{definition}{Definition}[section]
\numberwithin{equation}{section}

\title{The Linear Model}
\author{Kristian Wichmann}

\begin{document}

\maketitle

The \textit{linear model} is a theoretical framework that unifies a number of statistical concepts, like ANOVA and regression.

\section{Derivatives and linear algebra}
We will need a few results concerning derivatives of linear algebra expressions. Consider a linear function:
\begin{equation}
f: \mathbb{R}^n\rightarrow\mathbb{R}, f(\beta)=a^t\beta=
\begin{pmatrix}
a_1 & a_2 & \cdots & a_n
\end{pmatrix}
\begin{pmatrix}
\beta_1 \\ \beta_2 \\ \vdots \\ \beta_n
\end{pmatrix}
\end{equation}
Here, $a\in\mathbb{R}^{n\times 1}$ and $\beta\in\mathbb{R}^{n\times 1}$, so in other words:
\begin{equation}
f(\beta)=a_1\beta_1+a_2\beta_2+\cdots a_n\beta_n
\end{equation}
The (multidimensional) derivative is therefore:
\begin{equation}
\label{scalarproductdif}
\frac{\partial f}{\partial \beta}=\nabla_\beta f=
\begin{pmatrix}
a_1 \\ a_2 \\ \vdots \\ a_n
\end{pmatrix}
=a
\end{equation}
If $A\in\mathbb{R}^{n\times m}$ instead, each column in $a$ will map according to equation \ref{scalarproductdif} under differentiation, and so generalizes to:
\begin{equation}
\label{matrixdif}
\frac{\partial}{\partial\beta}A^t\beta = A
\end{equation}
Since $a^t\beta=\beta^t a$, by analogy this means that we also have:
\begin{equation}
\label{matrixdif2}
\frac{\partial}{\partial\beta}\beta^t A = A
\end{equation}
Similarly, consider a quadratic form in $\beta$:
\begin{equation}
g: \mathbb{R}^n\rightarrow\mathbb{R}, g(\beta)=\beta^t A\beta =
\begin{pmatrix}
\beta_1 & \beta_2 & \cdots & \beta_n
\end{pmatrix}
\begin{pmatrix}
a_{11} & a_{12} & \cdots & a_{1n} \\
a_{21} & a_{22} & \cdots & a_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{n1} & a_{n2} & \cdots & a_{nn}
\end{pmatrix}
\begin{pmatrix}
\beta_1 \\ \beta_2 \\ \vdots \\ \beta_n
\end{pmatrix}
\end{equation}
Here, $A\in\mathbb{R}^{n\times n}$ and $\beta\in\mathbb{R}^{n\times 1}$. Furthermore, $A$ is assumed to be symmetric, such that $a_{ij}=a_{ji}$. Multiplying out, this means that:
\begin{equation}
g(\beta)=\sum_{i=1}^n\sum_{j=1}^n\beta_i a_{ij}\beta_j
\end{equation}
Differentiating with respect to $\beta_k$ only terms where $i=k$ or $i=j$ will contribute. However, the case $i=j=k$ is distinct. So, when $i=k$ we get the contribution $a_{kj}\beta_j$. When $j=k$ we get $\beta_i a_{ij}$. And when $i=j=k$ we get $2a_{kk}\beta_k$. All in all, when summing up, we get two of each $a$-$\beta$ set (because of the symmetry of $A$). So:
\begin{equation}
\frac{\partial g}{\partial\beta_k}=2\sum_{i=1}^n a_{ik}\beta_i
\end{equation}
Or more compactly:
\begin{equation}
\label{quadformdif}
\frac{\partial g}{\partial \beta}=\nabla_\beta g=2A\beta
\end{equation}

\section{Ordinary least squares estimation (OLS)}

\subsection{Statement of the problem}
The general problem is this: We wish to model a linear relationship between a response variables $Y$ and $p$ predictor variables $X_1, X_2,\cdots X_p$. In other words:
\begin{equation}
\label{linear1}
Y=\beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p+\epsilon
\end{equation}
Here, the $\beta$'s are the coefficients corresponding to the $X$'s. The random term $\epsilon$ is known as the \textit{error term} and represents the deviations from the exact model. Since it is a random variable, so is $Y$. Now, assume that we have $n$ realizations (data points), so that $y_i$ corresponds to $x_{i1}, x_{i2},\cdots x_{ip}$. In matrix form equation $(\ref{linear1})$ now becomes:
\begin{equation}
\label{linear2}
y=X\beta+\epsilon
\end{equation}
Here, $y\in\mathbb{R}^{n\times 1}, X\in\mathbb{R}^{n\times p}$ and $\beta\in\mathbb{R}^{p\times 1}$:
\begin{equation}
y =
\begin{pmatrix}
	y_1 \\ y_2 \\ \vdots \\ y_n
\end{pmatrix},\quad
X =
\begin{pmatrix}
	x_{11} 	& x_{12} 	& \ldots	& x_{1p} \\
	x_{21} 	& x_{22} 	& \ldots	& x_{2p} \\
	\vdots	& \vdots	& \ddots	& \vdots \\
	x_{n1} 	& x_{n2} 	& \ldots	& x_{np} \\
\end{pmatrix},\quad
\beta =
\begin{pmatrix}
	\beta_1	\\ \beta_2	\\ \vdots\\	\beta_p
\end{pmatrix},\quad
\epsilon =
\begin{pmatrix}
	\epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n
\end{pmatrix}
\end{equation}
$X$ is known as the \textit{design matrix}. Given $y$ and $X$, we seek the best fit for $\beta$.

\subsection{Least squares}
There's a number of criteria one could use to pick the best fitting $\beta$. Here, we will search for the one that minimizes the square of the differences in predicted and actual $y$ values. When predicting, we can't include the error term, and so the predicted values for a set of parameters $\hat{\beta}$ are simply:
\begin{equation}
y=X\hat{\beta}
\end{equation}
The squared difference is:
\begin{align*}
||y-X\hat{\beta}||^2 &=(y-X\hat{\beta})^t(y-X\hat{\beta})=(y^t-\hat{\beta}^t X^t)(y-X\hat{\beta})\\
&=y^t y - y^t X\hat{\beta} - \hat{\beta}^t X^t y + \hat{\beta}^t X^t X\hat{\beta}
\end{align*}
Taking the derivative with respect to $\beta$ we can now use equations \ref{matrixdif}, \ref{matrixdif2}, and \ref{quadformdif} to yield:
\begin{equation}
-2X^t y + 2X^t X\hat{\beta}
\end{equation}
Since we're looking for a minimum, this vector should be equal to zero:
\begin{equation}
\label{normaleq}
-2X^t y + 2X^t X\hat{\beta}=0\Leftrightarrow\hat{\beta}=(X^t X)^{-1}X^t y
\end{equation}
Here it has been assumed that $X^t X$ is invertible. If $X^t X$ is not invertible, we have a case of \textit{perfect (multi)collinearity}. The equations in \ref{normaleq} are known as the \textit{normal equations} for the model. Inserting into equation $(\ref{linear2})$ we get the corresponding predicted $y$-values, also denoted by a hat:
\begin{equation}
\hat{y}=X\hat{\beta}=\underbrace{X(X^t X)^{-1}X^t}_{H}y
\end{equation}
The matrix $H=X(X^t X)^{-1}X^t$ is often called the \textit{hat matrix}, since it puts the hat on the $y$'s. The hat matrix can also be used to find \textit{residuals}, i.e. the difference between actual and predicted $y$-values:
\begin{equation}
e=y-\hat{y}=y-Hy=\underbrace{(I-H)}_{M} y
\end{equation}

\subsection{Properties of the OLS estimator}
First of all we note, that the estimation is a linear function of the $y$ values.

The estimated value of $\beta$ according to OLS is:
\begin{equation}
\hat{\beta}=(X^t X)^{-1}X^t y=(X^t X)^{-1}X^t(X\beta+\epsilon)
\end{equation}
Here, we have re-inserted equation \ref{linear2}. We may now consider the expected value:
\begin{equation}
\label{ols_var}
E[\hat{\beta}]=\beta+(X^t X)X^t E[\epsilon]
\end{equation}


\section{Geometric picture}
It is useful to adapt the picture of the columns of $X$ spanning a $p$-dimensional hyperplane in $n$-dimensional space. $y$ is then a vector, and $X\hat{\beta}$ is found by projecting $y$ onto the hyperplane; The corresponding point is exactly the one that minimizes the distance between $y$ (as a point) and the hyperplane.

\subsection{Projection operators}
A linear map that is symmetric and idempotent is called a \textit{projection}. A matrix corresponding to such a mapping is a projection matrix.

\begin{theorem}
The hat matrix $H$ is a projection matrix.
\end{theorem}
\begin{proof}
We need to show that $H$ is symmetric and idempotent. Symmetry:
\begin{equation}
X(X^t X)^{-1}X^t)^t=X\left[(X^t X)^{-1}\right]^t X^t
\end{equation}
But the transpose of an inverse is the same as the inverse of the transpose, so:
\begin{equation}
\left[(X^t X)^{-1}\right]^t=\left[(X^t X)^t\right]^{-1}=(X^t X)^{-1}
\end{equation}
This proves the symmetry of $H$. Idempotency:
\begin{equation}
H^2=\left[X(X^t X)^{-1}X^t\right]^2=X(X^t X)^{-1}X^tX(X^t X)^{-1}X^t=X(X^t X)^{-1}X^t=H
\end{equation}
\end{proof}

This also turns out to be true for the matrix used to find residuals:

\begin{theorem}
The matrix $M=I-H$ is a projection matrix.
\end{theorem}
\begin{proof}
Symmetry follows from the symmetry of $H$. Idempotency:
\begin{equation}
M^2=(I-H)^2=I^2+H^2-2H=I+H-2H=I-H=M
\end{equation}
\end{proof}

\section{The Gauss-Markov theorem}
So far, we have considered only the ordinary least squares (OLS) estimator of the vector $\beta$. But clearly it is not the only possibility. What is the justification for picking this particular estimator? The answer lies in the \textit{Gauss-Markov theorem}. According to this theorem, under certain conditions, the OLS is the best linear unbiased estimator. This is often abbreviated to \textit{BLUE}. Let's examine the meaning of this.

\subsection{Linear and unbiased}
We already know that the OLS estimator is linear in terms of the $y$'s.

Recall, that an estimator is \textit{unbiased} if its expectation value is the true value. Here it means:
\begin{equation}
E[\hat{\beta}]=\beta
\end{equation}
From equation \ref{ols_var} we know, that this is true exactly when the expectation value of $\epsilon$ is zero.

\subsection{'Best'}
In this context, "best" means having the smallest possible variance. We could express this by requiring every estimator element of the $\hat{\beta}$ vector to have a minimal variance. But we will go further than this: Let $\hat{\gamma}=\sum_{i=1}^p c_i\hat{\beta}_i=C\hat{\beta}$ be an arbitrary linear combination of the predictors. Then the variance of every such expression should be minimal. According to the usual rules of calculating variance:
\begin{equation}
\textrm{var}(\hat{\gamma})=\textrm{var}(C\hat{\beta})=C\textrm{var}(\hat{\beta})C^t
\end{equation}
Consider another estimator $\tilde{\beta}$ which has a covariance matrix of:
\begin{equation}
\textrm{var}(\tilde{\beta})=\textrm{var}(\hat{\beta})+\Delta
\end{equation}
Here $\Delta$ is the deviation from our proposed best estimator. The variance of a linear combination of the tilde estimator is:
\begin{equation}
\textrm{var}(\tilde{\gamma})=C\textrm{var}(\tilde{\beta})C^t=C(\textrm{var}(\hat{\beta})+\Delta)C^t=\textrm{var}(\tilde{\beta})+\underbrace{C\Delta C^t}
\end{equation}
Hence $\hat{\beta}$ is the best estimator if and only if the underbraced quantity is always positive, except when $C=0$. In other words, exactly when $\Delta$ is positive definite.

\subsection{The theorem}

\begin{theorem}
(Gauss-Markov) Given a linear model with design matrix $X$, responses $y$, and true parameters $\beta$. Assume the following three conditions for the error terms $\epsilon_i$ are met:
\begin{itemize}
\item The expected value is zero: $E[\epsilon_i]=0$.
\item The variance of the error terms are finite and constant: $\textrm{var}(\epsilon_i)=\sigma^2<\infty$. This is known as homoscedasticity.
\item The error terms are pairwise uncorrelated: $\textrm{cov}(\epsilon_i,\epsilon_j)=0, i\neq j$.
\end{itemize}
Then, the OLS estimator $\hat{\beta}=(X^t X)^{-1}X^t y$ is BLUE.
\end{theorem}

\begin{proof}
Let $\tilde{\beta}=Cy$ be another unbiased linear estimator of $\beta$. We may  then write the matrix $C$ as $(X^t X)^{-1}X^t+D$, where $D$ is the deviation from the OLS estimator. Then we may calculate the expected value:
\begin{equation}
E[\tilde{\beta}]=E[Cy]=E[(X^t X)^{-1}X^t+D)(X\beta+\epsilon)]=(X^t X)^{-1}X^t+D)(X\beta+E[\epsilon])
\end{equation}
By the first assumption this is:
\begin{equation}
((X^t X)^{-1}X^t+D)X\beta=(X^t X)^{-1}X^t X\beta+DX\beta=\beta+DX\beta
\end{equation}
Since $\tilde{\beta}$ is an unbiased estimator, we must have $DX=0$. Now, let's compute the variance:
\begin{equation}
\textrm{var}(\tilde{\beta})=\textrm{var}(Cy)=C\textrm{var}(y)C^t
\end{equation}
Here, we've used a property of variances. By the homoscedasticity assumptions, this is simply:
\begin{equation}
\sigma^2 CC^t=\sigma^2((X^t X)^{-1}X^t+D)((X^t X)^{-1}X^t+D)^t
\end{equation}
Since $X^t X$ is symmetric, so is the inverse, so $((X^t X)^{-1}X^t+D)^t=X(X^t X)^{-1}+D^t$. So we get:
\begin{equation}
\sigma^2((X^t X)^{-1}X^t+D)(X(X^t X)^{-1}+D^t)
\end{equation}
Ignoring the $\sigma^2$ factor for a while, this is:
\begin{equation}
(X^t X)^{-1}X^tX(X^t X)^{-1}+(X^t X)^{-1}X^tD^t+DX(X^t X)^{-1}+DD^t
\end{equation}
But since we just concluded $DX=0$ the two middle terms vanish (since $X^tD^t=(DX)^t=0$). So, reinstating the $\sigma^2$, the variance is
\begin{equation}
\textrm{var}(\tilde{\beta})=\sigma^2(X^t X)^{-1}+\sigma^2 DD^t
\end{equation}
The first term is what we would get without the $D$ term, and is therefore the variance of the OLS estimator. $DD^t$ is a positive definite matrix, and hence according to the section above, $\hat{\beta}$ is the least variance estimator.
\end{proof}

Note that no assumptions of independence, identical distribution or normality is assumed of the error terms.

\subsection{Omitted variable bias}
Assume we have forgotten, missed or simply not had access to a (set of) important predictor variables $z_1, z_2,\ldots, z_n$. The parameters corresponding to these are called $\gamma$, and we may now rewrite the model:
\begin{equation}
y=X\beta+\underbrace{Z\gamma+\delta}_{\epsilon}
\end{equation}
Here, $\delta$ is the error terms associated with the new variables. We will assume that since the missing variables are important/good, the expectation value of this error is zero: $E[\delta]=0$. The error term of the original model is now the underbraced part of the equation.

\section{Linear models in Euclidean space}
Let $V=\mathbb{R}^n$ with the usual inner product. A linear model on $V$ consists of the following basic ingredients:
\begin{itemize}
\item A subspace $L\subset V$ (notice the requirement for $L$ to be a proper subspace), known as the \textit{mean value subspace}. The dimension of $L$ is denoted $k$.
\item A symmetric, positive definite matrix $\Sigma$, which will act as the base covariance of the elements of the model.
\end{itemize}
The linear model then consists of all the regular normal distributions on $V$ where the mean value is in $L$, and the variance matrix is proportional to $\Sigma$:
\begin{equation}
X_{\mu,\sigma^2}\sim N\left(\mu,\sigma^2\Sigma\right),\quad\mu\in L,\sigma^2\in\mathbb{R}_+
\end{equation}
Often, we will simply let $\Sigma=I_n$. This corresponds to the situation where the components of $X$ are independent and have the same variance. This is the assumption in models like linear regression and analysis of variance.

\subsection{Likelihood function}
The density function of a multivariate normal gives us the likelihood function for the model:
\begin{equation}
L_x(\mu,\sigma^2)=\frac{1}{\sqrt{\textrm{det}(2\pi\sigma^2\Sigma)}}\exp\left[-\frac{1}{2\sigma^2}(x-\mu)^t\Sigma^{-1}(x-\mu)\right]
\end{equation}
Here $x\in\mathbb{R}^n$ is the observation. The log-likelihood is then:
\begin{equation}
l_x(\mu,\sigma^2)=\frac{1}{2}\log\left(\textrm{det}(2\pi\sigma^2\Sigma)\right)+\frac{1}{2\sigma^2}(x-\mu)^t\Sigma^{-1}(x-\mu)
\end{equation}
We can simplify this by introducing a new inner product on $V$ with associated norm:
\begin{equation}
\langle x,y\rangle_\Sigma=x^t\Sigma^{-1}y,\quad ||x||^2_\Sigma=x^t\Sigma^{-1}x
\end{equation}
Then the log-likelihood is simply:
\begin{equation}
l_X(\mu,\sigma^2)=\frac{1}{2}\log\left(\textrm{det}(2\pi\sigma^2\Sigma)\right)+\frac{1}{2\sigma^2}||x-\mu||^2_\Sigma
\end{equation}

\subsection{Maximum likelihood estimation}
We can now perform MLE on the model.

\subsubsection{MLE for $\mu$}
Remember that $\mu$ is constrained to lie in $L$. Let $\{e_1,e_2,\cdots,e_k\}$ be an orthonormal basis for $L$ with respect to the inner product $\langle\cdot,\cdot\rangle_\Sigma$. Then the constraint is:
\begin{equation}
\mu=\sum_{j=1}^k c_j e_i
\end{equation}
Now we may write the log-likelihood as a function of $c$:
\begin{equation}
\label{loglikelihood_c}
l_X(c,\sigma^2)=\frac{1}{2}\log\left(\textrm{det}(2\pi\sigma^2\Sigma)\right)+\frac{1}{2\sigma^2}||x-\sum_{j=1}^k c_j e_j||^2_\Sigma
\end{equation}
The full norm expression from equation \ref{loglikelihood_c} can be expanded as:
\begin{align}
||x-\sum_{j=1}^k c_j e_j||^2_\Sigma=&\langle x,x\rangle_\Sigma+\langle\sum_{j=1}^k c_j e_j,\sum_{j'=1}^k c_{j'}e_{j'}\rangle_\Sigma-2\langle x,\sum_{j=1}^k c_j e_j\rangle_\Sigma=\\
&\langle x,x\rangle_\Sigma+\sum_{j=1}^k\sum_{j'=1}^k c_j c_{j'}\langle e_j,e_{j'}\rangle_\Sigma-2\sum_{j=1}^k c_j\langle x,e_j\rangle_\Sigma=\\
&\langle x,x\rangle_\Sigma+\sum_{j=1}^k\left[c_j^2-2c_j\langle x,e_j\rangle_\Sigma\right]
\end{align}
Here we've used the orthonormality of the $e$'s. We now find the derivative of the log-likelihood:
\begin{equation}
\frac{l_X}{\partial c_i}=\frac{1}{2\sigma^2}\sum_{j=1}^k\left[2c_j\delta_{ij}-2\delta_{ij}\langle x,e_j\rangle_\Sigma\right]=\frac{1}{\sigma^2}[c_i-\langle x,e_i\rangle_\Sigma]
\end{equation}
These should be zero for all $i$ at minimum, so:
\begin{equation}
c_i=\langle x,e_i\rangle_\Sigma\end{equation}
But this corresponds to projecting $x$ on to $L$ with respect to the inner product $\langle\cdot,\cdot\rangle_\Sigma$. And so the maximum likelihood estimate is:
\begin{equation}
\mu_{\textrm{MLE}}=p_L(x)
\end{equation}
Here it's understood that the projection is with respect to the new inner $\Sigma$-product. This fact could also have been found by using Pythagoras' theorem:
\begin{equation}
\label{pythagoras_mle}
||x-\mu||^2_\Sigma=||x-p_L(x)||^2_\Sigma+||p_L(x)-\mu||^2_\Sigma
\end{equation}
The first term is constant given the observations. So we need to minimize $||p_L(x)-\mu||^2_\Sigma$. Which is clearly happening when $p_L(x)-\mu=0$, leading to the very same conclusion.

\subsubsection{MLE for $\sigma^2$}
The profile log-likelihood given the MLE estimate for $\mu$ is:
\begin{equation}
\tilde{l}_X(\sigma^2)=\frac{1}{2}\log\left(\textrm{det}(2\pi\sigma^2\Sigma)\right)+\frac{1}{2\sigma^2}||x-\mu_{\textrm{MLE}}||^2_\Sigma
\end{equation}
But from equation \ref{pythagoras_mle} we know that:
\begin{equation}
||x-\mu_{\textrm{MLE}}||^2_\Sigma=||x-p_L(x)||^2_\Sigma
\end{equation}
We also want to rewrite the determinant as:
\begin{equation}
\textrm{det}(2\pi\sigma^2\Sigma)=\textrm{det}(2\pi\Sigma)(\sigma^2)^n
\end{equation}
So, splitting the logarithm into a sum:
\begin{equation}
\tilde{l}_X(\sigma^2)=\frac{1}{2}\log\left(\textrm{det}(2\pi\Sigma)\right)+\frac{n}{2}\log(\sigma^2)+\frac{1}{2\sigma^2}||x-p_L(x)||^2_\Sigma
\end{equation}
The derivative with respect to $\sigma^2$ is:
\begin{equation}
\frac{\partial\tilde{l}_x}{\partial(\sigma^2)}=\frac{n}{2\sigma^2}-\frac{1}{2(\sigma^2)^2}||x-p_L(x)||^2_\Sigma=\frac{1}{2\sigma^2}\left[n-\frac{1}{\sigma^2}||x-p_L(x)||^2_\Sigma\right]
\end{equation}
For this to be zero, we must have:
\begin{equation}
\frac{1}{\sigma^2}||x-p_L(x)||^2_\Sigma=n\Leftrightarrow\sigma^2=\frac{||x-p_L(x)||^2_\Sigma}{n}
\end{equation}
This is the MLE estimate of $\sigma^2$:
\begin{equation}
\sigma^2_{\textrm{MLE}}=\frac{||x-p_L(x)||^2_\Sigma}{n}
\end{equation}
Since $p_L(x)$ and $x-p_L(x)$ are orthogonal according to the decomposition theorem, we can use Pythagoras' theorem:
\begin{equation}
||x||^2_\Sigma=||x-p_L(x)+p_L(x)||^2_\Sigma=||x-p_L(x)||^2_\Sigma+||p_L(x)||^2_\Sigma
\end{equation}
So we could also calculate the MLE estimate as:
\begin{equation}
\sigma^2_{\textrm{MLE}}=\frac{||x||^2_\Sigma-||p_L(x)||^2_\Sigma}{n}
\end{equation}
As we will shortly see, however, the MLE estimate of $\sigma^2$ is biased.

\subsection{Distribution of the MLE estimators}
We now wish to investigate the distributions of these estimates. For this, the decomposition theorem will come in handy.

First, consider the estimator for $\mu$, $p_L(X)$. We know that $p_L(X)$ is normally distributed with mean $p_L(\mu)$, where $\mu$ is the true mean. We also know that the covariance matrix is $\sigma^2 p_L\Sigma p_L$, where once again $\sigma^2$ is the true value. But this is inconvenient, since we know that it has rank $k$ - the values are constrained to $L$. Let $\{e_1,\cdots,e_k\}$ be a basis for $L$. Then an arbitrary point $x$ in $L$ can be written:
\begin{equation}
x=\sum_{i=1}^k\beta_i e_i=A\beta
\end{equation}
Here, we've collected the basis vectors in the $n\times k$ matrix $A$:
\begin{equation}
A=
\begin{pmatrix}
| & \cdots & | \\
e_1 & \cdots & e_k \\
| & \cdots & |
\end{pmatrix}
\end{equation}
The question now is what the distribution of the vector $\beta$ is when $X=A\beta$. $A$ is not invertible, but it has the left inverse $A_L^{-1}=(A^t A)^{-1}A^t$:
\begin{equation}
(A^t A)^{-1}A^t X=(A^t A)^{-1}A^t A\beta=\beta
\end{equation}
So, $\beta$ is normally distributed with mean $A_L^{-1}\mu$. But $A_L^{-1}=p_\beta$ is the projection operator on $L$ in terms of the basis $\{e_1,\cdots,e_k\}$, so we get exactly the $\beta$-coordinates of $p_L(\mu)$. The covariance matrix is:
\begin{equation}
\underbrace{A((A^t A)^{-1})^t}_{p_\beta^t}\sigma^2\Sigma\underbrace{(A^t A)^{-1}A^t}_{p_\beta}
\end{equation}
So, essentially the true covariance matrix transformed into the $\beta$ space.

Now consider the estimator of the variance $\sigma^2$. According to the decomposition theorem, $X-p_L(X)$ is normally distributed with a covariance matrix $C$ of rank $n-k$. Consider:
\begin{equation}

\end{equation}


\section{Abstract definition of a linear model}
This section will deal with the linear model in its most abstract form.

Let $V$ be a vector space of finite dimension $N$. To specify a linear model we need two ingredients:
\begin{itemize}
\item A subspace $L\subset V$. Do note that we require $L$ to be a proper subset of $V$, i.e. $\textrm{dim}L<N$. This subspace is known as the \textit{mean value subspace}.
\item An inner product $\langle\cdot,\cdot\rangle$ on $V$.
\end{itemize}
The inner product induces a family of inner products $\llangle\cdot,\cdot\rrangle_{\sigma^2}$ parametrized by $\sigma^2>0$:
\begin{equation}
\llangle\cdot,\cdot\rrangle_{\sigma^2}=\frac{\langle\cdot,\cdot\rangle}{\sigma^2}
\end{equation}
These inner products are known as \textit{precisions}. While they do not agree on distances, the precisions do agree on orthogonality.

The linear model


\end{document}