% Source: statnotes/errors.tex (kwichmann/statnotes on GitHub)
\documentclass[12pt, a4paper]{article}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{mathtools}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{definition}{Definition}[section]
\numberwithin{equation}{section}
\usepackage{pgfplots}
\pgfplotsset{width=10cm,compat=1.9}
\graphicspath{ {img/} }
\DeclareGraphicsExtensions{.png}

\title{Error functions - classification}
\author{Kristian Wichmann}

\begin{document}
\maketitle

\section{Continuous-valued predictors}
Most predictors for classification will be based on a continuous-valued score for each of the possible outcomes. The prediction is then made based on these values (usually, the one with the highest score is chosen).

Given the values from such a predictor, we would like to have some quantitative measure of how good the prediction is.

\subsection{Terminology}
Consider classification between $n$ cases. If the true classification for a sample $x$ is $k$, then we define the $n$-dimensional vector $t$ by $t_i=\delta_{ik}$, where $\delta_{ik}$ is the Kronecker delta. Let the values made by the predictor be the vector $h$, also an $n$-dimensional vector.

\section{Error functions}

\subsection{Squared error}
In this scheme we use the same error as used for OLS regression---the squared error:
\begin{equation}
\label{error_squared}
J(h)=\frac{1}{2}\lVert h-t\rVert^2=\frac{1}{2}(h-t)^T(h-t)=\frac{1}{2}\sum_{i=1}^n(h_i-t_i)^2
\end{equation}

\subsubsection{Derivative}
The derivative with respect to the $h_j$ is:
\begin{equation}
\frac{\partial}{\partial h_j}J(h)=\frac{1}{2}\sum_{i=1}^n\delta_{ij}2(h_i-t_i)=h_j-t_j
\end{equation}

\subsection{Cross-entropy error: Bernoulli}
This error is based on the negative sum of the log-likelihoods of $n$ Bernoulli experiments:
\begin{equation}
\label{error_crossentropy_bernoulli}
J(h)=\sum_{i=1}^n\left[-t_i\log(h_i)-(1-t_i)\log(1-h_i)\right]
\end{equation}

\subsubsection{Derivative}
The derivative can be shown to be:
\begin{equation}
\frac{\partial}{\partial h_j}J(h)=\frac{h_j-t_j}{h_j(1-h_j)}
\end{equation}

\subsection{Cross-entropy error}
This error is similar to the one above, but here we consider $h$ to be a vector of probabilities. These must sum to one. The cross-entropy between $t$ and $h$ is then:
\begin{equation}
\label{error_crossentropy}
J(h)=-\sum_{i=1}^n t_i\log(h_i)=-\log(h_k)
\end{equation}
In the last step we have used that $t_i=\delta_{ik}$.

\subsubsection{Derivative}
Once again, we're interested in the derivative:
\begin{equation}
\frac{\partial}{\partial h_j}J(h)=-\delta_{jk}\frac{1}{h_k}
\end{equation}

\section{Forms of the hypothesis $h$}
The hypothesis $h$ is in general a function of some $m$-dimensional input feature vector $x$, so $h=h(x)$. Here we will consider only hypotheses in which $h_i$ depends on linearly weighted combinations of the components of $x$. The resulting vector $z$ can therefore be written $z=w^T x$ in matrix form, or element-wise $z_i=w_i^T x$. In other words, we only look at $h_i(x)=f_i(z)=f_i(w^T x)$. The functions $f_i$ are known as the \emph{activation functions}.

In general the derivative with respect to a weight $w_{jk}$ (the $k$th component of the weight vector $w_j$) will then be:
\begin{equation}
\frac{\partial}{\partial w_{jk}}h_i=\sum_l\frac{\partial f_i}{\partial z_l}\frac{\partial z_l}{\partial w_{jk}}
\end{equation}
Since the last partial derivative is $\delta_{jl}x_k$ this means:
\begin{equation}
\label{hypothesis_derivative}
\frac{\partial}{\partial w_{jk}}h_i=\frac{\partial f_i}{\partial z_j}x_k
\end{equation}
Often, the activation function $f_i$ depends only on $z_i$, and so only derivatives with respect to $w_{ij}$ are non-zero:
\begin{equation}
\label{hypothesis_derivative2}
\frac{\partial}{\partial w_{ij}}h_i=\frac{\partial f_i}{\partial z_i}x_j
\end{equation}

\subsection{Linear hypothesis}
Here, the hypothesis is simply equal to $w_i^T x$, so that:
\begin{equation}
\label{hyp_linear}
h_i(x)=w_i^Tx
\end{equation}

\subsubsection{Derivative}
Since we're using the identity function as $f$, equation \ref{hypothesis_derivative2} is simply:
\begin{equation}
\frac{\partial}{\partial w_{ij}}h_i=x_j
\end{equation}

\subsection{Logistic hypothesis}
Here, the hypothesis is the logistic function of $w_i^T x$. The logistic function is:
\begin{equation}
\sigma(z)=\frac{1}{1+e^{-z}}
\end{equation}
And so, the hypothesis is:
\begin{equation}
\label{hyp_logistic}
h_i(x)=\sigma(w_i^T x)
\end{equation}

\subsubsection{Derivative}
We seek the derivative with respect to the weight $w_{ij}$. According to equation \ref{hypothesis_derivative2}:
\begin{equation}
\frac{\partial}{\partial w_{ij}}h_i=\frac{\partial\sigma}{\partial z_i}x_j
\end{equation}
The derivative of the logistic function can be shown to be $\sigma'(z)=\sigma(z)(1-\sigma(z))$. So:
\begin{equation}
\frac{\partial}{\partial w_{ij}}h_i=\sigma(z_i)(1-\sigma(z_i))x_j
\end{equation}

\subsection{Softmax hypothesis}
Here, the softmax function is used to normalize the hypothesis values, so that they sum to 1. The softmax is defined by:
\begin{equation}
s_i(z)=\frac{e^{z_i}}{\sum_{j=1}^n e^{z_j}}
\end{equation}
And the hypothesis:
\begin{equation}
\label{hyp_softmax}
h_i(x)=s_i(w^T x)=\frac{e^{w_i^T x}}{N(w,x)}
\end{equation}
Here, $w$ is a matrix in which the columns are the weights $w_i$, and $N(w,x)$ is the normalization constant $\sum_{j=1}^n e^{z_j}$ with $z_j=w_j^T x$.

\subsubsection{Derivative}
It can be shown that the derivatives of the softmax are:
\begin{equation}
\frac{\partial s_i(z)}{\partial z_j}=s_i(z)(\delta_{ij}-s_j(z))
\end{equation}
So according to equation \ref{hypothesis_derivative}:
\begin{equation}
\frac{\partial}{\partial w_{jk}}h_i=s_i(z)(\delta_{ij}-s_j(z))x_k
\end{equation}

\section{Combining error and hypothesis types}

\subsection{Squared error with logistic hypothesis}
Combining equations \ref{error_squared} and \ref{hyp_logistic} we get the error function:
\begin{equation}
J(w,x)=\frac{1}{2}\sum_{i=1}^n(\sigma(w_i^T x)-t_i)^2
\end{equation}

\subsection{Cross-entropy Bernoulli error with logistic hypo\-thesis}
Combining equations \ref{error_crossentropy_bernoulli} and \ref{hyp_logistic} we arrive at the error function:
\begin{equation}
J(w,x)=\sum_{i=1}^n\left[-t_i\log(\sigma(w_i^T x))-(1-t_i)\log(1-\sigma(w_i^T x))\right]
\end{equation}

\subsection{Cross-entropy error with softmax hypothesis}
Combining equations \ref{error_crossentropy} and \ref{hyp_softmax} we arrive at the error function:
\begin{equation}
J(w,x)=-\log\frac{e^{w_k^T x}}{N(w,x)}=\log N(w,x)-w_k^T x
\end{equation}

\end{document}