Block.m
function [out] = Block(Weights, in)
% Block  Forward pass of a single transformer block.
%   out = Block(Weights, in) applies layer normalization, self-attention and a
%   two-layer MLP to the activations `in`, each wrapped in a residual connection.
%   Weights: a structure containing all weights of the block:
%
%                                         ▲
%                                         │ out
%                                      ┌──┴──┐
%                                      │  +  ◄────────────┐
%                                      └──▲──┘            │
%                                         │               │
%                                ┌───────┴──────┐         │
%          ┌─────────────────────► Linear Layer │         │
%          │ mlp_c_proj_weight   └───────▲──────┘         │
%          │ mlp_c_proj_bias             │               │
%          │                             │               │
%          │                        ┌────┴────┐          │
%          │                        │  NGELU  │          │
%          │                        └────▲────┘          │ temp
%          │                             │               │
%          │                     ┌───────┴──────┐        │
%          ├─────────────────────► Linear Layer │        │
%          │ mlp_c_fc_weight     └───────▲──────┘        │
%          │ mlp_c_fc_bias               │               │
%          │                     ┌───────┴──────┐        │
%          ├─────────────────────►  Layer Norm  │        │
%          │ ln_2_weight         └───────▲──────┘        │
%          │ ln_2_bias                   │               │
%          │                             ├───────────────┘
%          │                             │
%          │                          ┌──┴──┐
% ─────────┤                          │  +  ◄────────────┐
% Weights  │ attn_c_proj_weight       └──▲──┘            │
%          │ attn_c_proj_bias            │               │
%          │                   ┌─────────┴────────┐      │
%          ├───────────────────►  Self Attention  │      │
%          │                   └─────────▲────────┘      │
%          │ attn_c_attn_weight          │               │ temp
%          │ attn_c_attn_bias            │               │
%          │                     ┌───────┴──────┐        │
%          └─────────────────────►  Layer Norm  │        │
%            ln_1_weight         └───────▲──────┘        │
%            ln_1_bias                   │               │
%                                        ├───────────────┘
%                                        │ in
%
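% Example (illustrative sketch only, not taken from this repo): build a
% randomly initialized Weights struct for a small model and run one block.
% The MLP weight shapes follow the comments below; the LayerNorm and
% attention shapes (and anything else SelfAttention may expect, such as a
% head count) are assumptions based on the GPT-2 layout, so check the
% LayerNorm / SelfAttention implementations before relying on them.
%
%   d_model = 8;  N = 4;                               % tokens x embedding size
%   W.ln_1_weight = ones(1, d_model);   W.ln_1_bias = zeros(1, d_model);
%   W.ln_2_weight = ones(1, d_model);   W.ln_2_bias = zeros(1, d_model);
%   W.attn_c_attn_weight = randn(3*d_model, d_model);  % assumed: packed Q,K,V
%   W.attn_c_attn_bias   = zeros(1, 3*d_model);
%   W.attn_c_proj_weight = randn(d_model, d_model);
%   W.attn_c_proj_bias   = zeros(1, d_model);
%   W.mlp_c_fc_weight    = randn(4*d_model, d_model);
%   W.mlp_c_fc_bias      = zeros(1, 4*d_model);
%   W.mlp_c_proj_weight  = randn(d_model, 4*d_model);
%   W.mlp_c_proj_bias    = zeros(1, d_model);
%   out = Block(W, randn(N, d_model));                 % out is N x d_model
%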
% Save the input for the first residual connection
temp = in;
% Layer norm before self-attention (ln_1)
in = LayerNorm(Weights.ln_1_weight, Weights.ln_1_bias, in);
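% (LayerNorm is expected to normalize each row of `in` to zero mean and unit
%  variance and then scale/shift it elementwise with ln_1_weight / ln_1_bias.)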
% Do self-attention
in = SelfAttention(Weights, in);
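% (SelfAttention presumably forms Q, K, V from `in` via attn_c_attn_weight/bias,
%  applies causal scaled dot-product attention per head, and projects the result
%  back to d_model with attn_c_proj_weight/bias.)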
% Residual connection: add the attention output back to the input
in = temp + in;
% Save the result for the second residual connection
temp = in;
% Layer norm before the MLP (ln_2)
in = LayerNorm(Weights.ln_2_weight, Weights.ln_2_bias, in);
% Fully connected layer 1: d_model to 4*d_model
% in : N * d_model
% *weight: (4*d_model) * d_model
% *bias : 1 * (4*d_model)
in = in * Weights.mlp_c_fc_weight' + Weights.mlp_c_fc_bias;
% Activation function
in = NGELU(in);
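% (NGELU presumably implements the GPT-2 "new GELU" tanh approximation:
%  0.5*x .* (1 + tanh(sqrt(2/pi) * (x + 0.044715*x.^3))).)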
% Fully connected layer 2: 4*d_model to d_model
% in : N * (4 * d_model)
% *weight: d_model * (4*d_model)
% *bias : 1 * d_model
in = in * Weights.mlp_c_proj_weight' + Weights.mlp_c_proj_bias;
% Second residual connection
out = in + temp;
% Dropout is not needed at inference time (forward pass only)
% out = Dropout(in, 0.1);
end