From 93b7aa2f787a7b1efb9a7946c7d5c44b184efbc0 Mon Sep 17 00:00:00 2001
From: Li Xing
Date: Thu, 22 Feb 2024 10:33:33 +0800
Subject: [PATCH 1/3] split Attention and Transformer

---
 .gitignore             | 162 ++++++++++++++++++++++++++++
 model/Attention.py     | 207 +++++++++++++++++++++++++++++++++++
 model/MyTransformer.py | 237 ++++++++--------------------------------
 3 files changed, 412 insertions(+), 194 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 model/Attention.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..66efa11
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,162 @@
+cache
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

diff --git a/model/Attention.py b/model/Attention.py
new file mode 100644
index 0000000..f8bd286
--- /dev/null
+++ b/model/Attention.py
@@ -0,0 +1,207 @@
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Parameter
+from torch.nn.init import xavier_uniform_
+
+is_print_shape = True
+
+
+class MyMultiheadAttention(nn.Module):
+    r"""
+    Multi-head attention, i.e. the formula on page 5 of the paper:
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \dots, \text{head}_h)W^O
+        \quad \text{where } \text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+    """
+
+    def __init__(self, embed_dim, num_heads, dropout=0., bias=True):
+        super(MyMultiheadAttention, self).__init__()
+        """
+        :param embed_dim: the embedding dimension, i.e. the d_model parameter above; the paper's default is 512
+        :param num_heads: the number of attention heads, i.e. the nhead parameter above; the paper's default is 8
+        :param dropout:
+        :param bias: whether to use a bias in the final linear projection of the (concatenated) head outputs
+        """
+        self.embed_dim = embed_dim  # the d_model parameter above
+        self.head_dim = embed_dim // num_heads  # head_dim is d_k and d_v
+        self.kdim = self.head_dim
+        self.vdim = self.head_dim
+
+        self.num_heads = num_heads  # number of heads
+        self.dropout = dropout
+
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        # This enforces the paper's constraint d_k = d_v = d_model / n_head
+
+        # embed_dim = kdim * num_heads
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        # The second dimension is embed_dim because this initializes all num_heads copies of W_q
+        # stacked together, i.e. one projection per head
+        # W_k, embed_dim = kdim * num_heads
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        # W_v, embed_dim = vdim * num_heads
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        # Combining all the Z outputs at the end is likewise done in one shot, embed_dim = vdim * num_heads
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """
+        Initialize the parameters in a specific way
+        :return:
+        """
+        for p in self.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+
+    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
+        """
+        In the paper, query, key and value are all the same input during encoding, and likewise
+        during decoding; in the encoder-decoder interaction, key and value refer to memory while
+        query refers to tgt
+        :param query: # [tgt_len, batch_size, embed_dim], where tgt_len is the target sequence length
+        :param key: # [src_len, batch_size, embed_dim], where src_len is the source sequence length
+        :param value: # [src_len, batch_size, embed_dim], where src_len is the source sequence length
+        :param attn_mask: # [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len]
+                Generally only used during decoding: so that all decoder inputs can be fed in
+                parallel, the mask hides the positions after the current time step
+        :param key_padding_mask: [batch_size, src_len], where src_len is the source sequence length
+        :return:
+        attn_output: [tgt_len, batch_size, embed_dim]
+        attn_output_weights: # [batch_size, tgt_len, src_len]
+        """
+        return multi_head_attention_forward(query, key, value, self.num_heads,
+                                            self.dropout,
+                                            out_proj=self.out_proj,
+                                            training=self.training,
+                                            key_padding_mask=key_padding_mask,
+                                            q_proj=self.q_proj,
+                                            k_proj=self.k_proj,
+                                            v_proj=self.v_proj,
+                                            attn_mask=attn_mask)
+
+
+def multi_head_attention_forward(query,  # [tgt_len, batch_size, embed_dim]
+                                 key,  # [src_len, batch_size, embed_dim]
+                                 value,  # [src_len, batch_size, embed_dim]
+                                 num_heads,
+                                 dropout_p,
+                                 # [embed_dim = vdim * num_heads, embed_dim = vdim * num_heads]
+                                 out_proj,
+                                 training=True,
+                                 # [batch_size, src_len/tgt_len]
+                                 key_padding_mask=None,
+                                 q_proj=None,  # [embed_dim, kdim * num_heads]
+                                 k_proj=None,  # [embed_dim, kdim * num_heads]
+                                 v_proj=None,  # [embed_dim, vdim * num_heads]
+                                 # [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len]
+                                 attn_mask=None,
+                                 ):
+    q = q_proj(query)
+    # [tgt_len, batch_size, embed_dim] x [embed_dim, kdim * num_heads] = [tgt_len, batch_size, kdim * num_heads]
+
+    k = k_proj(key)
+    # [src_len, batch_size, embed_dim] x [embed_dim, kdim * num_heads] = [src_len, batch_size, kdim * num_heads]
+
+    v = v_proj(value)
+    # [src_len, batch_size, embed_dim] x [embed_dim, vdim * num_heads] = [src_len, batch_size, vdim * num_heads]
+    if is_print_shape:
+        print("" + "=" * 80)
+        print("Entering the multi-head attention computation:")
+        print(f"\t num_heads = {num_heads}, d_model={query.size(-1)}, "
+              f"d_k = d_v = d_model/num_heads={query.size(-1) // num_heads}")
+        print(f"\t shape of query ([tgt_len, batch_size, embed_dim]): {query.shape}")
+        print(f"\t shape of W_q ([embed_dim, kdim * num_heads]): {q_proj.weight.shape}")
+        print(f"\t shape of Q ([tgt_len, batch_size, kdim * num_heads]): {q.shape}")
+        print("\t" + "-" * 70)
+
+        print(f"\t shape of key ([src_len, batch_size, embed_dim]): {key.shape}")
+        print(f"\t shape of W_k ([embed_dim, kdim * num_heads]): {k_proj.weight.shape}")
+        print(f"\t shape of K ([src_len, batch_size, kdim * num_heads]): {k.shape}")
+        print("\t" + "-" * 70)
+
+        print(f"\t shape of value ([src_len, batch_size, embed_dim]): {value.shape}")
+        print(f"\t shape of W_v ([embed_dim, vdim * num_heads]): {v_proj.weight.shape}")
+        print(f"\t shape of V ([src_len, batch_size, vdim * num_heads]): {v.shape}")
+        print("\t" + "-" * 70)
+        print("\t ***** Note: W_q, W_k and W_v here compute all heads at once, so Q, K and V "
+              "are each the stacked q, k, v of every head *****")
+
+    tgt_len, bsz, embed_dim = query.size()  # [tgt_len, batch_size, embed_dim]
+    src_len = key.size(0)
+    head_dim = embed_dim // num_heads  # num_heads * head_dim = embed_dim
+    scaling = float(head_dim) ** -0.5
+    q = q * scaling  # [query_len, batch_size, kdim * num_heads]
+
+    # [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len]
+    if attn_mask is not None:
+        if attn_mask.dim() == 2:
+            attn_mask = attn_mask.unsqueeze(0)  # [1, tgt_len, src_len]
+            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+                raise RuntimeError(
+                    'The size of the 2D attn_mask is not correct.')
+        elif attn_mask.dim() == 3:
+            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
+                raise RuntimeError(
+                    'The size of the 3D attn_mask is not correct.')
+        # attn_mask is now 3D
+
+    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+    # [batch_size * num_heads, tgt_len, kdim]
+    # All num_heads heads were computed together above, so reshape here for the steps that
+    # follow, swapping dimensions 0 and 1 at the same time
+    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0,
+                                                                     1)  # [batch_size * num_heads, src_len, kdim]
+    v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0,
+                                                                     1)  # [batch_size * num_heads, src_len, vdim]
+    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+    # [batch_size * num_heads, tgt_len, kdim] x [batch_size * num_heads, kdim, src_len]
+    # = [batch_size * num_heads, tgt_len, src_len]  the attention matrices of the num_heads QK products
+
+    if attn_mask is not None:
+        # [batch_size * num_heads, tgt_len, src_len]
+        attn_output_weights += attn_mask
+
+    if key_padding_mask is not None:
+        attn_output_weights = attn_output_weights.view(
+            bsz, num_heads, tgt_len, src_len)
+        # reshape to [batch_size, num_heads, tgt_len, src_len]
+        attn_output_weights = attn_output_weights.masked_fill(
+            key_padding_mask.unsqueeze(1).unsqueeze(2),
+            float('-inf'))
+        # key_padding_mask is expanded from [batch_size, src_len] to [batch_size, 1, 1, src_len],
+        # then used to fill attn_output_weights
+        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len,
+                                                       src_len)  # [batch_size * num_heads, tgt_len, src_len]
+
+    # [batch_size * num_heads, tgt_len, src_len]
+    attn_output_weights = F.softmax(attn_output_weights, dim=-1)
+    attn_output_weights = F.dropout(
+        attn_output_weights, p=dropout_p, training=training)
+    attn_output = torch.bmm(attn_output_weights, v)
+    # Z = [batch_size * num_heads, tgt_len, src_len] x [batch_size * num_heads, src_len, vdim]
+    #   = [batch_size * num_heads, tgt_len, vdim]
+    # i.e. the Attention(Q, K, V) result of each of the num_heads heads
+
+    attn_output = attn_output.transpose(
+        0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+    # first transpose to [tgt_len, batch_size * num_heads, kdim]
+    # then view as [tgt_len, batch_size, num_heads * kdim]
+    attn_output_weights = attn_output_weights.view(
+        bsz, num_heads, tgt_len, src_len)
+
+    Z = out_proj(attn_output)
+    # linearly combine the individual z's into Z  [tgt_len, batch_size, embed_dim]
+    if is_print_shape:
+        print(f"\t shape of the stacked multi-head output ([tgt_len, batch_size, num_heads*kdim]): {attn_output.shape}")
+        print(f"\t shape of the final projection W_o ([num_heads*vdim, num_heads*vdim]): {out_proj.weight.shape}")
+        print(f"\t shape after the final linear projection ([tgt_len, batch_size, embed_dim]): {Z.shape}")
+    # average attention weights over heads
+    return Z, attn_output_weights.sum(dim=1) / num_heads
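A note on the stacked projections in model/Attention.py above: because embed_dim = num_heads * head_dim, a single nn.Linear(embed_dim, embed_dim) computes every per-head projection at once, and slicing its output into head_dim-wide chunks recovers the individual heads. A minimal sketch of this equivalence (the variable names are illustrative only, not part of the patch):

    import torch
    import torch.nn as nn

    embed_dim, num_heads = 8, 2
    head_dim = embed_dim // num_heads
    q_proj = nn.Linear(embed_dim, embed_dim, bias=False)  # all heads' W_q stacked

    x = torch.randn(5, embed_dim)                # 5 tokens
    q = q_proj(x)                                # [5, num_heads * head_dim]
    q_heads = q.view(5, num_heads, head_dim)     # split back into per-head queries

    # projecting with a head_dim-wide slice of the stacked weight gives the same result
    w0 = q_proj.weight[:head_dim, :]             # W_q of head 0, shape [head_dim, embed_dim]
    assert torch.allclose(q_heads[:, 0, :], x @ w0.T, atol=1e-6)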
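And a quick usage sketch for MyMultiheadAttention itself, combining an additive causal attn_mask with a boolean key_padding_mask (tensor values made up for illustration; with is_print_shape = True the module also prints the intermediate shapes):

    import torch
    from model.Attention import MyMultiheadAttention

    mha = MyMultiheadAttention(embed_dim=512, num_heads=8)
    src = torch.randn(7, 2, 512)  # [src_len=7, batch_size=2, embed_dim=512]

    # hide future positions: 0 where attending is allowed, -inf above the diagonal
    causal = torch.triu(torch.full((7, 7), float('-inf')), diagonal=1)
    # True marks padded key positions that should receive no attention
    padding = torch.zeros(2, 7, dtype=torch.bool)
    padding[1, 5:] = True  # suppose sample 1 is padded after position 5

    out, weights = mha(src, src, src, attn_mask=causal, key_padding_mask=padding)
    print(out.shape)      # torch.Size([7, 2, 512])
    print(weights.shape)  # torch.Size([2, 7, 7]), averaged over heads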
diff --git a/model/MyTransformer.py b/model/MyTransformer.py
index 4e11c85..b603406 100755
--- a/model/MyTransformer.py
+++ b/model/MyTransformer.py
@@ -1,9 +1,10 @@
-from torch.nn.init import xavier_uniform_
-import torch.nn.functional as F
-from torch.nn import Parameter
-import torch.nn as nn
 import copy
+
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Parameter
+from torch.nn.init import xavier_uniform_
 
 is_print_shape = True
 
@@ -24,14 +25,18 @@ def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
         """
 
         # ================ Encoder section =====================
-        encoder_layer = MyTransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
+        encoder_layer = MyTransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout)
         encoder_norm = nn.LayerNorm(d_model)
-        self.encoder = MyTransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+        self.encoder = MyTransformerEncoder(
+            encoder_layer, num_encoder_layers, encoder_norm)
 
         # ================ Decoder section =====================
-        decoder_layer = MyTransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
+        decoder_layer = MyTransformerDecoderLayer(
+            d_model, nhead, dim_feedforward, dropout)
         decoder_norm = nn.LayerNorm(d_model)
-        self.decoder = MyTransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
+        self.decoder = MyTransformerDecoder(
+            decoder_layer, num_decoder_layers, decoder_norm)
 
         self._reset_parameters()
 
@@ -61,19 +66,22 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None,
         :param memory_key_padding_mask: [batch_size, src_len]
         :return: [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
         """
-        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
+        memory = self.encoder(src, mask=src_mask,
+                              src_key_padding_mask=src_key_padding_mask)
         # [src_len, batch_size, num_heads * kdim] <==> [src_len, batch_size, embed_dim]
         output = self.decoder(tgt=tgt, memory=memory, tgt_mask=tgt_mask,
                               memory_mask=memory_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask,
                               memory_key_padding_mask=memory_key_padding_mask)
-        return output  # [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
+        # [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
+        return output
 
     def generate_square_subsequent_mask(self, sz):
         r"""Generate a square mask for the sequence. The masked positions are
             filled with float('-inf'). Unmasked positions are filled with float(0.0).
         """
         mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+        mask = mask.float().masked_fill(mask == 0, float(
+            '-inf')).masked_fill(mask == 1, float(0.0))
         return mask  # [sz, sz]
 
@@ -110,16 +118,20 @@ def forward(self, src, src_mask=None, src_key_padding_mask=None):
         :return
         """
         src2 = self.self_attn(src, src, src, attn_mask=src_mask,
-                              key_padding_mask=src_key_padding_mask, )[0]  # compute multi-head attention
+                              # compute multi-head attention
+                              key_padding_mask=src_key_padding_mask, )[0]
         # src2: [src_len, batch_size, num_heads*kdim]  num_heads*kdim = embed_dim
         src = src + self.dropout1(src2)  # residual connection
         src = self.norm1(src)  # [src_len, batch_size, num_heads*kdim]
 
-        src2 = self.activation(self.linear1(src))  # [src_len, batch_size, dim_feedforward]
-        src2 = self.linear2(self.dropout(src2))  # [src_len, batch_size, num_heads*kdim]
+        # [src_len, batch_size, dim_feedforward]
+        src2 = self.activation(self.linear1(src))
+        # [src_len, batch_size, num_heads*kdim]
+        src2 = self.linear2(self.dropout(src2))
         src = src + self.dropout2(src2)
         src = self.norm2(src)
-        return src  # [src_len, batch_size, num_heads * kdim] <==> [src_len, batch_size, embed_dim]
+        # [src_len, batch_size, num_heads * kdim] <==> [src_len, batch_size, embed_dim]
+        return src
 
 
 class MyTransformerEncoder(nn.Module):
@@ -131,7 +143,8 @@ def __init__(self, encoder_layer, num_layers, norm=None):
         norm: the normalization layer
         """
 
-        self.layers = _get_clones(encoder_layer, num_layers)  # clone multiple encoder layers; the paper's default is 6
+        self.layers = _get_clones(
+            encoder_layer, num_layers)  # clone multiple encoder layers; the paper's default is 6
         self.num_layers = num_layers
         self.norm = norm
 
@@ -147,7 +160,8 @@ def forward(self, src, mask=None, src_key_padding_mask=None):
                          src_key_padding_mask=src_key_padding_mask)  # forward pass through the stacked encoder layers
         if self.norm is not None:
             output = self.norm(output)
-        return output  # [src_len, batch_size, num_heads * kdim] <==> [src_len, batch_size, embed_dim]
+        # [src_len, batch_size, num_heads * kdim] <==> [src_len, batch_size, embed_dim]
+        return output
 
 
 def _get_clones(module, N):
@@ -163,9 +177,11 @@ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
         :param dim_feedforward: dimension of the feed-forward layer; the paper's default is 2048
         :param dropout: dropout rate; the paper's default is 0.1
         """
-        self.self_attn = MyMultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=dropout)
+        self.self_attn = MyMultiheadAttention(
+            embed_dim=d_model, num_heads=nhead, dropout=dropout)
         # multi-head attention over the decoder's own input sequence
         # (the Masked Multi-head attention in the paper's architecture diagram)
-        self.multihead_attn = MyMultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=dropout)
+        self.multihead_attn = MyMultiheadAttention(
+            embed_dim=d_model, num_heads=nhead, dropout=dropout)
         # multi-head attention between the encoder output (memory) and the decoder
 
         # Implementation of Feedforward model
@@ -209,12 +225,15 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_
         tgt = tgt + self.dropout2(tgt2)  # residual connection
         tgt = self.norm2(tgt)  # [tgt_len, batch_size, embed_dim]
 
-        tgt2 = self.activation(self.linear1(tgt))  # [tgt_len, batch_size, dim_feedforward]
-        tgt2 = self.linear2(self.dropout(tgt2))  # [tgt_len, batch_size, embed_dim]
+        # [tgt_len, batch_size, dim_feedforward]
+        tgt2 = self.activation(self.linear1(tgt))
+        # [tgt_len, batch_size, embed_dim]
+        tgt2 = self.linear2(self.dropout(tgt2))
         # the final two fully connected layers
         tgt = tgt + self.dropout3(tgt2)
         tgt = self.norm3(tgt)
-        return tgt  # [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
+        # [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
+        return tgt
 
 
 class MyTransformerDecoder(nn.Module):
@@ -246,175 +265,5 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_
         if self.norm is not None:
             output = self.norm(output)
 
-        return output  # [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
-
-
-class MyMultiheadAttention(nn.Module):
-    """
-    Multi-head attention, i.e. the formula on page 5 of the paper:
-    .. math::
-        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1, \dots, head_h)W^O
-        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
-    """
-
-    def __init__(self, embed_dim, num_heads, dropout=0., bias=True):
-        super(MyMultiheadAttention, self).__init__()
-        """
-        :param embed_dim: the embedding dimension, i.e. the d_model parameter above; the paper's default is 512
-        :param num_heads: the number of attention heads, i.e. the nhead parameter above; the paper's default is 8
-        :param dropout:
-        :param bias: whether to use a bias in the final linear projection of the (concatenated) head outputs
-        """
-        self.embed_dim = embed_dim  # the d_model parameter above
-        self.head_dim = embed_dim // num_heads  # head_dim is d_k and d_v
-        self.kdim = self.head_dim
-        self.vdim = self.head_dim
-
-        self.num_heads = num_heads  # number of heads
-        self.dropout = dropout
-
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-        # This enforces the paper's constraint d_k = d_v = d_model / n_head
-
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # embed_dim = kdim * num_heads
-        # The second dimension is embed_dim because this initializes all num_heads copies of W_q stacked together, i.e. one per head
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # W_k, embed_dim = kdim * num_heads
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # W_v, embed_dim = vdim * num_heads
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        # Combining all the Z outputs at the end is likewise done in one shot, embed_dim = vdim * num_heads
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        """
-        Initialize the parameters in a specific way
-        :return:
-        """
-        for p in self.parameters():
-            if p.dim() > 1:
-                xavier_uniform_(p)
-
-    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
-        """
-        In the paper, query, key and value are all the same input during encoding, and likewise
-        during decoding; in the encoder-decoder interaction, key and value refer to memory while
-        query refers to tgt
-        :param query: # [tgt_len, batch_size, embed_dim], where tgt_len is the target sequence length
-        :param key: # [src_len, batch_size, embed_dim], where src_len is the source sequence length
-        :param value: # [src_len, batch_size, embed_dim], where src_len is the source sequence length
-        :param attn_mask: # [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len]
-                Generally only used during decoding: so that all decoder inputs can be fed in
-                parallel, the mask hides the positions after the current time step
-        :param key_padding_mask: [batch_size, src_len], where src_len is the source sequence length
-        :return:
-        attn_output: [tgt_len, batch_size, embed_dim]
-        attn_output_weights: # [batch_size, tgt_len, src_len]
-        """
-        return multi_head_attention_forward(query, key, value, self.num_heads,
-                                            self.dropout,
-                                            out_proj=self.out_proj,
-                                            training=self.training,
-                                            key_padding_mask=key_padding_mask,
-                                            q_proj=self.q_proj,
-                                            k_proj=self.k_proj,
-                                            v_proj=self.v_proj,
-                                            attn_mask=attn_mask)
-
-
-def multi_head_attention_forward(query,  # [tgt_len, batch_size, embed_dim]
-                                 key,  # [src_len, batch_size, embed_dim]
-                                 value,  # [src_len, batch_size, embed_dim]
-                                 num_heads,
-                                 dropout_p,
-                                 out_proj,  # [embed_dim = vdim * num_heads, embed_dim = vdim * num_heads]
-                                 training=True,
-                                 key_padding_mask=None,  # [batch_size, src_len/tgt_len]
-                                 q_proj=None,  # [embed_dim, kdim * num_heads]
-                                 k_proj=None,  # [embed_dim, kdim * num_heads]
-                                 v_proj=None,  # [embed_dim, vdim * num_heads]
-                                 attn_mask=None,  # [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len]
-                                 ):
-    q = q_proj(query)
-    # [tgt_len, batch_size, embed_dim] x [embed_dim, kdim * num_heads] = [tgt_len, batch_size, kdim * num_heads]
-
-    k = k_proj(key)
-    # [src_len, batch_size, embed_dim] x [embed_dim, kdim * num_heads] = [src_len, batch_size, kdim * num_heads]
-
-    v = v_proj(value)
-    # [src_len, batch_size, embed_dim] x [embed_dim, vdim * num_heads] = [src_len, batch_size, vdim * num_heads]
-    if is_print_shape:
-        print("" + "=" * 80)
-        print("Entering the multi-head attention computation:")
-        print(f"\t num_heads = {num_heads}, d_model={query.size(-1)}, d_k = d_v = d_model/num_heads={query.size(-1) // num_heads}")
-        print(f"\t shape of query ([tgt_len, batch_size, embed_dim]): {query.shape}")
-        print(f"\t shape of W_q ([embed_dim, kdim * num_heads]): {q_proj.weight.shape}")
-        print(f"\t shape of Q ([tgt_len, batch_size, kdim * num_heads]): {q.shape}")
-        print("\t" + "-" * 70)
-
-        print(f"\t shape of key ([src_len, batch_size, embed_dim]): {key.shape}")
-        print(f"\t shape of W_k ([embed_dim, kdim * num_heads]): {k_proj.weight.shape}")
-        print(f"\t shape of K ([src_len, batch_size, kdim * num_heads]): {k.shape}")
-        print("\t" + "-" * 70)
-
-        print(f"\t shape of value ([src_len, batch_size, embed_dim]): {value.shape}")
-        print(f"\t shape of W_v ([embed_dim, vdim * num_heads]): {v_proj.weight.shape}")
-        print(f"\t shape of V ([src_len, batch_size, vdim * num_heads]): {v.shape}")
-        print("\t" + "-" * 70)
-        print("\t ***** Note: W_q, W_k and W_v here compute all heads at once, so Q, K and V are each the stacked q, k, v of every head *****")
-
-    tgt_len, bsz, embed_dim = query.size()  # [tgt_len, batch_size, embed_dim]
-    src_len = key.size(0)
-    head_dim = embed_dim // num_heads  # num_heads * head_dim = embed_dim
-    scaling = float(head_dim) ** -0.5
-    q = q * scaling  # [query_len, batch_size, kdim * num_heads]
-
-    if attn_mask is not None:  # [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len]
-        if attn_mask.dim() == 2:
-            attn_mask = attn_mask.unsqueeze(0)  # [1, tgt_len, src_len]
-            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
-                raise RuntimeError('The size of the 2D attn_mask is not correct.')
-        elif attn_mask.dim() == 3:
-            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
-                raise RuntimeError('The size of the 3D attn_mask is not correct.')
-        # attn_mask is now 3D
-
-    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
-    # [batch_size * num_heads, tgt_len, kdim]
-    # All num_heads heads were computed together above, so reshape here for the steps that follow, swapping dimensions 0 and 1 at the same time
-    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)  # [batch_size * num_heads, src_len, kdim]
-    v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)  # [batch_size * num_heads, src_len, vdim]
-    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
-    # [batch_size * num_heads, tgt_len, kdim] x [batch_size * num_heads, kdim, src_len]
-    # = [batch_size * num_heads, tgt_len, src_len]  the attention matrices of the num_heads QK products
-
-    if attn_mask is not None:
-        attn_output_weights += attn_mask  # [batch_size * num_heads, tgt_len, src_len]
-
-    if key_padding_mask is not None:
-        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
-        # reshape to [batch_size, num_heads, tgt_len, src_len]
-        attn_output_weights = attn_output_weights.masked_fill(
-            key_padding_mask.unsqueeze(1).unsqueeze(2),
-            float('-inf'))
-        # key_padding_mask is expanded from [batch_size, src_len] to [batch_size, 1, 1, src_len],
-        # then used to fill attn_output_weights
-        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len,
-                                                       src_len)  # [batch_size * num_heads, tgt_len, src_len]
-
-    attn_output_weights = F.softmax(attn_output_weights, dim=-1)  # [batch_size * num_heads, tgt_len, src_len]
-    attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)
-    attn_output = torch.bmm(attn_output_weights, v)
-    # Z = [batch_size * num_heads, tgt_len, src_len] x [batch_size * num_heads, src_len, vdim]
-    #   = [batch_size * num_heads, tgt_len, vdim]
-    # i.e. the Attention(Q, K, V) result of each of the num_heads heads
-
-    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
-    # first transpose to [tgt_len, batch_size * num_heads, kdim]
-    # then view as [tgt_len, batch_size, num_heads * kdim]
-    attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
-
-    Z = out_proj(attn_output)
-    # linearly combine the individual z's into Z  [tgt_len, batch_size, embed_dim]
-    if is_print_shape:
-        print(f"\t shape of the stacked multi-head output ([tgt_len, batch_size, num_heads*kdim]): {attn_output.shape}")
-        print(f"\t shape of the final projection W_o ([num_heads*vdim, num_heads*vdim]): {out_proj.weight.shape}")
-        print(f"\t shape after the final linear projection ([tgt_len, batch_size, embed_dim]): {Z.shape}")
-    return Z, attn_output_weights.sum(dim=1) / num_heads  # average attention weights over heads
+        # [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len, batch_size, embed_dim]
+        return output
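With patch 1 applied, the split modules can be exercised end to end; generate_square_subsequent_mask produces the additive causal mask the decoder consumes. A minimal sketch under small, illustrative hyperparameters (not the paper defaults):

    import torch
    from model.MyTransformer import MyTransformer

    model = MyTransformer(d_model=32, nhead=4, num_encoder_layers=2,
                          num_decoder_layers=2, dim_feedforward=64)
    src = torch.randn(5, 2, 32)  # [src_len, batch_size, d_model]
    tgt = torch.randn(6, 2, 32)  # [tgt_len, batch_size, d_model]
    tgt_mask = model.generate_square_subsequent_mask(6)
    # for sz=4 the mask looks like:
    # tensor([[0., -inf, -inf, -inf],
    #         [0.,   0., -inf, -inf],
    #         [0.,   0.,   0., -inf],
    #         [0.,   0.,   0.,   0.]])
    out = model(src, tgt, tgt_mask=tgt_mask)
    print(out.shape)  # torch.Size([6, 2, 32])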
From bd8da7c0898b8b65ab4d7f39a22f53297decaedf Mon Sep 17 00:00:00 2001
From: Li Xing
Date: Thu, 22 Feb 2024 15:05:54 +0800
Subject: [PATCH 2/3] Update the torch and spacy versions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md              | 13 ++++++----
 config/config.py       | 22 +++++++++++-----
 model/MyTransformer.py | 10 ++++---
 utils/data_helpers.py  | 59 ++++++++++++++++++++++++++----------------
 4 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index c9359ed..cd01ae2 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,14 @@
 
 ## 1. Environment Setup
 
-* Python==3.x
-* PyTorch==1.5.0
-* torchtext==0.6.0
-* pip install de_core_news_sm-3.0.0.tar.gz
-* pip install en_core_web_sm-3.0.0.tar.gz
+* Python==3.12.2
+* PyTorch==2.2.0
+* torchtext==0.16.2
+
+spacy models:
+de_core_news_sm 3.7.0
+en_core_web_sm 3.7.1
+
 
 ## 2. Usage
 
 * STEP 1. Download or clone this project: https://github.com/moon-hotel/TransformerTranslation

diff --git a/config/config.py b/config/config.py
index 2114549..e243d2f 100644
--- a/config/config.py
+++ b/config/config.py
@@ -1,7 +1,10 @@
+import logging
 import os
+
 import torch
+
 from utils.log_helper import logger_init
-import logging
+
 
 class Config():
     """
@@ -10,12 +13,15 @@ class Config():
 
     def __init__(self):
         # dataset-related settings
-        self.project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        self.project_dir = os.path.dirname(
+            os.path.dirname(os.path.abspath(__file__)))
         self.dataset_dir = os.path.join(self.project_dir, 'data')
         self.train_corpus_file_paths = [os.path.join(self.dataset_dir, 'train.de'),  # encoder input during training
-                                        os.path.join(self.dataset_dir, 'train.en')]  # decoder input during training
+                                        # decoder input during training
+                                        os.path.join(self.dataset_dir, 'train.en')]
         self.val_corpus_file_paths = [os.path.join(self.dataset_dir, 'val.de'),  # encoder input during validation
-                                      os.path.join(self.dataset_dir, 'val.en')]  # decoder input during validation
+                                      # decoder input during validation
+                                      os.path.join(self.dataset_dir, 'val.en')]
         self.test_corpus_file_paths = [os.path.join(self.dataset_dir, 'test_2016_flickr.de'),
                                        os.path.join(self.dataset_dir, 'test_2016_flickr.en')]
         self.min_freq = 1  # when building the vocabulary, filter out tokens whose frequency is below min_freq
@@ -31,7 +37,8 @@ def __init__(self):
         self.beta1 = 0.9
         self.beta2 = 0.98
         self.epsilon = 10e-9
-        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device(
+            'cuda:0' if torch.cuda.is_available() else 'cpu')
         self.epochs = 10
         self.model_save_dir = os.path.join(self.project_dir, 'cache')
         if not os.path.exists(self.model_save_dir):
@@ -39,4 +46,7 @@ def __init__(self):
         # logging
         logger_init(log_file_name='log_train', log_level=logging.INFO,
-                    log_dir=self.model_save_dir)
\ No newline at end of file
+                    log_dir=self.model_save_dir)
+
+    def __repr__(self):
+        return str(self.__dict__)

diff --git a/model/MyTransformer.py b/model/MyTransformer.py
index b603406..f14fdd5 100755
--- a/model/MyTransformer.py
+++ b/model/MyTransformer.py
@@ -6,9 +6,15 @@
 from torch.nn import Parameter
 from torch.nn.init import xavier_uniform_
 
+from .Attention import MyMultiheadAttention
+
 is_print_shape = True
 
 
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+
 class MyTransformer(nn.Module):
     def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                  num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
@@ -164,10 +170,6 @@ def forward(self, src, mask=None, src_key_padding_mask=None):
         return output
 
 
-def _get_clones(module, N):
-    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
-
-
 class MyTransformerDecoderLayer(nn.Module):
     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
         super(MyTransformerDecoderLayer, self).__init__()

diff --git a/utils/data_helpers.py b/utils/data_helpers.py
index 4548b32..7ee4a6a 100755
--- a/utils/data_helpers.py
+++ b/utils/data_helpers.py
@@ -1,10 +1,11 @@
 import logging
 from collections import Counter
-from torchtext.vocab import Vocab
+
 import torch
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader
 from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
 from tqdm import tqdm
 
 
@@ -17,23 +18,28 @@ def my_tokenizer():
 
 def build_vocab(tokenizer, filepath, min_freq=1, specials=None):
     """
-    vocab = Vocab(counter, specials=specials)
-
-    print(vocab.itos)  # returns a list of every token in the vocabulary;
+    print(vocab_obj.get_itos())  # returns a list of every token in the vocabulary;
     # ['<unk>', '<pad>', '<bos>', '<eos>', '.', 'a', 'are', 'A', 'Two', 'in', 'men', ...]
-    print(vocab.itos[2])  # returns the token at the given index;
+    print(vocab_obj.lookup_token(0))  # returns the token at the given index;
 
-    print(vocab.stoi)  # returns a dict mapping each token in the vocabulary to its index;
+    print(list(vocab_obj.get_stoi().items())[:10])  # returns a dict mapping each token to its index;
    # {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3, '.': 4, 'a': 5, 'are': 6, ...}
-    print(vocab.stoi['are'])  # returns the index of the given token
+    print(vocab_obj['are'])  # returns the index of the given token
     """
     if specials is None:
         specials = ['<unk>', '<pad>', '<bos>', '<eos>']
-    counter = Counter()
-    with open(filepath, encoding='utf8') as f:
-        for string_ in f:
-            counter.update(tokenizer(string_))
-    return Vocab(counter, specials=specials, min_freq=min_freq)
+
+    def yield_tokens(filepath):
+        with open(filepath, encoding='utf8') as f:
+            for string_ in f:
+                yield tokenizer(string_)
+
+    vocab_obj = build_vocab_from_iterator(yield_tokens(
+        filepath), specials=specials, min_freq=min_freq)
+
+    vocab_obj.set_default_index(vocab_obj['<unk>'])
+
+    return vocab_obj
 
 
 class LoadEnglishGermanDataset():
@@ -41,8 +47,10 @@ def __init__(self, train_file_paths=None, tokenizer=None,
                  batch_size=2, min_freq=1):
         # build the German and English vocabularies from the training corpus
         self.tokenizer = tokenizer()
-        self.de_vocab = build_vocab(self.tokenizer['de'], filepath=train_file_paths[0], min_freq=min_freq)
-        self.en_vocab = build_vocab(self.tokenizer['en'], filepath=train_file_paths[1], min_freq=min_freq)
+        self.de_vocab = build_vocab(
+            self.tokenizer['de'], filepath=train_file_paths[0], min_freq=min_freq)
+        self.en_vocab = build_vocab(
+            self.tokenizer['en'], filepath=train_file_paths[1], min_freq=min_freq)
         self.specials = ['<unk>', '<pad>', '<bos>', '<eos>']
         self.PAD_IDX = self.de_vocab['<pad>']
         self.BOS_IDX = self.de_vocab['<bos>']
@@ -59,7 +67,7 @@ def data_process(self, filepaths):
         raw_en_iter = iter(open(filepaths[1], encoding="utf8"))
         data = []
         logging.info(f"### Converting dataset {filepaths} into token IDs ")
-        for (raw_de, raw_en) in tqdm(zip(raw_de_iter, raw_en_iter),ncols=80):
+        for (raw_de, raw_en) in tqdm(zip(raw_de_iter, raw_en_iter), ncols=80):
             de_tensor_ = torch.tensor([self.de_vocab[token] for token in
                                        self.tokenizer['de'](raw_de.rstrip("\n"))], dtype=torch.long)
             en_tensor_ = torch.tensor([self.en_vocab[token] for token in
@@ -98,26 +106,33 @@ def generate_batch(self, data_batch):
         for (de_item, en_item) in data_batch:  # process each sample in the batch
             de_batch.append(de_item)  # the encoder input sequence needs no start/end tokens
             # prepend the start token and append the end token to each index sequence
-            en = torch.cat([torch.tensor([self.BOS_IDX]), en_item, torch.tensor([self.EOS_IDX])], dim=0)
+            en = torch.cat([torch.tensor([self.BOS_IDX]), en_item,
+                            torch.tensor([self.EOS_IDX])], dim=0)
             en_batch.append(en)
         # pad to the longest sequence in the batch
-        de_batch = pad_sequence(de_batch, padding_value=self.PAD_IDX)  # [de_len, batch_size]
-        en_batch = pad_sequence(en_batch, padding_value=self.PAD_IDX)  # [en_len, batch_size]
+        # [de_len, batch_size]
+        de_batch = pad_sequence(de_batch, padding_value=self.PAD_IDX)
+        # [en_len, batch_size]
+        en_batch = pad_sequence(en_batch, padding_value=self.PAD_IDX)
         return de_batch, en_batch
 
     def generate_square_subsequent_mask(self, sz, device):
-        mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+        mask = (torch.triu(torch.ones((sz, sz), device=device))
+                == 1).transpose(0, 1)
+        mask = mask.float().masked_fill(mask == 0, float(
+            '-inf')).masked_fill(mask == 1, float(0.0))
         return mask
 
     def create_mask(self, src, tgt, device='cpu'):
         src_seq_len = src.shape[0]
         tgt_seq_len = tgt.shape[0]
 
-        tgt_mask = self.generate_square_subsequent_mask(tgt_seq_len, device)  # [tgt_len, tgt_len]
+        tgt_mask = self.generate_square_subsequent_mask(
+            tgt_seq_len, device)  # [tgt_len, tgt_len]
        # the decoder's attention mask, used to hide the positions after the current one, hence a triangular matrix
-        src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
+        src_mask = torch.zeros((src_seq_len, src_seq_len),
+                               device=device).type(torch.bool)
         # the encoder's attention mask; the encoder does not actually need it, so it is all zeros
 
         src_padding_mask = (src == self.PAD_IDX).transpose(0, 1)
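The torchtext 0.6 Vocab(counter, ...) constructor that build_vocab used before this patch was removed in later releases; build_vocab_from_iterator is its replacement, and set_default_index takes over the old implicit unknown-token fallback. A small self-contained sketch of the new object's behaviour (toy corpus, illustrative only):

    from torchtext.vocab import build_vocab_from_iterator

    corpus = [["two", "men", "are", "here"], ["a", "men", "are", "two"]]
    vocab = build_vocab_from_iterator(iter(corpus), min_freq=1,
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    vocab.set_default_index(vocab['<unk>'])

    print(vocab.get_itos())       # list: index -> token, specials first
    print(vocab.lookup_token(0))  # '<unk>'
    print(vocab['are'])           # index of 'are'
    print(vocab['xyzzy'])         # out-of-vocabulary: falls back to vocab['<unk>']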
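Likewise, a sketch of what generate_batch and create_mask produce together: pad_sequence aligns a batch to its longest sequence, and the padding mask then marks exactly those filled positions (toy values, assuming PAD_IDX = 1 as with the specials ordering above):

    import torch
    from torch.nn.utils.rnn import pad_sequence

    PAD_IDX = 1  # index of '<pad>' given the specials ordering above
    batch = [torch.tensor([4, 9, 7]), torch.tensor([5, 6])]
    padded = pad_sequence(batch, padding_value=PAD_IDX)  # [max_len, batch_size]
    print(padded)
    # tensor([[4, 5],
    #         [9, 6],
    #         [7, 1]])

    # key_padding_mask: True where a position is padding -> [batch_size, max_len]
    print((padded == PAD_IDX).transpose(0, 1))
    # tensor([[False, False, False],
    #         [False, False,  True]])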
From e15973a4853ba158979f93ae831c610be0ef7f7d Mon Sep 17 00:00:00 2001
From: Li Xing
Date: Thu, 22 Feb 2024 15:34:33 +0800
Subject: [PATCH 3/3] Update the vocab operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 translate.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/translate.py b/translate.py
index fc34a6d..08324db 100644
--- a/translate.py
+++ b/translate.py
@@ -1,7 +1,8 @@
+import torch
+
 from config.config import Config
 from model.TranslationModel import TranslationModel
 from utils.data_helpers import LoadEnglishGermanDataset, my_tokenizer
-import torch
 
 
 def greedy_decode(model, src, max_len, start_symbol, config, data_loader):
@@ -17,7 +18,8 @@ def greedy_decode(model, src, max_len, start_symbol, config, data_loader):
         # out[:,1] shape: [1, embed_dim], prob shape: [1, tgt_vocab_size]
         _, next_word = torch.max(prob, dim=1)  # pick the most probable token
         next_word = next_word.item()
-        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
+        ys = torch.cat([ys, torch.ones(1, 1).type_as(
+            src.data).fill_(next_word)], dim=0)
         # stack the current prediction onto all previous outputs and feed them back in to predict the next token
         if next_word == data_loader.EOS_IDX:  # stop decoding once the end-of-sequence token is predicted
             break
@@ -29,14 +31,15 @@ def translate(model, src, data_loader, config):
     tgt_vocab = data_loader.en_vocab
     src_tokenizer = data_loader.tokenizer['de']
     model.eval()
-    tokens = [src_vocab.stoi[tok] for tok in src_tokenizer(src)]  # build a single sample
+    tokens = [src_vocab[tok] for tok in src_tokenizer(src)]  # build a single sample
     num_tokens = len(tokens)
     src = (torch.LongTensor(tokens).reshape(num_tokens, 1))  # make src_len the first dimension
     with torch.no_grad():
         tgt_tokens = greedy_decode(model, src, max_len=num_tokens + 5,
                                    start_symbol=data_loader.BOS_IDX, config=config,
-                                   data_loader=data_loader).flatten()  # the decoded prediction
-    return " ".join([tgt_vocab.itos[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
+                                   # the decoded prediction
+                                   data_loader=data_loader).flatten()
+    return " ".join([tgt_vocab.get_itos()[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
 
 
 def translate_german_to_english(srcs, config):
@@ -45,7 +48,8 @@ def translate_german_to_english(srcs, config):
                                          tokenizer=my_tokenizer,
                                          min_freq=config.min_freq)
     translation_model = TranslationModel(src_vocab_size=len(data_loader.de_vocab),
-                                         tgt_vocab_size=len(data_loader.en_vocab),
+                                         tgt_vocab_size=len(
+                                             data_loader.en_vocab),
                                          d_model=config.d_model,
                                          nhead=config.num_head,
                                          num_encoder_layers=config.num_encoder_layers,
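Patch 3 only touches the vocab API calls inside greedy_decode and translate, but the control flow they implement is worth restating: encode the source once, then repeatedly decode the current prefix, take the argmax, and append until the end token appears. A minimal, model-agnostic sketch of the same loop (encode_fn and decode_step_fn are hypothetical stand-ins for the model calls, which this patch does not change):

    import torch

    # encode_fn(src) -> memory; decode_step_fn(ys, memory) -> [1, tgt_vocab_size] logits
    def greedy_sketch(encode_fn, decode_step_fn, src, max_len, bos_idx, eos_idx):
        memory = encode_fn(src)  # encode the source sentence once
        ys = torch.full((1, 1), bos_idx, dtype=torch.long)  # running target prefix
        for _ in range(max_len - 1):
            logits = decode_step_fn(ys, memory)    # scores for the next position
            next_word = int(logits.argmax(dim=1))  # greedy choice
            ys = torch.cat(
                [ys, torch.full((1, 1), next_word, dtype=torch.long)], dim=0)
            if next_word == eos_idx:               # stop at the end-of-sequence token
                break
        return ys.flatten()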