From d1279878f8f03848a81ab37f573097a3a2015102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A0=E5=BF=97=E5=87=8C?= Date: Tue, 2 Aug 2022 17:00:43 +0800 Subject: [PATCH 1/3] add gitignore --- .gitignore | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6769e21 --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file From 897ce9f186b4a07c6939a7f364909f9a83fc004c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A0=E5=BF=97=E5=87=8C?= Date: Tue, 2 Aug 2022 17:01:11 +0800 Subject: [PATCH 2/3] add roformer-sim-ft --- bert4vec/bert4vec.py | 8 +- examples/compare_models.ipynb | 134 ++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 examples/compare_models.ipynb diff --git a/bert4vec/bert4vec.py b/bert4vec/bert4vec.py index 852d504..a161b20 100644 --- a/bert4vec/bert4vec.py +++ b/bert4vec/bert4vec.py @@ -84,17 +84,21 @@ def __init__(self, mode="paraphrase-multilingual-minilm", model_name_or_path="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" """ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - assert mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small", "paraphrase-multilingual-minilm"] + assert mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small", "roformer-sim-ft-small", "roformer-sim-ft-base", "paraphrase-multilingual-minilm"] self.mode = mode if mode == "simbert-base": if not os.path.isdir(model_name_or_path): model_name_or_path = "WangZeJun/simbert-base-chinese" self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path) self.model = BertModel.from_pretrained(model_name_or_path) - elif mode in ["roformer-sim-base", "roformer-sim-small"]: + elif "roformer" in mode: if not os.path.isdir(model_name_or_path): if mode == "roformer-sim-base": model_name_or_path = "WangZeJun/roformer-sim-base-chinese" + if mode == "roformer-sim-ft-small": + model_name_or_path = "blmoistawinde/roformer-sim-ft-small-chinese" + if mode == "roformer-sim-ft-base": + model_name_or_path = "blmoistawinde/roformer-sim-ft-base-chinese" else: model_name_or_path = "WangZeJun/roformer-sim-small-chinese" diff --git a/examples/compare_models.ipynb b/examples/compare_models.ipynb new file mode 100644 index 0000000..09bf71d --- /dev/null +++ b/examples/compare_models.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from bert4vec import Bert4Vec" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sent1 = ['今天天气不错', '今天天气不错', '我喜欢北京', '我喜欢北京', \n", + " '电影不错', '电影不错', '红色的苹果', '给我推荐一款红色的车', '给我推荐一款红色的车', '给我推荐一款红色的车']\n", + "sent2 = ['今天天气很好', '今天天气不好', '我很喜欢北京', '我不喜欢北京', \n", + " '电影很好', '电影不好', '绿色的苹果', '给我推荐一款黑色的车', '推荐一辆红车', '麻烦来一辆红车']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用标准的roformer-sim" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\WangZeJun_roformer-sim-small-chinese were not used when initializing RoFormerModel: ['pooler.dense.bias', 'pooler.dense.weight']\n", + "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.90218985 0.9218414 0.9741074 0.88859826 0.87840664 0.8916058\n", + " 0.79851764 0.781005 0.8613388 0.68186337]\n" + ] + } + ], + "source": [ + "model = Bert4Vec(mode='roformer-sim-base') \n", + "similarity = model.similarity(sent1, sent2, return_matrix=False)\n", + "print(similarity)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用[finetune过后的模型](https://kexue.fm/archives/8541)比较相似度,模型确实能体现出更符合普通常规认知的相似度打分,比如加了“不”字后相似度明显降低" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using a model of type bert to instantiate a model of type roformer. This is not supported for all configurations of models and can yield errors.\n", + "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese were not used when initializing RoFormerModel: ['embeddings.position_ids', 'pooler.dense.weight', 'embeddings.position_embeddings.weight', 'pooler.dense.bias']\n", + "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of RoFormerModel were not initialized from the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese and are newly initialized: ['encoder.embed_positions.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.96080494 0.71603286 0.9829726 0.6466826 0.9385048 0.7281149\n", + " 0.863762 0.9140622 0.9729547 0.8636934 ]\n" + ] + } + ], + "source": [ + "model = Bert4Vec(mode='roformer-sim-ft-base') \n", + "similarity = model.similarity(sent1, sent2, return_matrix=False)\n", + "print(similarity)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.12 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "228fb4fdf50c48aaed230367f3c11053346245306bfaa9a8a7dbf785baa18430" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7a19ad441070396ec393b5f7add65cda0b978804 Mon Sep 17 00:00:00 2001 From: blmoistawinde <1840962220@qq.com> Date: Mon, 8 Aug 2022 15:18:21 +0800 Subject: [PATCH 3/3] add new conversion script and fix bugs --- bert4vec/bert4vec.py | 18 +++-- examples/compare_models.ipynb | 33 ++------- ...r_sim_original_tf_checkpoint_to_pytorch.py | 73 +++++++++++++++++++ 3 files changed, 89 insertions(+), 35 deletions(-) create mode 100644 examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py diff --git a/bert4vec/bert4vec.py b/bert4vec/bert4vec.py index a161b20..2369774 100644 --- a/bert4vec/bert4vec.py +++ b/bert4vec/bert4vec.py @@ -26,11 +26,15 @@ def __init__(self, model_path: str): model_file = os.path.join(model_path, "pytorch_model.bin") assert os.path.isfile(model_file) params_dict = torch.load(model_file) - pooler_weight = params_dict["pooler.dense.weight"] - pooler_bias = params_dict["pooler.dense.bias"] + try: + pooler_weight = params_dict["pooler.dense.weight"] + pooler_bias = params_dict["pooler.dense.bias"] + except: + # model with new conversion script convert_roformer_sim_original_tf_checkpoint_to_pytorch.py + pooler_weight = params_dict["roformer.pooler.weight"] + pooler_bias = params_dict["roformer.pooler.bias"] del params_dict self.pooler = nn.Linear(pooler_weight.shape[0], pooler_weight.shape[0]) - self.activation = nn.Tanh() self.pooler.weight.data = pooler_weight self.pooler.bias.data = pooler_bias @@ -64,7 +68,6 @@ def forward( sequence_output = outputs[0] cls_output = sequence_output[:, 0, :] pooled_output = self.pooler(cls_output) - pooled_output = self.activation(pooled_output) return (sequence_output, pooled_output) + outputs[1:] class Bert4Vec(object): @@ -95,13 +98,12 @@ def __init__(self, if not os.path.isdir(model_name_or_path): if mode == "roformer-sim-base": model_name_or_path = "WangZeJun/roformer-sim-base-chinese" - if mode == "roformer-sim-ft-small": + elif mode == "roformer-sim-ft-small": model_name_or_path = "blmoistawinde/roformer-sim-ft-small-chinese" - if mode == "roformer-sim-ft-base": + elif mode == "roformer-sim-ft-base": model_name_or_path = "blmoistawinde/roformer-sim-ft-base-chinese" else: model_name_or_path = "WangZeJun/roformer-sim-small-chinese" - try: from torch.hub import _get_torch_home torch_cache_home = _get_torch_home() @@ -164,7 +166,7 @@ def encode(self, return_tensors="pt" ).to(self.device) outputs = self.model(**inputs) - if self.mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small"]: + if self.mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small", "roformer-sim-ft-base", "roformer-sim-ft-small"]: embeddings = outputs[1] else: embeddings = self.mean_pooling(outputs[0], inputs["attention_mask"]) diff --git a/examples/compare_models.ipynb b/examples/compare_models.ipynb index 09bf71d..6b0cb39 100644 --- a/examples/compare_models.ipynb +++ b/examples/compare_models.ipynb @@ -30,24 +30,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\WangZeJun_roformer-sim-small-chinese were not used when initializing RoFormerModel: ['pooler.dense.bias', 'pooler.dense.weight']\n", - "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "[0.90218985 0.9218414 0.9741074 0.88859826 0.87840664 0.8916058\n", - " 0.79851764 0.781005 0.8613388 0.68186337]\n" + "[0.89441085 0.92176294 0.9760171 0.9027498 0.88794875 0.89445156\n", + " 0.77999914 0.7980834 0.8914307 0.6808028 ]\n" ] } ], @@ -66,27 +57,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You are using a model of type bert to instantiate a model of type roformer. This is not supported for all configurations of models and can yield errors.\n", - "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese were not used when initializing RoFormerModel: ['embeddings.position_ids', 'pooler.dense.weight', 'embeddings.position_embeddings.weight', 'pooler.dense.bias']\n", - "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of RoFormerModel were not initialized from the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese and are newly initialized: ['encoder.embed_positions.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "[0.96080494 0.71603286 0.9829726 0.6466826 0.9385048 0.7281149\n", - " 0.863762 0.9140622 0.9729547 0.8636934 ]\n" + "[0.97699356 0.62355804 0.99210703 0.5291083 0.9676273 0.6313111\n", + " 0.6974415 0.7191807 0.9866393 0.9460245 ]\n" ] } ], diff --git a/examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py b/examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000..d012599 --- /dev/null +++ b/examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,73 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert RoFormer checkpoint.""" + + +import argparse + +import torch +from torch import nn +from transformers import RoFormerConfig, RoFormerForMaskedLM, RoFormerModel, load_tf_weights_in_roformer +from transformers.models.roformer.modeling_roformer import RoFormerOnlyMLMHead +from transformers.utils import logging + + +logging.set_verbosity_info() + +class RoFormerModelWithPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.roformer = RoFormerModel(config) + self.cls = RoFormerOnlyMLMHead(config) + self.roformer.pooler = nn.Linear(config.hidden_size, config.hidden_size) + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = RoFormerConfig.from_json_file(bert_config_file) + print(f"Building PyTorch model from configuration: {config}") + model = RoFormerModelWithPooler(config) + + # Load weights from tf checkpoint + load_tf_weights_in_roformer(model, config, tf_checkpoint_path) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + torch.save(model.state_dict(), pytorch_dump_path, _use_new_zipfile_serialization=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--bert_config_file", + default=None, + type=str, + required=True, + help=( + "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture." + ), + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) + +# python convert_roformer_sim_original_tf_checkpoint_to_pytorch.py --tf_checkpoint_path "./chinese_roformer-sim-char-ft_L-6_H-384_A-6/bert_model.ckpt" --bert_config_file "./chinese_roformer-sim-char-ft_L-6_H-384_A-6/bert_config.json" --pytorch_dump_path "./chinese_roformer-sim-char-ft_L-6_H-384_A-6/pytorch_model.bin" +# python convert_roformer_sim_original_tf_checkpoint_to_pytorch.py --tf_checkpoint_path "./chinese_roformer-sim-char-ft_L-12_H-768_A-12/bert_model.ckpt" --bert_config_file "./chinese_roformer-sim-char-ft_L-12_H-768_A-12/bert_config.json" --pytorch_dump_path "./chinese_roformer-sim-char-ft_L-12_H-768_A-12/pytorch_model.bin" \ No newline at end of file