From d1279878f8f03848a81ab37f573097a3a2015102 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=AB=A0=E5=BF=97=E5=87=8C?= <zhiling.zhang@mihoyo.com>
Date: Tue, 2 Aug 2022 17:00:43 +0800
Subject: [PATCH 1/3] add gitignore

---
 .gitignore | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6769e21
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file

From 897ce9f186b4a07c6939a7f364909f9a83fc004c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=AB=A0=E5=BF=97=E5=87=8C?= <zhiling.zhang@mihoyo.com>
Date: Tue, 2 Aug 2022 17:01:11 +0800
Subject: [PATCH 2/3] add roformer-sim-ft

---
 bert4vec/bert4vec.py          |   8 +-
 examples/compare_models.ipynb | 134 ++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 examples/compare_models.ipynb

diff --git a/bert4vec/bert4vec.py b/bert4vec/bert4vec.py
index 852d504..a161b20 100644
--- a/bert4vec/bert4vec.py
+++ b/bert4vec/bert4vec.py
@@ -84,17 +84,21 @@ def __init__(self,
                                      mode="paraphrase-multilingual-minilm", model_name_or_path="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
         """
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        assert mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small", "paraphrase-multilingual-minilm"]
+        assert mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small", "roformer-sim-ft-small", "roformer-sim-ft-base", "paraphrase-multilingual-minilm"]
         self.mode = mode
         if mode == "simbert-base":
             if not os.path.isdir(model_name_or_path):
                 model_name_or_path = "WangZeJun/simbert-base-chinese"
             self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
             self.model = BertModel.from_pretrained(model_name_or_path)
-        elif mode in ["roformer-sim-base", "roformer-sim-small"]:
+        elif "roformer" in mode:
             if not os.path.isdir(model_name_or_path):
                 if mode == "roformer-sim-base":
                     model_name_or_path = "WangZeJun/roformer-sim-base-chinese"
+                if mode == "roformer-sim-ft-small":
+                    model_name_or_path = "blmoistawinde/roformer-sim-ft-small-chinese"
+                if mode == "roformer-sim-ft-base":
+                    model_name_or_path = "blmoistawinde/roformer-sim-ft-base-chinese"
                 else:
                     model_name_or_path = "WangZeJun/roformer-sim-small-chinese"
                 
diff --git a/examples/compare_models.ipynb b/examples/compare_models.ipynb
new file mode 100644
index 0000000..09bf71d
--- /dev/null
+++ b/examples/compare_models.ipynb
@@ -0,0 +1,134 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bert4vec import Bert4Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sent1 = ['今天天气不错', '今天天气不错', '我喜欢北京', '我喜欢北京', \n",
+    "         '电影不错', '电影不错', '红色的苹果', '给我推荐一款红色的车', '给我推荐一款红色的车', '给我推荐一款红色的车']\n",
+    "sent2 = ['今天天气很好', '今天天气不好', '我很喜欢北京', '我不喜欢北京', \n",
+    "         '电影很好', '电影不好', '绿色的苹果', '给我推荐一款黑色的车', '推荐一辆红车', '麻烦来一辆红车']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "使用标准的roformer-sim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\WangZeJun_roformer-sim-small-chinese were not used when initializing RoFormerModel: ['pooler.dense.bias', 'pooler.dense.weight']\n",
+      "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0.90218985 0.9218414  0.9741074  0.88859826 0.87840664 0.8916058\n",
+      " 0.79851764 0.781005   0.8613388  0.68186337]\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = Bert4Vec(mode='roformer-sim-base') \n",
+    "similarity = model.similarity(sent1, sent2, return_matrix=False)\n",
+    "print(similarity)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "使用[finetune过后的模型](https://kexue.fm/archives/8541)比较相似度，模型确实能体现出更符合普通常规认知的相似度打分，比如加了“不”字后相似度明显降低"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You are using a model of type bert to instantiate a model of type roformer. This is not supported for all configurations of models and can yield errors.\n",
+      "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese were not used when initializing RoFormerModel: ['embeddings.position_ids', 'pooler.dense.weight', 'embeddings.position_embeddings.weight', 'pooler.dense.bias']\n",
+      "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of RoFormerModel were not initialized from the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese and are newly initialized: ['encoder.embed_positions.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0.96080494 0.71603286 0.9829726  0.6466826  0.9385048  0.7281149\n",
+      " 0.863762   0.9140622  0.9729547  0.8636934 ]\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = Bert4Vec(mode='roformer-sim-ft-base') \n",
+    "similarity = model.similarity(sent1, sent2, return_matrix=False)\n",
+    "print(similarity)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "228fb4fdf50c48aaed230367f3c11053346245306bfaa9a8a7dbf785baa18430"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 7a19ad441070396ec393b5f7add65cda0b978804 Mon Sep 17 00:00:00 2001
From: blmoistawinde <1840962220@qq.com>
Date: Mon, 8 Aug 2022 15:18:21 +0800
Subject: [PATCH 3/3] add new conversion script and fix bugs

---
 bert4vec/bert4vec.py                          | 18 +++--
 examples/compare_models.ipynb                 | 33 ++-------
 ...r_sim_original_tf_checkpoint_to_pytorch.py | 73 +++++++++++++++++++
 3 files changed, 89 insertions(+), 35 deletions(-)
 create mode 100644 examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py

diff --git a/bert4vec/bert4vec.py b/bert4vec/bert4vec.py
index a161b20..2369774 100644
--- a/bert4vec/bert4vec.py
+++ b/bert4vec/bert4vec.py
@@ -26,11 +26,15 @@ def __init__(self, model_path: str):
         model_file = os.path.join(model_path, "pytorch_model.bin")
         assert os.path.isfile(model_file)
         params_dict = torch.load(model_file)
-        pooler_weight = params_dict["pooler.dense.weight"]
-        pooler_bias = params_dict["pooler.dense.bias"]
+        try:
+            pooler_weight = params_dict["pooler.dense.weight"]
+            pooler_bias = params_dict["pooler.dense.bias"]
+        except KeyError:
+            # Checkpoints written by convert_roformer_sim_original_tf_checkpoint_to_pytorch.py store the pooler under these keys instead.
+            pooler_weight = params_dict["roformer.pooler.weight"]
+            pooler_bias = params_dict["roformer.pooler.bias"]
         del params_dict
         self.pooler = nn.Linear(pooler_weight.shape[0], pooler_weight.shape[0])
-        self.activation = nn.Tanh()
         self.pooler.weight.data = pooler_weight
         self.pooler.bias.data = pooler_bias
     
@@ -64,7 +68,6 @@ def forward(
         sequence_output = outputs[0]
         cls_output = sequence_output[:, 0, :]
         pooled_output = self.pooler(cls_output)
-        pooled_output = self.activation(pooled_output)
         return (sequence_output, pooled_output) + outputs[1:]
 
 class Bert4Vec(object):
@@ -95,13 +98,12 @@ def __init__(self,
             if not os.path.isdir(model_name_or_path):
                 if mode == "roformer-sim-base":
                     model_name_or_path = "WangZeJun/roformer-sim-base-chinese"
-                if mode == "roformer-sim-ft-small":
+                elif mode == "roformer-sim-ft-small":
                     model_name_or_path = "blmoistawinde/roformer-sim-ft-small-chinese"
-                if mode == "roformer-sim-ft-base":
+                elif mode == "roformer-sim-ft-base":
                     model_name_or_path = "blmoistawinde/roformer-sim-ft-base-chinese"
                 else:
                     model_name_or_path = "WangZeJun/roformer-sim-small-chinese"
-                
                 try:
                     from torch.hub import _get_torch_home
                     torch_cache_home = _get_torch_home()
@@ -164,7 +166,7 @@ def encode(self,
                     return_tensors="pt"
                 ).to(self.device)
                 outputs = self.model(**inputs)
-                if self.mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small"]:
+                if self.mode in ["simbert-base", "roformer-sim-base", "roformer-sim-small", "roformer-sim-ft-base", "roformer-sim-ft-small"]:
                     embeddings = outputs[1]
                 else:
                     embeddings = self.mean_pooling(outputs[0], inputs["attention_mask"])
diff --git a/examples/compare_models.ipynb b/examples/compare_models.ipynb
index 09bf71d..6b0cb39 100644
--- a/examples/compare_models.ipynb
+++ b/examples/compare_models.ipynb
@@ -30,24 +30,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\WangZeJun_roformer-sim-small-chinese were not used when initializing RoFormerModel: ['pooler.dense.bias', 'pooler.dense.weight']\n",
-      "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-      "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[0.90218985 0.9218414  0.9741074  0.88859826 0.87840664 0.8916058\n",
-      " 0.79851764 0.781005   0.8613388  0.68186337]\n"
+      "[0.89441085 0.92176294 0.9760171  0.9027498  0.88794875 0.89445156\n",
+      " 0.77999914 0.7980834  0.8914307  0.6808028 ]\n"
      ]
     }
    ],
@@ -66,27 +57,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "You are using a model of type bert to instantiate a model of type roformer. This is not supported for all configurations of models and can yield errors.\n",
-      "Some weights of the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese were not used when initializing RoFormerModel: ['embeddings.position_ids', 'pooler.dense.weight', 'embeddings.position_embeddings.weight', 'pooler.dense.bias']\n",
-      "- This IS expected if you are initializing RoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-      "- This IS NOT expected if you are initializing RoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-      "Some weights of RoFormerModel were not initialized from the model checkpoint at C:\\Users\\zhiling.zhang/.cache\\torch\\bert4vec\\blmoistawinde_roformer-sim-ft-base-chinese and are newly initialized: ['encoder.embed_positions.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[0.96080494 0.71603286 0.9829726  0.6466826  0.9385048  0.7281149\n",
-      " 0.863762   0.9140622  0.9729547  0.8636934 ]\n"
+      "[0.97699356 0.62355804 0.99210703 0.5291083  0.9676273  0.6313111\n",
+      " 0.6974415  0.7191807  0.9866393  0.9460245 ]\n"
      ]
     }
    ],
diff --git a/examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py b/examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py
new file mode 100644
index 0000000..d012599
--- /dev/null
+++ b/examples/convert_roformer_sim_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,73 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RoFormer checkpoint."""
+
+
+import argparse
+
+import torch
+from torch import nn
+from transformers import RoFormerConfig, RoFormerForMaskedLM, RoFormerModel, load_tf_weights_in_roformer
+from transformers.models.roformer.modeling_roformer import RoFormerOnlyMLMHead
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+
+class RoFormerModelWithPooler(nn.Module):  # container of encoder + MLM head + linear pooler; defines no forward()
+    def __init__(self, config):  # config is handed to both sub-modules and read for hidden_size
+        super().__init__()
+        self.roformer = RoFormerModel(config)
+        self.cls = RoFormerOnlyMLMHead(config)  # NOTE(review): presumably kept so the checkpoint's MLM-head variables have a target -- confirm
+        self.roformer.pooler = nn.Linear(config.hidden_size, config.hidden_size)  # registered on the encoder, so it is saved as roformer.pooler.weight / roformer.pooler.bias
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
+    """Build a RoFormerModelWithPooler from the JSON config, load the TF checkpoint into it, and save its state dict."""
+    config = RoFormerConfig.from_json_file(bert_config_file)
+    print(f"Building PyTorch model from configuration: {config}")
+    model = RoFormerModelWithPooler(config)
+
+    # Copy the variables found at tf_checkpoint_path into the freshly built model
+    load_tf_weights_in_roformer(model, config, tf_checkpoint_path)
+
+    # Save only the state dict (not the module object), with the new zipfile serialization turned off
+    print(f"Save PyTorch model to {pytorch_dump_path}")
+    torch.save(model.state_dict(), pytorch_dump_path, _use_new_zipfile_serialization=False)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # All three options are declared required=True; their values are forwarded, in order, to the converter below.
+    parser.add_argument(
+        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
+    )
+    parser.add_argument(
+        "--bert_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help=(
+            "The config json file corresponding to the pre-trained BERT model. \n"
+            "This specifies the model architecture."
+        ),
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
+
+# python convert_roformer_sim_original_tf_checkpoint_to_pytorch.py --tf_checkpoint_path "./chinese_roformer-sim-char-ft_L-6_H-384_A-6/bert_model.ckpt" --bert_config_file "./chinese_roformer-sim-char-ft_L-6_H-384_A-6/bert_config.json" --pytorch_dump_path "./chinese_roformer-sim-char-ft_L-6_H-384_A-6/pytorch_model.bin"
+# python convert_roformer_sim_original_tf_checkpoint_to_pytorch.py --tf_checkpoint_path "./chinese_roformer-sim-char-ft_L-12_H-768_A-12/bert_model.ckpt" --bert_config_file "./chinese_roformer-sim-char-ft_L-12_H-768_A-12/bert_config.json" --pytorch_dump_path "./chinese_roformer-sim-char-ft_L-12_H-768_A-12/pytorch_model.bin"
\ No newline at end of file