bug(medcat): CU-869ckx6dr Allow for better supervised training (#374)

mart-r · github-actions[bot] · web-flow · commit 7c12cae87e1e · 2026-04-01T12:02:43.000+01:00
* CU-869ckx6dr: Add extra test to trainer to make sure it tests on multiple projects

* CU-869ckx6dr: Add new method for reuse of entities when getting based on tokens

* CU-869ckx6dr: Add simple test for entity persistence in document

* CU-869ckx6dr: Small addition to test

* CU-869ckx6dr: Prepare document with appropriate entities at training time

* CU-869ckx6dr: Update tests to work with new setup

* CU-869ckx6dr: Add a new test for entities in add_and_train_concept.

* CU-869ckx6dr: Add deprecation warning to old / unused entity_from_tokens method in pipe

* CU-869ckx6dr: Add deprecation warning to old / unused entity_from_tokens method in tokenizers

* CU-869ckx6dr: Deprecate unused method on a protocol level as well

* CU-869ckx6dr: Fix linting issue

* CU-869ckx6dr: Fix minor issues in test-time supervised training data

* CU-869ckx6dr: Add new test for order of training examples

* CU-869ckx6dr: Minor changes to trainer tests

* CU-869ckx6dr: Allow a little longer for the relcat tutorial to run

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/.github/workflows/medcat-v2-tutorials_main.yml b/.github/workflows/medcat-v2-tutorials_main.yml
@@ -83,4 +83,4 @@ jobs:
       - name: Smoke test tutorial
         run: |
           pytest --capture=no --collect-only --nbmake ${{ matrix.part }}
-          pytest --capture=no --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=1800 ${{ matrix.part }}
+          pytest --capture=no --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=2400 ${{ matrix.part }}
diff --git a/medcat-v2/medcat/pipeline/pipeline.py b/medcat-v2/medcat/pipeline/pipeline.py
@@ -1,6 +1,7 @@
 from typing import Optional, Iterable, Union
 import logging
 import os
+import warnings
 
 from medcat.utils.defaults import COMPONENTS_FOLDER
 from medcat.tokenizing.tokenizers import BaseTokenizer, create_tokenizer
@@ -43,8 +44,19 @@ def create_entity(self, doc: MutableDocument,
             doc, token_start_index, token_end_index, label)
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
+        """Deprecated: use `entity_from_tokens_in_doc` instead.
+
+        Emits a DeprecationWarning and then delegates to the
+        underlying tokenizer's `entity_from_tokens`.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
+        # NOTE: adjacent string literals are concatenated verbatim, so each
+        #       fragment needs its own separating space.
+        warnings.warn(
+            "The `medcat.pipeline.pipeline.entity_from_tokens` method is"
+            " deprecated and subject to removal in a future release. Please "
+            "use `medcat.pipeline.pipeline.entity_from_tokens_in_doc` instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         return self.tokenizer.entity_from_tokens(tokens)
 
+    def entity_from_tokens_in_doc(
+            self, tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity:
+        """Get the entity for the given tokens in the given document.
+
+        Delegates to the underlying tokenizer's
+        `entity_from_tokens_in_doc`.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+            doc (MutableDocument): The document for these tokens.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
+        return self.tokenizer.entity_from_tokens_in_doc(tokens, doc)
+
     def __call__(self, text: str) -> MutableDocument:
         doc = self.tokenizer(text)
         for comp in self.components:
@@ -342,6 +354,23 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         """
         return self._tokenizer.entity_from_tokens(tokens)
 
+    def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
+                                  doc: MutableDocument) -> MutableEntity:
+        """Get the entity from the list of tokens in a document.
+
+        This effectively turns a list of (consecutive) tokens
+        into an entity. But it is also designed to reuse existing
+        instances on the document instead of creating new ones.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+            doc (MutableDocument): The document for these tokens.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
+        return self._tokenizer.entity_from_tokens_in_doc(tokens, doc)
+
     def get_component(self, ctype: CoreComponentType) -> CoreComponent:
         """Get the core component by the component type.
 
diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -1,6 +1,7 @@
 import re
 from typing import cast, Optional, Iterator, overload, Union, Any, Type
 from collections import defaultdict
+import warnings
 
 from medcat.tokenizing.tokens import (
     BaseToken, BaseEntity, BaseDocument,
@@ -340,13 +341,38 @@ def create_entity(self, doc: MutableDocument,
         # return Entity(span)
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
+        """Deprecated: use `entity_from_tokens_in_doc` instead.
+
+        Builds an entity spanning from the first to the last of the
+        given tokens, using the document the first token belongs to.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+
+        Raises:
+            ValueError: If the list of tokens is empty.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
+        # NOTE: adjacent string literals are concatenated verbatim, so each
+        #       fragment needs its own separating space.
+        warnings.warn(
+            "The `medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens` method is"
+            " deprecated and subject to removal in a future release. Please use "
+            "`medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens_in_doc` "
+            "instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         if not tokens:
             raise ValueError("Need at least one token for an entity")
         doc = cast(Token, tokens[0])._doc
         start_index = doc._tokens.index(tokens[0])
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
+    def _get_existing_entity(self, tokens: list[MutableToken],
+                             doc: MutableDocument) -> Optional[MutableEntity]:
+        """Find an entity already on the document that spans the tokens.
+
+        Looks through the document's NER entities first and the linked
+        entities second, and returns the first one whose start and end
+        index match the index of the first and last token respectively.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to match.
+            doc (MutableDocument): The document to search.
+
+        Returns:
+            Optional[MutableEntity]: The matching entity, or None if
+                there are no tokens or no entity matches.
+        """
+        if not tokens:
+            return None
+        for ent in doc.ner_ents + doc.linked_ents:
+            # NOTE(review): this assumes `end_index` is inclusive (i.e. the
+            #               index of the entity's last token) - TODO confirm
+            if (ent.base.start_index == tokens[0].base.index and
+                    ent.base.end_index == tokens[-1].base.index):
+                return ent
+        return None
+
+    def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
+                                  doc: MutableDocument) -> MutableEntity:
+        """Get an entity for the tokens, reusing one on the document if any.
+
+        If the document already has an entity (NER or linked) spanning
+        exactly these tokens, that instance is returned. Otherwise a new
+        entity is created from the tokens.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+            doc (MutableDocument): The document for these tokens.
+
+        Raises:
+            ValueError: If the list of tokens is empty.
+
+        Returns:
+            MutableEntity: The existing or newly created entity.
+        """
+        existing_ent = self._get_existing_entity(tokens, doc)
+        # explicit None check so the result does not depend on the
+        # truthiness of the entity object itself
+        if existing_ent is not None:
+            return existing_ent
+        # Build the entity directly instead of going through the deprecated
+        # `entity_from_tokens`, so that users of this (non-deprecated)
+        # method do not trigger a DeprecationWarning.
+        if not tokens:
+            raise ValueError("Need at least one token for an entity")
+        token_doc = cast(Token, tokens[0])._doc
+        start_index = token_doc._tokens.index(tokens[0])
+        end_index = token_doc._tokens.index(tokens[-1])
+        return _entity_from_tokens(token_doc, tokens, start_index, end_index)
+
     def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
         tokens = self.REGEX.finditer(text)
         return list(tokens)
diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py b/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
@@ -3,6 +3,7 @@
 import os
 import shutil
 import logging
+import warnings
 
 import spacy
 from spacy.tokens import Span
@@ -77,13 +78,38 @@ def create_entity(self, doc: MutableDocument,
         return Entity(span)
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
+        """Deprecated: use `entity_from_tokens_in_doc` instead.
+
+        Builds an entity from a spaCy span that runs from the first
+        to the last of the given tokens (inclusive).
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+
+        Raises:
+            ValueError: If the list of tokens is empty.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
+        # NOTE: adjacent string literals are concatenated verbatim, so each
+        #       fragment needs its own separating space.
+        warnings.warn(
+            "The `medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens` method is"
+            " deprecated and subject to removal in a future release. Please use "
+            "`medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens_in_doc` "
+            "instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         if not tokens:
             raise ValueError("Need at least one token for an entity")
         spacy_tokens = cast(list[Token], tokens)
         span = Span(spacy_tokens[0]._delegate.doc, spacy_tokens[0].index,
                     spacy_tokens[-1].index + 1)
         return Entity(span)
 
+    def _get_existing_entity(self, tokens: list[MutableToken],
+                             doc: MutableDocument) -> Optional[MutableEntity]:
+        """Find an entity already on the document that spans the tokens.
+
+        Looks through the document's NER entities first and the linked
+        entities second, and returns the first one whose start and end
+        index match the index of the first and last token respectively.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to match.
+            doc (MutableDocument): The document to search.
+
+        Returns:
+            Optional[MutableEntity]: The matching entity, or None if
+                there are no tokens or no entity matches.
+        """
+        if not tokens:
+            return None
+        for ent in doc.ner_ents + doc.linked_ents:
+            # NOTE(review): this assumes `end_index` is inclusive (i.e. the
+            #               index of the entity's last token) - TODO confirm
+            if (ent.base.start_index == tokens[0].base.index and
+                    ent.base.end_index == tokens[-1].base.index):
+                return ent
+        return None
+
+    def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
+                                  doc: MutableDocument) -> MutableEntity:
+        """Get an entity for the tokens, reusing one on the document if any.
+
+        If the document already has an entity (NER or linked) spanning
+        exactly these tokens, that instance is returned. Otherwise a new
+        entity is created from the tokens.
+
+        Args:
+            tokens (list[MutableToken]): The tokens to use.
+            doc (MutableDocument): The document for these tokens.
+
+        Raises:
+            ValueError: If the list of tokens is empty.
+
+        Returns:
+            MutableEntity: The existing or newly created entity.
+        """
+        existing_ent = self._get_existing_entity(tokens, doc)
+        # explicit None check so the result does not depend on the
+        # truthiness of the entity object itself
+        if existing_ent is not None:
+            return existing_ent
+        # Build the entity directly instead of going through the deprecated
+        # `entity_from_tokens`, so that users of this (non-deprecated)
+        # method do not trigger a DeprecationWarning.
+        if not tokens:
+            raise ValueError("Need at least one token for an entity")
+        spacy_tokens = cast(list[Token], tokens)
+        span = Span(spacy_tokens[0]._delegate.doc, spacy_tokens[0].index,
+                    spacy_tokens[-1].index + 1)
+        return Entity(span)
+
     def __call__(self, text: str) -> MutableDocument:
         if self._avoid_pipe:
             doc = Document(self._nlp.make_doc(text))
diff --git a/medcat-v2/medcat/tokenizing/tokenizers.py b/medcat-v2/medcat/tokenizing/tokenizers.py
@@ -34,15 +34,22 @@ def create_entity(self, doc: MutableDocument,
         pass
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
-        """Get an entity from the list of tokens.
+        """Deprecated: use entity_from_tokens_in_doc instead.
+
+        Get an entity from the list of tokens.
+
+        Args:
+            tokens (list[MutableToken]): List of tokens.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
+        pass
+
+    def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
+                                  doc: MutableDocument) -> MutableEntity:
+        """Get an entity from the list of tokens in the specified document.
+
+        This method is designed to reuse entities where possible.
 
         Args:
             tokens (list[MutableToken]): List of tokens.
+            doc (MutableDocument): The document for these tokens.
 
         Returns:
             MutableEntity: The resulting entity.
         """
-        pass
 
     def __call__(self, text: str) -> MutableDocument:
         pass
diff --git a/medcat-v2/medcat/trainer.py b/medcat-v2/medcat/trainer.py
@@ -11,7 +11,7 @@
 from medcat.utils.data_utils import make_mc_train_test, get_false_positives
 from medcat.utils.filters import project_filters
 from medcat.data.mctexport import (
-    MedCATTrainerExport, MedCATTrainerExportProject,
+    MedCATTrainerExport, MedCATTrainerExportAnnotation, MedCATTrainerExportProject,
     MedCATTrainerExportDocument, count_all_annotations, iter_anns)
 from medcat.preprocessors.cleaners import prepare_name, NameDescriptor
 from medcat.components.types import CoreComponentType, TrainableComponent
@@ -397,6 +397,20 @@ def _train_supervised_for_project(self,
                     docs, current_document, train_from_false_positives,
                     devalue_others)
 
+    def _prepare_doc_with_anns(
+            self, doc: MutableDocument,
+            anns: list[MedCATTrainerExportAnnotation]) -> None:
+        """Replace the document's entities with ones built from annotations.
+
+        For each annotation (in order), the tokens returned by
+        `doc.get_tokens(ann['start'], ann['end'])` are turned into an
+        entity through the pipeline. The resulting entities then replace
+        the contents of both `doc.ner_ents` and `doc.linked_ents`, so
+        the entities end up in the same order as the annotations.
+
+        Args:
+            doc (MutableDocument): The document to modify in place.
+            anns (list[MedCATTrainerExportAnnotation]): The annotations
+                to create the entities from.
+        """
+        ents = []
+        for ann in anns:
+            tkns = doc.get_tokens(ann['start'], ann['end'])
+            # NOTE(review): if no tokens are found for an annotation, this
+            #               presumably raises ValueError from the tokenizer
+            #               rather than skipping the annotation - TODO confirm
+            ents.append(self._pipeline.entity_from_tokens_in_doc(tkns, doc))
+        # set NER ents
+        doc.ner_ents.clear()
+        doc.ner_ents.extend(ents)
+        # duplicate for linked as well, but in a separate list
+        doc.linked_ents.clear()
+        doc.linked_ents.extend(ents)
+
     def _train_supervised_for_project2(self,
                                        docs: list[MedCATTrainerExportDocument],
                                        current_document: int,
@@ -412,17 +426,17 @@ def _train_supervised_for_project2(self,
             with temp_changed_config(self.config.components.linking,
                                      'train', False):
                 mut_doc = self.caller(doc['text'])
+            self._prepare_doc_with_anns(mut_doc, doc['annotations'])
 
             # Compatibility with old output where annotations are a list
-            for ann in doc['annotations']:
+            for ann, mut_entity in zip(doc['annotations'], mut_doc.linked_ents):
                 if ann.get('killed', False):
                     continue
                 logger.info("    Annotation %s (%s) [%d:%d]",
                             ann['value'], ann['cui'], ann['start'], ann['end'])
                 cui = ann['cui']
                 start = ann['start']
                 end = ann['end']
-                mut_entity = mut_doc.get_tokens(start, end)
                 if not mut_entity:
                     logger.warning(
                         "When looking for CUI '%s' (value '%s') [%d...%d] "
diff --git a/medcat-v2/tests/resources/supervised_mct_export.json b/medcat-v2/tests/resources/supervised_mct_export.json
@@ -58,7 +58,7 @@
                         {
                             "cui": "C04",
                             "start": 81,
-                            "end": 87,
+                            "end": 88,
                             "value": "fittest"
                         }
                     ],
@@ -125,7 +125,7 @@
                     "id": "ID-3",
                     "last_modified": "2024-08-21",
                     "name": "Doc#4",
-                    "text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest."
+                    "text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest"
                 }
             ],
             "id": "Project#0",
diff --git a/medcat-v2/tests/test_cat.py b/medcat-v2/tests/test_cat.py
@@ -7,6 +7,7 @@
 from contextlib import contextmanager
 
 from medcat import cat
+from medcat.data.mctexport import count_all_annotations, iter_anns
 from medcat.data.model_card import ModelCard
 from medcat.vocab import Vocab
 from medcat.config import Config
@@ -576,7 +577,7 @@ class CATSupTrainingTests(CATUnsupTrainingTests):
         os.path.dirname(__file__), 'resources', 'supervised_mct_export.json'
     )
     # NOTE: should remain consistent unless we change the model or data
-    EXPECTED_HASH = "7bfe01e8e36eb07d"
+    EXPECTED_HASH = "9c299628c9e6c220"
 
     @classmethod
     def _get_cui_counts(cls) -> dict[str, int]:
@@ -620,6 +621,21 @@ def test_clearing_training_works(self):
             self.assertEqual(self.cat.config.meta.unsup_trained, [])
             self.assertEqual(self.cat.config.meta.sup_trained, [])
 
+    def test_training_happens_in_correct_order(self):
+        """Check that training is called once per annotation, in order.
+
+        The trainer's `add_and_train_concept` is replaced by a mock, so
+        only the `mut_entity` keyword arguments of the calls are inspected.
+        Their character spans must match the annotations of the export
+        when both are iterated in order.
+        """
+        with captured_state_cdb(self.cat.cdb):
+            with unittest.mock.patch.object(
+                    self.cat.trainer, "add_and_train_concept") as mock_add_and_train_concept:
+                self._perform_training()
+        mct_export = self._get_data()
+        called_ents = [
+            args.kwargs['mut_entity'] for args in mock_add_and_train_concept.call_args_list
+        ]
+        # one call is expected for every annotation in the export
+        self.assertEqual(len(called_ents), count_all_annotations(mct_export))
+        for (_, _, ann), ent in zip(iter_anns(mct_export), called_ents):
+            with self.subTest(f"Ann: {ann} vs Ent: {ent}"):
+                self.assertEqual(ann['start'], ent.base.start_char_index)
+                self.assertEqual(ann['end'], ent.base.end_char_index)
+
 
 class CATWithDictNERSupTrainingTests(CATSupTrainingTests):
     from medcat.components.types import CoreComponentType
diff --git a/medcat-v2/tests/test_trainer.py b/medcat-v2/tests/test_trainer.py
diff --git a/medcat-v2/tests/tokenizing/spacy_impl/test_tokenizers.py b/medcat-v2/tests/tokenizing/spacy_impl/test_tokenizers.py

Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@`
`58`	`58`	`{`
`59`	`59`	`"cui": "C04",`
`60`	`60`	`"start": 81,`
`61`		`- "end": 87,`
	`61`	`+ "end": 88,`
`62`	`62`	`"value": "fittest"`
`63`	`63`	`}`
`64`	`64`	`],`
`@@ -125,7 +125,7 @@`
`125`	`125`	`"id": "ID-3",`
`126`	`126`	`"last_modified": "2024-08-21",`
`127`	`127`	`"name": "Doc#4",`
`128`		`- "text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest."`
	`128`	`+ "text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest"`
`129`	`129`	`}`
`130`	`130`	`],`
`131`	`131`	`"id": "Project#0",`