Skip to content
This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 01cdb4c

Browse files
authored
Merge pull request #509 from CogStack/master
Release PR for v1.15.0b1
2 parents ceb74b1 + 00c0dd0 commit 01cdb4c

24 files changed

Lines changed: 411 additions & 108 deletions

.github/workflows/main.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ jobs:
3131
- name: Lint
3232
run: |
3333
flake8 medcat
34+
- name: Pydantic 1 check
35+
# NOTE: the following will look for use of pydantic1-specific .dict() method and .__fields__ attribute
36+
# if there are some (that are not annotated for pydantic1 backwards compatibility) a non-zero exit
37+
# code is returned, which will halt the workflow and print out the offending parts
38+
run: |
39+
grep "\.__fields__" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0
40+
grep "\.dict(" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0
3441
- name: Test
3542
run: |
3643
all_files=$(git ls-files | grep '^tests/.*\.py$' | grep -v '/__init__\.py$' | sed 's/\.py$//' | sed 's/\//./g')
@@ -54,7 +61,6 @@ jobs:
5461
repo: context.repo.repo
5562
});
5663
core.setOutput('latest_version', latestRelease.data.tag_name);
57-
5864
- name: Make sure there's no deprecated methods that should be removed.
5965
# only run this for master -> production PR. I.e just before doing a release.
6066
if: github.event.pull_request.base.ref == 'main' && github.event.pull_request.head.ref == 'production'

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ We have 4 public models available:
1717
1) UMLS Small (A modelpack containing a subset of UMLS (disorders, symptoms, medications...). Trained on MIMIC-III)
1818
2) SNOMED International (Full SNOMED modelpack trained on MIMIC-III)
1919
3) UMLS Dutch v1.10 (a modelpack provided by UMC Utrecht containing [UMLS entities with Dutch names](https://github.com/umcu/dutch-umls) trained on Dutch medical wikipedia articles and a negation detection model [repository](https://github.com/umcu/negation-detection/)/[paper](https://doi.org/10.48550/arxiv.2209.00470) trained on EMC Dutch Clinical Corpus).
20-
4) UMLS Full. >4MM concepts trained self-supervsied on MIMIC-III. v2022AA of UMLS.
20+
4) UMLS Full. >4MM concepts trained self-supervised on MIMIC-III. v2022AA of UMLS.
2121

2222
To download any of these models, please [follow this link](https://uts.nlm.nih.gov/uts/login?service=https://medcat.rosalind.kcl.ac.uk/auth-callback) and sign into your NIH profile / UMLS license. You will then be redirected to the MedCAT model download form. Please complete this form and you will be provided a download link.
2323

install_requires.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@
1919
'xxhash>=3.0.0' # allow later versions, tested with 3.1.0
2020
'blis>=0.7.5,<1.0.0' # allow later versions, tested with 0.7.9, avoid 1.0.0 (depends on numpy 2)
2121
'click>=8.0.4' # allow later versions, tested with 8.1.3
22-
'pydantic>=1.10.0,<2.0' # for spacy compatibility; avoid 2.0 due to breaking changes
22+
'pydantic>=2.0.0,<3.0' # avoid next major release
2323
"humanfriendly~=10.0" # for human readable file / RAM sizes
2424
"peft>=0.8.2"

medcat/cat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ def _print_stats(self,
590590

591591
def _init_ckpts(self, is_resumed, checkpoint):
592592
if self.config.general.checkpoint.steps is not None or checkpoint is not None:
593-
checkpoint_config = CheckpointConfig(**self.config.general.checkpoint.dict())
593+
checkpoint_config = CheckpointConfig(**self.config.general.checkpoint.model_dump())
594594
checkpoint_manager = CheckpointManager('cat_train', checkpoint_config)
595595
if is_resumed:
596596
# TODO: probably remove is_resumed mark and always resume if a checkpoint is provided,

medcat/config.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from datetime import datetime
2-
from pydantic import BaseModel, Extra, ValidationError
3-
from pydantic.fields import ModelField
2+
from pydantic import BaseModel, ValidationError
43
from typing import List, Set, Tuple, cast, Any, Callable, Dict, Optional, Union, Type, Literal
54
from multiprocessing import cpu_count
65
import logging
@@ -125,7 +124,7 @@ def merge_config(self, config_dict: Dict) -> None:
125124
attr = None # new attribute
126125
value = config_dict[key]
127126
if isinstance(value, BaseModel):
128-
value = value.dict()
127+
value = value.model_dump()
129128
if isinstance(attr, MixingConfig):
130129
attr.merge_config(value)
131130
else:
@@ -177,7 +176,7 @@ def rebuild_re(self) -> None:
177176
def _calc_hash(self, hasher: Optional[Hasher] = None) -> Hasher:
178177
if hasher is None:
179178
hasher = Hasher()
180-
for _, v in cast(BaseModel, self).dict().items():
179+
for _, v in cast(BaseModel, self).model_dump().items():
181180
if isinstance(v, MixingConfig):
182181
v._calc_hash(hasher)
183182
else:
@@ -189,7 +188,7 @@ def get_hash(self, hasher: Optional[Hasher] = None):
189188
return hasher.hexdigest()
190189

191190
def __str__(self) -> str:
192-
return str(cast(BaseModel, self).dict())
191+
return str(cast(BaseModel, self).model_dump())
193192

194193
@classmethod
195194
def load(cls, save_path: str) -> "MixingConfig":
@@ -238,15 +237,15 @@ def asdict(self) -> Dict[str, Any]:
238237
Returns:
239238
Dict[str, Any]: The dictionary associated with this config
240239
"""
241-
return cast(BaseModel, self).dict()
240+
return cast(BaseModel, self).model_dump()
242241

243-
def fields(self) -> Dict[str, ModelField]:
242+
def fields(self) -> dict:
244243
"""Get the fields associated with this config.
245244
246245
Returns:
247-
Dict[str, ModelField]: The dictionary of the field names and fields
246+
dict: The dictionary of the field names and fields
248247
"""
249-
return cast(BaseModel, self).__fields__
248+
return cast(BaseModel, self).model_fields
250249

251250

252251
class VersionInfo(MixingConfig, BaseModel):
@@ -272,7 +271,7 @@ class VersionInfo(MixingConfig, BaseModel):
272271
"""Which version of medcat was used to build the CDB"""
273272

274273
class Config:
275-
extra = Extra.allow
274+
extra = 'allow'
276275
validate_assignment = True
277276

278277

@@ -290,7 +289,7 @@ class CDBMaker(MixingConfig, BaseModel):
290289
"""Minimum number of letters required in a name to be accepted for a concept"""
291290

292291
class Config:
293-
extra = Extra.allow
292+
extra = 'allow'
294293
validate_assignment = True
295294

296295

@@ -303,7 +302,7 @@ class AnnotationOutput(MixingConfig, BaseModel):
303302
include_text_in_output: bool = False
304303

305304
class Config:
306-
extra = Extra.allow
305+
extra = 'allow'
307306
validate_assignment = True
308307

309308

@@ -317,7 +316,7 @@ class CheckPoint(MixingConfig, BaseModel):
317316
"""When training the maximum checkpoints will be kept on the disk"""
318317

319318
class Config:
320-
extra = Extra.allow
319+
extra = 'allow'
321320
validate_assignment = True
322321

323322

@@ -354,7 +353,7 @@ class General(MixingConfig, BaseModel):
354353
355354
NB! For these changes to take effect, the pipe would need to be recreated."""
356355
checkpoint: CheckPoint = CheckPoint()
357-
usage_monitor = UsageMonitor()
356+
usage_monitor: UsageMonitor = UsageMonitor()
358357
"""Checkpointing config"""
359358
log_level: int = logging.INFO
360359
"""Logging config for everything | 'tagger' can be disabled, but will cause a drop in performance"""
@@ -395,7 +394,7 @@ class General(MixingConfig, BaseModel):
395394
reliable due to not taking into account all the details of the changes."""
396395

397396
class Config:
398-
extra = Extra.allow
397+
extra = 'allow'
399398
validate_assignment = True
400399

401400

@@ -424,7 +423,7 @@ class Preprocessing(MixingConfig, BaseModel):
424423
NB! For these changes to take effect, the pipe would need to be recreated."""
425424

426425
class Config:
427-
extra = Extra.allow
426+
extra = 'allow'
428427
validate_assignment = True
429428

430429

@@ -444,7 +443,7 @@ class Ner(MixingConfig, BaseModel):
444443
"""Try reverse word order for short concepts (2 words max), e.g. heart disease -> disease heart"""
445444

446445
class Config:
447-
extra = Extra.allow
446+
extra = 'allow'
448447
validate_assignment = True
449448

450449

@@ -579,7 +578,7 @@ class Linking(MixingConfig, BaseModel):
579578
"""If true when the context of a concept is calculated (embedding) the words making that concept are not taken into account"""
580579

581580
class Config:
582-
extra = Extra.allow
581+
extra = 'allow'
583582
validate_assignment = True
584583

585584

@@ -600,7 +599,7 @@ class Config:
600599
# this if for word_skipper and punct_checker which would otherwise
601600
# not have a validator
602601
arbitrary_types_allowed = True
603-
extra = Extra.allow
602+
extra = 'allow'
604603
validate_assignment = True
605604

606605
def __init__(self, *args, **kwargs):
@@ -618,7 +617,7 @@ def rebuild_re(self) -> None:
618617
# Override
619618
def get_hash(self):
620619
hasher = Hasher()
621-
for k, v in self.dict().items():
620+
for k, v in self.model_dump().items():
622621
if k in ['hash', ]:
623622
# ignore hash
624623
continue
@@ -674,4 +673,6 @@ def wrapper(*args, **kwargs):
674673
# we get a nicer exception
675674
_waf_advice = "You can use `cat.cdb.weighted_average_function` to access it directly"
676675
Linking.__getattribute__ = _wrapper(Linking.__getattribute__, Linking, _waf_advice, AttributeError) # type: ignore
676+
if hasattr(Linking, '__getattr__'):
677+
Linking.__getattr__ = _wrapper(Linking.__getattr__, Linking, _waf_advice, AttributeError) # type: ignore
677678
Linking.__getitem__ = _wrapper(Linking.__getitem__, Linking, _waf_advice, KeyError) # type: ignore

medcat/config_meta_cat.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from typing import Dict, Any
2-
from medcat.config import MixingConfig, BaseModel, Optional, Extra
2+
from medcat.config import MixingConfig, BaseModel, Optional
33

44

55
class General(MixingConfig, BaseModel):
@@ -65,7 +65,7 @@ class General(MixingConfig, BaseModel):
6565
Otherwise defaults to doc._.ents or doc.ents per the annotate_overlapping settings"""
6666

6767
class Config:
68-
extra = Extra.allow
68+
extra = 'allow'
6969
validate_assignment = True
7070

7171

@@ -169,7 +169,7 @@ class Model(MixingConfig, BaseModel):
169169
"""If set to True center positions will be ignored when calculating representation"""
170170

171171
class Config:
172-
extra = Extra.allow
172+
extra = 'allow'
173173
validate_assignment = True
174174

175175

@@ -191,7 +191,7 @@ class Train(MixingConfig, BaseModel):
191191
"""If set only this CUIs will be used for training"""
192192
auto_save_model: bool = True
193193
"""Should do model be saved during training for best results"""
194-
last_train_on: Optional[int] = None
194+
last_train_on: Optional[float] = None
195195
"""When was the last training run"""
196196
metric: Dict[str, str] = {'base': 'weighted avg', 'score': 'f1-score'}
197197
"""What metric should be used for choosing the best model"""
@@ -206,7 +206,7 @@ class Train(MixingConfig, BaseModel):
206206
"""Focal Loss hyperparameter - determines importance the loss gives to hard-to-classify examples"""
207207

208208
class Config:
209-
extra = Extra.allow
209+
extra = 'allow'
210210
validate_assignment = True
211211

212212

@@ -217,5 +217,5 @@ class ConfigMetaCAT(MixingConfig, BaseModel):
217217
train: Train = Train()
218218

219219
class Config:
220-
extra = Extra.allow
220+
extra = 'allow'
221221
validate_assignment = True

medcat/config_rel_cat.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import logging
22
from typing import Dict, Any, List
3-
from medcat.config import MixingConfig, BaseModel, Optional, Extra
3+
from medcat.config import MixingConfig, BaseModel, Optional
44

55

66
class General(MixingConfig, BaseModel):
@@ -89,7 +89,7 @@ class Model(MixingConfig, BaseModel):
8989
"""If set to True center positions will be ignored when calculating representation"""
9090

9191
class Config:
92-
extra = Extra.allow
92+
extra = 'allow'
9393
validate_assignment = True
9494

9595

@@ -116,7 +116,7 @@ class Train(MixingConfig, BaseModel):
116116
"""Should the model be saved during training for best results"""
117117

118118
class Config:
119-
extra = Extra.allow
119+
extra = 'allow'
120120
validate_assignment = True
121121

122122

@@ -127,5 +127,5 @@ class ConfigRelCAT(MixingConfig, BaseModel):
127127
train: Train = Train()
128128

129129
class Config:
130-
extra = Extra.allow
130+
extra = 'allow'
131131
validate_assignment = True

medcat/config_transformers_ner.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from medcat.config import MixingConfig, BaseModel, Optional, Extra
1+
from medcat.config import MixingConfig, BaseModel, Optional
22

33

44
class General(MixingConfig, BaseModel):
@@ -16,11 +16,11 @@ class General(MixingConfig, BaseModel):
1616
chunking_overlap_window: Optional[int] = 5
1717
"""Size of the overlap window used for chunking"""
1818
test_size: float = 0.2
19-
last_train_on: Optional[int] = None
19+
last_train_on: Optional[float] = None
2020
verbose_metrics: bool = False
2121

2222
class Config:
23-
extra = Extra.allow
23+
extra = 'allow'
2424
validate_assignment = True
2525

2626

@@ -29,5 +29,5 @@ class ConfigTransformersNER(MixingConfig, BaseModel):
2929
general: General = General()
3030

3131
class Config:
32-
extra = Extra.allow
32+
extra = 'allow'
3333
validate_assignment = True

medcat/meta_cat.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ def get_hash(self) -> str:
114114
"""
115115
hasher = Hasher()
116116
# Set last_train_on if None
117-
if self.config.train['last_train_on'] is None:
118-
self.config.train['last_train_on'] = datetime.now().timestamp()
117+
if self.config.train.last_train_on is None:
118+
self.config.train.last_train_on = datetime.now().timestamp()
119119

120120
hasher.update(self.config.get_hash())
121121
return hasher.hexdigest()
@@ -310,7 +310,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
310310
# Save everything now
311311
self.save(save_dir_path=save_dir_path)
312312

313-
self.config.train['last_train_on'] = datetime.now().timestamp()
313+
self.config.train.last_train_on = datetime.now().timestamp()
314314
return report
315315

316316
def eval(self, json_path: str) -> Dict:

medcat/ner/transformers_ner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,8 @@ def get_hash(self) -> str:
103103
"""
104104
hasher = Hasher()
105105
# Set last_train_on if None
106-
if self.config.general['last_train_on'] is None:
107-
self.config.general['last_train_on'] = datetime.now().timestamp()
106+
if self.config.general.last_train_on is None:
107+
self.config.general.last_train_on = datetime.now().timestamp()
108108

109109
hasher.update(self.config.get_hash())
110110
return hasher.hexdigest()
@@ -242,7 +242,7 @@ def train(self,
242242
trainer.train() # type: ignore
243243

244244
# Save the training time
245-
self.config.general['last_train_on'] = datetime.now().timestamp() # type: ignore
245+
self.config.general.last_train_on = datetime.now().timestamp() # type: ignore
246246

247247
# Save everything
248248
self.save(save_dir_path=os.path.join(self.training_arguments.output_dir, 'final_model'))

0 commit comments

Comments
 (0)