Skip to content
This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 01cdb4c

Browse files
authored
Merge pull request #509 from CogStack/master
Release PR for v1.15.0b1
2 parents ceb74b1 + 00c0dd0 commit 01cdb4c

24 files changed

Lines changed: 411 additions & 108 deletions

.github/workflows/main.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ jobs:
3131
- name: Lint
3232
run: |
3333
flake8 medcat
34+
- name: Pydantic 1 check
35+
# NOTE: the following will look for use of pydantic1-specific .dict() method and .__fields__ attribute
36+
# if there are some (that are not annotated for pydantic1 backwards compatibility) a non-zero exit
37+
# code is returned, which will halt the workflow and print out the offending parts
38+
run: |
39+
grep "\.__fields__" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0
40+
grep "\.dict(" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0
3441
- name: Test
3542
run: |
3643
all_files=$(git ls-files | grep '^tests/.*\.py$' | grep -v '/__init__\.py$' | sed 's/\.py$//' | sed 's/\//./g')
@@ -54,7 +61,6 @@ jobs:
5461
repo: context.repo.repo
5562
});
5663
core.setOutput('latest_version', latestRelease.data.tag_name);
57-
5864
- name: Make sure there's no deprecated methods that should be removed.
5965
# only run this for master -> production PR. I.e just before doing a release.
6066
if: github.event.pull_request.base.ref == 'main' && github.event.pull_request.head.ref == 'production'

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ We have 4 public models available:
1717
1) UMLS Small (A modelpack containing a subset of UMLS (disorders, symptoms, medications...). Trained on MIMIC-III)
1818
2) SNOMED International (Full SNOMED modelpack trained on MIMIC-III)
1919
3) UMLS Dutch v1.10 (a modelpack provided by UMC Utrecht containing [UMLS entities with Dutch names](https://github.com/umcu/dutch-umls) trained on Dutch medical wikipedia articles and a negation detection model [repository](https://github.com/umcu/negation-detection/)/[paper](https://doi.org/10.48550/arxiv.2209.00470) trained on EMC Dutch Clinical Corpus).
20-
4) UMLS Full. >4MM concepts trained self-supervsied on MIMIC-III. v2022AA of UMLS.
20+
4) UMLS Full. >4MM concepts trained self-supervised on MIMIC-III. v2022AA of UMLS.
2121

2222
To download any of these models, please [follow this link](https://uts.nlm.nih.gov/uts/login?service=https://medcat.rosalind.kcl.ac.uk/auth-callback) and sign into your NIH profile / UMLS license. You will then be redirected to the MedCAT model download form. Please complete this form and you will be provided a download link.
2323

install_requires.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@
1919
'xxhash>=3.0.0' # allow later versions, tested with 3.1.0
2020
'blis>=0.7.5,<1.0.0' # allow later versions, tested with 0.7.9, avoid 1.0.0 (depends on numpy 2)
2121
'click>=8.0.4' # allow later versions, tested with 8.1.3
22-
'pydantic>=1.10.0,<2.0' # for spacy compatibility; avoid 2.0 due to breaking changes
22+
'pydantic>=2.0.0,<3.0' # avoid next major release
2323
"humanfriendly~=10.0" # for human readable file / RAM sizes
2424
"peft>=0.8.2"

medcat/cat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ def _print_stats(self,
590590

591591
def _init_ckpts(self, is_resumed, checkpoint):
592592
if self.config.general.checkpoint.steps is not None or checkpoint is not None:
593-
checkpoint_config = CheckpointConfig(**self.config.general.checkpoint.dict())
593+
checkpoint_config = CheckpointConfig(**self.config.general.checkpoint.model_dump())
594594
checkpoint_manager = CheckpointManager('cat_train', checkpoint_config)
595595
if is_resumed:
596596
# TODO: probably remove is_resumed mark and always resume if a checkpoint is provided,

medcat/config.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from datetime import datetime
2-
from pydantic import BaseModel, Extra, ValidationError
3-
from pydantic.fields import ModelField
2+
from pydantic import BaseModel, ValidationError
43
from typing import List, Set, Tuple, cast, Any, Callable, Dict, Optional, Union, Type, Literal
54
from multiprocessing import cpu_count
65
import logging
@@ -125,7 +124,7 @@ def merge_config(self, config_dict: Dict) -> None:
125124
attr = None # new attribute
126125
value = config_dict[key]
127126
if isinstance(value, BaseModel):
128-
value = value.dict()
127+
value = value.model_dump()
129128
if isinstance(attr, MixingConfig):
130129
attr.merge_config(value)
131130
else:
@@ -177,7 +176,7 @@ def rebuild_re(self) -> None:
177176
def _calc_hash(self, hasher: Optional[Hasher] = None) -> Hasher:
178177
if hasher is None:
179178
hasher = Hasher()
180-
for _, v in cast(BaseModel, self).dict().items():
179+
for _, v in cast(BaseModel, self).model_dump().items():
181180
if isinstance(v, MixingConfig):
182181
v._calc_hash(hasher)
183182
else:
@@ -189,7 +188,7 @@ def get_hash(self, hasher: Optional[Hasher] = None):
189188
return hasher.hexdigest()
190189

191190
def __str__(self) -> str:
192-
return str(cast(BaseModel, self).dict())
191+
return str(cast(BaseModel, self).model_dump())
193192

194193
@classmethod
195194
def load(cls, save_path: str) -> "MixingConfig":
@@ -238,15 +237,15 @@ def asdict(self) -> Dict[str, Any]:
238237
Returns:
239238
Dict[str, Any]: The dictionary associated with this config
240239
"""
241-
return cast(BaseModel, self).dict()
240+
return cast(BaseModel, self).model_dump()
242241

243-
def fields(self) -> Dict[str, ModelField]:
242+
def fields(self) -> dict:
244243
"""Get the fields associated with this config.
245244
246245
Returns:
247-
Dict[str, ModelField]: The dictionary of the field names and fields
246+
dict: The dictionary of the field names and fields
248247
"""
249-
return cast(BaseModel, self).__fields__
248+
return cast(BaseModel, self).model_fields
250249

251250

252251
class VersionInfo(MixingConfig, BaseModel):
@@ -272,7 +271,7 @@ class VersionInfo(MixingConfig, BaseModel):
272271
"""Which version of medcat was used to build the CDB"""
273272

274273
class Config:
275-
extra = Extra.allow
274+
extra = 'allow'
276275
validate_assignment = True
277276

278277

@@ -290,7 +289,7 @@ class CDBMaker(MixingConfig, BaseModel):
290289
"""Minimum number of letters required in a name to be accepted for a concept"""
291290

292291
class Config:
293-
extra = Extra.allow
292+
extra = 'allow'
294293
validate_assignment = True
295294

296295

@@ -303,7 +302,7 @@ class AnnotationOutput(MixingConfig, BaseModel):
303302
include_text_in_output: bool = False
304303

305304
class Config:
306-
extra = Extra.allow
305+
extra = 'allow'
307306
validate_assignment = True
308307

309308

@@ -317,7 +316,7 @@ class CheckPoint(MixingConfig, BaseModel):
317316
"""When training the maximum checkpoints will be kept on the disk"""
318317

319318
class Config:
320-
extra = Extra.allow
319+
extra = 'allow'
321320
validate_assignment = True
322321

323322

@@ -354,7 +353,7 @@ class General(MixingConfig, BaseModel):
354353
355354
NB! For these changes to take effect, the pipe would need to be recreated."""
356355
checkpoint: CheckPoint = CheckPoint()
357-
usage_monitor = UsageMonitor()
356+
usage_monitor: UsageMonitor = UsageMonitor()
358357
"""Checkpointing config"""
359358
log_level: int = logging.INFO
360359
"""Logging config for everything | 'tagger' can be disabled, but will cause a drop in performance"""
@@ -395,7 +394,7 @@ class General(MixingConfig, BaseModel):
395394
reliable due to not taking into account all the details of the changes."""
396395

397396
class Config:
398-
extra = Extra.allow
397+
extra = 'allow'
399398
validate_assignment = True
400399

401400

@@ -424,7 +423,7 @@ class Preprocessing(MixingConfig, BaseModel):
424423
NB! For these changes to take effect, the pipe would need to be recreated."""
425424

426425
class Config:
427-
extra = Extra.allow
426+
extra = 'allow'
428427
validate_assignment = True
429428

430429

@@ -444,7 +443,7 @@ class Ner(MixingConfig, BaseModel):
444443
"""Try reverse word order for short concepts (2 words max), e.g. heart disease -> disease heart"""
445444

446445
class Config:
447-
extra = Extra.allow
446+
extra = 'allow'
448447
validate_assignment = True
449448

450449

@@ -579,7 +578,7 @@ class Linking(MixingConfig, BaseModel):
579578
"""If true when the context of a concept is calculated (embedding) the words making that concept are not taken into account"""
580579

581580
class Config:
582-
extra = Extra.allow
581+
extra = 'allow'
583582
validate_assignment = True
584583

585584

@@ -600,7 +599,7 @@ class Config:
600599
# this if for word_skipper and punct_checker which would otherwise
601600
# not have a validator
602601
arbitrary_types_allowed = True
603-
extra = Extra.allow
602+
extra = 'allow'
604603
validate_assignment = True
605604

606605
def __init__(self, *args, **kwargs):
@@ -618,7 +617,7 @@ def rebuild_re(self) -> None:
618617
# Override
619618
def get_hash(self):
620619
hasher = Hasher()
621-
for k, v in self.dict().items():
620+
for k, v in self.model_dump().items():
622621
if k in ['hash', ]:
623622
# ignore hash
624623
continue
@@ -674,4 +673,6 @@ def wrapper(*args, **kwargs):
674673
# we get a nicer exception
675674
_waf_advice = "You can use `cat.cdb.weighted_average_function` to access it directly"
676675
Linking.__getattribute__ = _wrapper(Linking.__getattribute__, Linking, _waf_advice, AttributeError) # type: ignore
676+
if hasattr(Linking, '__getattr__'):
677+
Linking.__getattr__ = _wrapper(Linking.__getattr__, Linking, _waf_advice, AttributeError) # type: ignore
677678
Linking.__getitem__ = _wrapper(Linking.__getitem__, Linking, _waf_advice, KeyError) # type: ignore

medcat/config_meta_cat.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from typing import Dict, Any
2-
from medcat.config import MixingConfig, BaseModel, Optional, Extra
2+
from medcat.config import MixingConfig, BaseModel, Optional
33

44

55
class General(MixingConfig, BaseModel):
@@ -65,7 +65,7 @@ class General(MixingConfig, BaseModel):
6565
Otherwise defaults to doc._.ents or doc.ents per the annotate_overlapping settings"""
6666

6767
class Config:
68-
extra = Extra.allow
68+
extra = 'allow'
6969
validate_assignment = True
7070

7171

@@ -169,7 +169,7 @@ class Model(MixingConfig, BaseModel):
169169
"""If set to True center positions will be ignored when calculating representation"""
170170

171171
class Config:
172-
extra = Extra.allow
172+
extra = 'allow'
173173
validate_assignment = True
174174

175175

@@ -191,7 +191,7 @@ class Train(MixingConfig, BaseModel):
191191
"""If set only this CUIs will be used for training"""
192192
auto_save_model: bool = True
193193
"""Should do model be saved during training for best results"""
194-
last_train_on: Optional[int] = None
194+
last_train_on: Optional[float] = None
195195
"""When was the last training run"""
196196
metric: Dict[str, str] = {'base': 'weighted avg', 'score': 'f1-score'}
197197
"""What metric should be used for choosing the best model"""
@@ -206,7 +206,7 @@ class Train(MixingConfig, BaseModel):
206206
"""Focal Loss hyperparameter - determines importance the loss gives to hard-to-classify examples"""
207207

208208
class Config:
209-
extra = Extra.allow
209+
extra = 'allow'
210210
validate_assignment = True
211211

212212

@@ -217,5 +217,5 @@ class ConfigMetaCAT(MixingConfig, BaseModel):
217217
train: Train = Train()
218218

219219
class Config:
220-
extra = Extra.allow
220+
extra = 'allow'
221221
validate_assignment = True

medcat/config_rel_cat.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import logging
22
from typing import Dict, Any, List
3-
from medcat.config import MixingConfig, BaseModel, Optional, Extra
3+
from medcat.config import MixingConfig, BaseModel, Optional
44

55

66
class General(MixingConfig, BaseModel):
@@ -89,7 +89,7 @@ class Model(MixingConfig, BaseModel):
8989
"""If set to True center positions will be ignored when calculating representation"""
9090

9191
class Config:
92-
extra = Extra.allow
92+
extra = 'allow'
9393
validate_assignment = True
9494

9595

@@ -116,7 +116,7 @@ class Train(MixingConfig, BaseModel):
116116
"""Should the model be saved during training for best results"""
117117

118118
class Config:
119-
extra = Extra.allow
119+
extra = 'allow'
120120
validate_assignment = True
121121

122122

@@ -127,5 +127,5 @@ class ConfigRelCAT(MixingConfig, BaseModel):
127127
train: Train = Train()
128128

129129
class Config:
130-
extra = Extra.allow
130+
extra = 'allow'
131131
validate_assignment = True

medcat/config_transformers_ner.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from medcat.config import MixingConfig, BaseModel, Optional, Extra
1+
from medcat.config import MixingConfig, BaseModel, Optional
22

33

44
class General(MixingConfig, BaseModel):
@@ -16,11 +16,11 @@ class General(MixingConfig, BaseModel):
1616
chunking_overlap_window: Optional[int] = 5
1717
"""Size of the overlap window used for chunking"""
1818
test_size: float = 0.2
19-
last_train_on: Optional[int] = None
19+
last_train_on: Optional[float] = None
2020
verbose_metrics: bool = False
2121

2222
class Config:
23-
extra = Extra.allow
23+
extra = 'allow'
2424
validate_assignment = True
2525

2626

@@ -29,5 +29,5 @@ class ConfigTransformersNER(MixingConfig, BaseModel):
2929
general: General = General()
3030

3131
class Config:
32-
extra = Extra.allow
32+
extra = 'allow'
3333
validate_assignment = True

medcat/meta_cat.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ def get_hash(self) -> str:
114114
"""
115115
hasher = Hasher()
116116
# Set last_train_on if None
117-
if self.config.train['last_train_on'] is None:
118-
self.config.train['last_train_on'] = datetime.now().timestamp()
117+
if self.config.train.last_train_on is None:
118+
self.config.train.last_train_on = datetime.now().timestamp()
119119

120120
hasher.update(self.config.get_hash())
121121
return hasher.hexdigest()
@@ -310,7 +310,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
310310
# Save everything now
311311
self.save(save_dir_path=save_dir_path)
312312

313-
self.config.train['last_train_on'] = datetime.now().timestamp()
313+
self.config.train.last_train_on = datetime.now().timestamp()
314314
return report
315315

316316
def eval(self, json_path: str) -> Dict:

medcat/ner/transformers_ner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,8 @@ def get_hash(self) -> str:
103103
"""
104104
hasher = Hasher()
105105
# Set last_train_on if None
106-
if self.config.general['last_train_on'] is None:
107-
self.config.general['last_train_on'] = datetime.now().timestamp()
106+
if self.config.general.last_train_on is None:
107+
self.config.general.last_train_on = datetime.now().timestamp()
108108

109109
hasher.update(self.config.get_hash())
110110
return hasher.hexdigest()
@@ -242,7 +242,7 @@ def train(self,
242242
trainer.train() # type: ignore
243243

244244
# Save the training time
245-
self.config.general['last_train_on'] = datetime.now().timestamp() # type: ignore
245+
self.config.general.last_train_on = datetime.now().timestamp() # type: ignore
246246

247247
# Save everything
248248
self.save(save_dir_path=os.path.join(self.training_arguments.output_dir, 'final_model'))

0 commit comments

Comments
 (0)