diff --git a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py index caf568b9f..3c14982e5 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py +++ b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py @@ -126,6 +126,10 @@ def _prepare_from_json_loop(document: dict, else: ctoken_idx.append(ind) + if not ctoken_idx: + # Entity span did not map to any tokens + continue + _start = max(0, ctoken_idx[0] - cntx_left) _end = min(len(doc_text['input_ids']), ctoken_idx[-1] + 1 + cntx_right) diff --git a/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py b/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py index 5386a8dc1..05fb4a185 100644 --- a/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py +++ b/medcat-v2/medcat/components/addons/meta_cat/meta_cat.py @@ -736,6 +736,10 @@ def prepare_document(self, doc: MutableDocument, input_ids: list, # Start where the last ent was found, cannot be before it as we've # sorted + if not ctoken_idx: + # Entity span did not map to any tokens (e.g. entity at + # document boundary or beyond tokenised text length) + continue last_ind += ind # If we did not start from 0 in the for loop _start = max(0, ctoken_idx[0] - cntx_left) @@ -825,6 +829,9 @@ def _set_meta_anns(self, ents = self.get_ents(doc) for ent in ents: + if ent.id not in ent_id2ind: + # Entity was skipped in prepare_document (no token mapping) + continue ent_ind = ent_id2ind[ent.id] value = id2category_value[predictions[ent_ind]] confidence = confidences[ent_ind] diff --git a/medcat-v2/medcat/trainer.py b/medcat-v2/medcat/trainer.py index 7a2b32ef8..d8f4b5f80 100644 --- a/medcat-v2/medcat/trainer.py +++ b/medcat-v2/medcat/trainer.py @@ -336,8 +336,15 @@ def _train_meta_cat(self, addon: AddonComponent, cat_name = cnf.general.get_applicable_category_name(ann_names) if cat_name in ann_names: logger.debug("Training MetaCAT %s", cnf.general.category_name) + # Provide a save directory for auto_save_model support — + # train_raw requires save_dir_path when auto_save_model is True + save_dir = None + if cnf.train.auto_save_model: + import tempfile + save_dir = tempfile.mkdtemp( + prefix=f"metacat_{cnf.general.category_name}_") # NOTE: this is a mypy quirk - the types are compatible - addon.mc.train_raw(cast(dict, data)) + addon.mc.train_raw(cast(dict, data), save_dir_path=save_dir) def _train_addons(self, data: MedCATTrainerExport): logger.info("Training addons within train_supervised_raw")