From 0623508f66cb2b6d6e9114a423c8c7b20857a071 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 25 Mar 2026 17:33:12 -0700 Subject: [PATCH 1/2] feat: round-trip schema through SchemaDefinition after merge/overlay After `-M` deep merge, output keys could appear in non-canonical order because deepmerge preserves dict insertion order. After `-O` overlay, key ordering relied on a manual SchemaDefinition field list that also dropped unknown keys with only a logged warning. Both functions now call a new `canonicalize_schema_yml` helper that round-trips the YAML through `SchemaDefinition` via linkml-runtime's yaml_loader/yaml_dumper. This produces canonical key ordering and raises `InvalidLinkMLSchemaError` (new exception) for any field name unknown to `SchemaDefinition` or its nested objects, which the CLI converts to a `BadParameter` error. `remove_schema_key_duplication` is moved to after both merge/overlay steps in the CLI pipeline (so it strips the `name`/`text`/`prefix_prefix` fields re-introduced by the round-trip), and is extended to also strip the redundant `prefix_prefix` key from prefix entries. 
Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 27 ++++--- README.md | 4 +- src/pydantic2linkml/cli/__init__.py | 18 ++++- src/pydantic2linkml/exceptions.py | 6 ++ src/pydantic2linkml/tools.py | 115 +++++++++++++++++++--------- tests/test_cli.py | 13 +++- tests/test_tools.py | 69 +++++++++++++---- 7 files changed, 184 insertions(+), 68 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5e8f5020..7bd0275d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -69,10 +69,10 @@ Options: - `--output-file`/`-o` (path) — write output to a file instead of stdout - `--merge-file`/`-M` (path) — deep-merge a YAML file into the generated - schema; values from the file win on conflict; no field filtering applied + schema; values from the file win on conflict; unknown field names raise + an error - `--overlay-file`/`-O` (path) — shallow-merge a YAML file into the - generated schema; only `SchemaDefinition` fields are applied; unknown - keys are skipped with a warning + generated schema; unknown field names raise an error - `--log-level`/`-l` (default: WARNING) ## Architecture @@ -91,13 +91,18 @@ Options: resolution context, field name, `FieldInfo`, and owning model - `resolve_ref_schema()` — resolves `definition-ref` and `definitions` schema types to concrete schemas + - `canonicalize_schema_yml(yml)` — round-trips a YAML string through + `SchemaDefinition` to produce canonically ordered output and detect + unknown field names (raises `InvalidLinkMLSchemaError`); note that + wrong-type values for known fields are generally not detected - `apply_schema_overlay(schema_yml, overlay_file)` — shallow-merges a - YAML file into a schema YAML string; restricts keys to - `SchemaDefinition` fields + YAML file into a schema YAML string; no field filtering; calls + `canonicalize_schema_yml` to reorder keys and detect unknown fields - `apply_yaml_deep_merge(schema_yml, merge_file)` — deep-merges a YAML - file into a schema YAML string using `deepmerge`; no field filtering - - 
`remove_schema_key_duplication(yml)` — strips redundant `name`/`text` - fields from serialized LinkML YAML + file into a schema YAML string using `deepmerge`; calls + `canonicalize_schema_yml` to reorder keys and detect unknown fields + - `remove_schema_key_duplication(yml)` — strips redundant `name`/`text`/ + `prefix_prefix` fields from serialized LinkML YAML - `add_section_breaks(yml)` — inserts blank lines before top-level sections @@ -109,8 +114,8 @@ Options: 3. **`cli/`** — Typer-based CLI wrapping `translate_defs`; `cli/__init__.py` defines the `app` and `main` command. After translation the pipeline is: - dump YAML → `remove_schema_key_duplication` → optional `-M` deep merge - → optional `-O` overlay → `add_section_breaks` → output. + dump YAML → optional `-M` deep merge → optional `-O` overlay → + `remove_schema_key_duplication` → `add_section_breaks` → output. 4. **`exceptions.py`** — Custom exceptions: - `NameCollisionError` — duplicate class/enum names across modules @@ -120,6 +125,8 @@ Options: via slot_usage - `YAMLContentError` — YAML file content is not what is expected (e.g., not a mapping) + - `InvalidLinkMLSchemaError` — YAML string contains field names unknown + to LinkML (raised by `canonicalize_schema_yml`) ### Key Design Patterns diff --git a/README.md b/README.md index 1ebfc619..cb5263ed 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,6 @@ pydantic2linkml -o o.yml -l INFO dandischema.models | Flag | Description | |------|-------------| | `-o` / `--output-file` | Write output to a file (default: stdout) | -| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. Values from the file win on conflict; no field filtering is applied. | -| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. Only `SchemaDefinition` fields are applied; unknown keys are skipped with a warning. | +| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. 
Values from the file win on conflict; unknown field names raise an error. | +| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. Unknown field names raise an error. | | `-l` / `--log-level` | Log level (default: `WARNING`) | diff --git a/src/pydantic2linkml/cli/__init__.py b/src/pydantic2linkml/cli/__init__.py index bbcb063e..517c2ec8 100644 --- a/src/pydantic2linkml/cli/__init__.py +++ b/src/pydantic2linkml/cli/__init__.py @@ -8,7 +8,7 @@ from pydantic import ValidationError from pydantic2linkml.cli.tools import LogLevel -from pydantic2linkml.exceptions import YAMLContentError +from pydantic2linkml.exceptions import InvalidLinkMLSchemaError, YAMLContentError from pydantic2linkml.gen_linkml import translate_defs from pydantic2linkml.tools import ( add_section_breaks, @@ -45,8 +45,7 @@ def main( "the generated schema. The overlay is merged into the serialized YAML " "output, so the result is always a valid YAML file but may not be a " "valid LinkML schema — it is the user's responsibility to supply an " - "overlay that produces a valid schema. Overlay keys that do not " - "correspond to a field of SchemaDefinition are skipped.", + "overlay that produces a valid schema. 
Unknown keys raise an error.", ), ] = None, output_file: Annotated[Optional[Path], typer.Option("--output-file", "-o")] = None, @@ -59,7 +58,7 @@ def main( schema = translate_defs(module_names) logger.info("Dumping schema") - yml = remove_schema_key_duplication(yaml_dumper.dumps(schema)) + yml = yaml_dumper.dumps(schema) if merge_file is not None: logger.info("Applying deep merge from %s", merge_file) try: @@ -79,6 +78,11 @@ def main( f"The merge file does not contain a valid YAML mapping: {e}", param_hint="'--merge-file'", ) from e + except InvalidLinkMLSchemaError as e: + raise typer.BadParameter( + f"The merge file introduces field names unknown to LinkML: {e}", + param_hint="'--merge-file'", + ) from e if overlay_file is not None: logger.info("Applying overlay from %s", overlay_file) try: @@ -93,6 +97,12 @@ def main( f"The overlay file does not contain a valid YAML mapping: {e}", param_hint="'--overlay-file'", ) from e + except InvalidLinkMLSchemaError as e: + raise typer.BadParameter( + f"The overlay file introduces field names unknown to LinkML: {e}", + param_hint="'--overlay-file'", + ) from e + yml = remove_schema_key_duplication(yml) yml = add_section_breaks(yml) if not output_file: print(yml, end="") # noqa: T201 diff --git a/src/pydantic2linkml/exceptions.py b/src/pydantic2linkml/exceptions.py index f570ce9d..49a89923 100644 --- a/src/pydantic2linkml/exceptions.py +++ b/src/pydantic2linkml/exceptions.py @@ -110,3 +110,9 @@ class YAMLContentError(ValueError): """ Raise when the content of a YAML file is not what is expected """ + + +class InvalidLinkMLSchemaError(ValueError): + """ + Raised when a YAML string contains field names unknown to LinkML + """ diff --git a/src/pydantic2linkml/tools.py b/src/pydantic2linkml/tools.py index cc0e0fe5..6c9e93bc 100644 --- a/src/pydantic2linkml/tools.py +++ b/src/pydantic2linkml/tools.py @@ -12,7 +12,9 @@ from typing import Any, NamedTuple, Optional, TypeVar, cast import yaml +from linkml_runtime.dumpers import 
yaml_dumper from linkml_runtime.linkml_model import SchemaDefinition, SlotDefinition +from linkml_runtime.loaders import yaml_loader from linkml_runtime.utils.formatutils import is_empty from pydantic import BaseModel, FilePath, RootModel, validate_call @@ -22,9 +24,10 @@ from pydantic_core import core_schema from pydantic2linkml.exceptions import ( + InvalidLinkMLSchemaError, NameCollisionError, - YAMLContentError, SlotExtensionError, + YAMLContentError, ) logger = logging.getLogger(__name__) @@ -201,9 +204,9 @@ def get_model_schema(model: type[BaseModel]) -> core_schema.ModelSchema: else: model_schema = inner_schema - assert model_schema["type"] == "model", ( - "Assumption about how model schema is stored is wrong." - ) + assert ( + model_schema["type"] == "model" + ), "Assumption about how model schema is stored is wrong." return cast(core_schema.ModelSchema, model_schema) @@ -534,17 +537,56 @@ def get_slot_usage_entry( ) +def canonicalize_schema_yml(yml: str) -> str: + """Canonicalize a YAML string as a LinkML schema via a round-trip. + + Deserializes ``yml`` into a ``SchemaDefinition`` object and + re-serializes it. The round-trip serves two purposes: + + * **Unknown-field detection** — ``yaml_loader`` raises ``TypeError`` + for any field name unknown to ``SchemaDefinition`` or its nested + objects; this function catches that and re-raises it as + ``InvalidLinkMLSchemaError``. Note: wrong-type values for known + fields are generally not detected. + * **Canonical ordering** — the output keys follow the same order + produced by serializing a freshly constructed ``SchemaDefinition``. + + :param yml: A YAML string to canonicalize as a LinkML schema. + :return: Canonically ordered YAML string representing the schema. + :raises InvalidLinkMLSchemaError: If ``yml`` contains field names + unknown to ``SchemaDefinition`` or any of its nested objects. 
+ """ + try: + sd = yaml_loader.loads(yml, target_class=SchemaDefinition) + except TypeError as e: + raise InvalidLinkMLSchemaError(str(e)) from e + + return yaml_dumper.dumps(sd) + + @validate_call def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: """Apply an overlay YAML file onto a serialized schema YAML string. - :param schema_yml: YAML string of a serialized SchemaDefinition + All keys from the overlay are applied without filtering. The result + is then round-tripped through ``SchemaDefinition`` via + ``canonicalize_schema_yml``, which reorders keys canonically and + raises ``InvalidLinkMLSchemaError`` for any field names unknown to + ``SchemaDefinition`` or its nested objects. + + Note: The result is always a valid YAML file but may not be a valid + LinkML schema — it is the user's responsibility to supply an overlay + that produces a valid schema. + + :param schema_yml: YAML string of a valid LinkML schema :param overlay_file: Path to an existing overlay YAML file - :return: YAML string with the overlay applied, keys ordered to match - SchemaDefinition field order + :return: Canonical YAML string with the overlay applied, keys in + SchemaDefinition order :raises ValueError: If ``schema_yml`` does not deserialize to a dict :raises YAMLContentError: If the overlay file does not contain a YAML mapping + :raises InvalidLinkMLSchemaError: If the overlay introduces field + names unknown to ``SchemaDefinition`` or its nested objects """ schema_dict = yaml.safe_load(schema_yml) if not isinstance(schema_dict, dict): @@ -560,24 +602,10 @@ def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: f"Overlay file {overlay_file} must contain a YAML mapping" ) - # Ordered list of valid SchemaDefinition field names - sd_field_names = [f.name for f in fields(SchemaDefinition)] - sd_field_set = set(sd_field_names) - - # Apply overlay, skipping keys that are not SchemaDefinition fields - for k, v in overlay.items(): - if k not in 
sd_field_set: - logger.warning( - "Overlay key '%s' is not a field of SchemaDefinition. Skipping.", - k, - ) - else: - schema_dict[k] = v - - # Rebuild dict in SchemaDefinition field order - ordered = {k: schema_dict[k] for k in sd_field_names if k in schema_dict} - - return yaml.dump(ordered, allow_unicode=True, sort_keys=False) + schema_dict.update(overlay) + return canonicalize_schema_yml( + yaml.dump(schema_dict, allow_unicode=True, sort_keys=False) + ) @validate_call @@ -585,15 +613,24 @@ def apply_yaml_deep_merge(schema_yml: str, merge_file: FilePath) -> str: """Deep-merge a YAML file into a serialized schema YAML string. Values from the merge file win on conflict. The merge is unrestricted — - no field filtering is applied. + no field filtering is applied. The result is then round-tripped + through ``SchemaDefinition`` via ``canonicalize_schema_yml``, which + reorders keys canonically and raises ``InvalidLinkMLSchemaError`` for + any field names unknown to ``SchemaDefinition`` or its nested objects. + + Note: The result is always a valid YAML file but may not be a valid + LinkML schema — it is the user's responsibility to supply a merge file + that produces a valid schema. 
:param schema_yml: YAML string of a valid LinkML schema :param merge_file: Path to an existing YAML file containing a mapping - :return: YAML string with the deep merge applied + :return: Canonical YAML string with the deep merge applied :raises ValueError: If ``schema_yml`` does not contain valid YAML or does not deserialize to a dict :raises yaml.YAMLError: If the merge file does not contain valid YAML :raises YAMLContentError: If the merge file does not contain a YAML mapping + :raises InvalidLinkMLSchemaError: If the merge introduces field names + unknown to ``SchemaDefinition`` or its nested objects """ from deepmerge import always_merger @@ -613,26 +650,32 @@ def apply_yaml_deep_merge(schema_yml: str, merge_file: FilePath) -> str: if not isinstance(merge_dict, dict): raise YAMLContentError(f"Merge file {merge_file} must contain a YAML mapping") - return yaml.dump( - always_merger.merge(schema_dict, merge_dict), - allow_unicode=True, - sort_keys=False, + return canonicalize_schema_yml( + yaml.dump( + always_merger.merge(schema_dict, merge_dict), + allow_unicode=True, + sort_keys=False, + ) ) def remove_schema_key_duplication(yml: str) -> str: - """Remove redundant name/text fields from a valid serialized LinkML schema. + """Remove redundant name/text/prefix_prefix fields from a serialized + LinkML schema. In LinkML's serialized YAML, dictionary keys already serve as - identifiers for classes, slots, enums, slot_usage entries, and - permissible values. This function strips the redundant ``name`` and - ``text`` fields that the linkml-runtime YAML dumper includes alongside - those keys. + identifiers for classes, slots, enums, slot_usage entries, + permissible values, and prefixes. This function strips the redundant + ``name``, ``text``, and ``prefix_prefix`` fields that the + linkml-runtime YAML dumper includes alongside those keys. :param yml: A YAML string representing a **valid** LinkML schema. 
""" schema = yaml.safe_load(yml) + for prefix in schema.get("prefixes", {}).values(): + prefix.pop("prefix_prefix", None) + for cls in schema.get("classes", {}).values(): cls.pop("name", None) for su in cls.get("slot_usage", {}).values(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 1abb8d43..d16f853f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -56,12 +56,12 @@ def test_non_mapping(self, tmp_path: Path): assert result.exit_code == 2 assert "does not contain a" in result.output.lower() - def test_unknown_key(self, tmp_path: Path): + def test_unknown_field_raises_bad_parameter(self, tmp_path: Path): overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text("not_a_field: some_value\n") result = runner.invoke(app, ["dandischema.models", "-O", str(overlay_file)]) - assert result.exit_code == 0 - assert "not_a_field" not in result.output + assert result.exit_code == 2 + assert "not_a_field" in result.output class TestCliDeepMerge: @@ -106,3 +106,10 @@ def test_invalid_yaml(self, tmp_path: Path): result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) assert result.exit_code == 2 assert "does not contain valid YAML" in result.output + + def test_unknown_field_raises_bad_parameter(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("not_a_field: some_value\n") + result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) + assert result.exit_code == 2 + assert "not_a_field" in result.output diff --git a/tests/test_tools.py b/tests/test_tools.py index d42d42b6..a19fb739 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1,4 +1,3 @@ -import logging import re from enum import Enum, auto from operator import itemgetter @@ -12,9 +11,10 @@ from pydantic_core import core_schema from pydantic2linkml.exceptions import ( + InvalidLinkMLSchemaError, NameCollisionError, - YAMLContentError, SlotExtensionError, + YAMLContentError, ) from pydantic2linkml.tools import ( 
add_section_breaks, @@ -37,14 +37,18 @@ ) # A minimal YAML dict suitable as schema_yml input for apply_schema_overlay -# and apply_yaml_deep_merge tests +# and apply_yaml_deep_merge tests. Written in the canonical form produced by +# yaml_dumper.dumps so that round-tripping through SchemaDefinition leaves the +# content unchanged (aside from key reordering, which dict equality ignores). SAMPLE_SCHEMA_YML = ( "id: https://example.com/test\n" "name: original-name\n" + "default_prefix: https://example.com/test/\n" "imports:\n" " - linkml:types\n" "classes:\n" " Foo:\n" + " name: Foo\n" " description: original description\n" ) @@ -710,19 +714,21 @@ def test_schema_yml_not_dict_raises_value_error(self, tmp_path: Path): schema_yml="- item1\n- item2\n", overlay_file=overlay_file ) - def test_unknown_key_logged_and_skipped(self, tmp_path: Path, caplog): + def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path): overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text("not_a_field: some_value\n") - with caplog.at_level(logging.WARNING, logger="pydantic2linkml.tools"): - result = apply_schema_overlay( + with pytest.raises(InvalidLinkMLSchemaError): + apply_schema_overlay( schema_yml=SAMPLE_SCHEMA_YML, overlay_file=overlay_file ) - assert "not_a_field" in caplog.text - assert "not_a_field" not in yaml.safe_load(result) def test_output_follows_schema_definition_field_order(self, tmp_path: Path): # description comes after name in SchemaDefinition; supply them reversed - schema_yml = "description: some desc\nname: test-name\n" + schema_yml = ( + "description: some desc\n" + "name: test-name\n" + "id: https://example.com/test\n" + ) overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text("title: My Title\n") result = apply_schema_overlay(schema_yml=schema_yml, overlay_file=overlay_file) @@ -747,8 +753,11 @@ class TestApplyYamlDeepMerge: { "id": "https://example.com/test", "name": "new-name", + "default_prefix": "https://example.com/test/", 
"imports": ["linkml:types"], - "classes": {"Foo": {"description": "original description"}}, + "classes": { + "Foo": {"name": "Foo", "description": "original description"} + }, }, id="top_level_scalar_override", ), @@ -763,13 +772,15 @@ class TestApplyYamlDeepMerge: { "id": "https://example.com/test", "name": "original-name", + "default_prefix": "https://example.com/test/", "imports": ["linkml:types"], "classes": { "Foo": { + "name": "Foo", "description": "original description", "title": "new title", }, - "Bar": {"description": "bar desc"}, + "Bar": {"name": "Bar", "description": "bar desc"}, }, }, id="nested_dict_merge", @@ -780,8 +791,11 @@ class TestApplyYamlDeepMerge: { "id": "https://example.com/test", "name": "original-name", + "default_prefix": "https://example.com/test/", "imports": ["linkml:types"], - "classes": {"Foo": {"description": "new description"}}, + "classes": { + "Foo": {"name": "Foo", "description": "new description"} + }, }, id="nested_dict_override", ), @@ -791,8 +805,11 @@ class TestApplyYamlDeepMerge: { "id": "https://example.com/test", "name": "original-name", + "default_prefix": "https://example.com/test/", "imports": ["linkml:types", "linkml:extra"], - "classes": {"Foo": {"description": "original description"}}, + "classes": { + "Foo": {"name": "Foo", "description": "original description"} + }, }, id="append_to_list", ), @@ -862,6 +879,12 @@ def test_unicode_content_preserved(self, tmp_path: Path): ) assert yaml.safe_load(result)["title"] == "\u00dc n\u00ef c\u00f6d\u00e9" + def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("not_a_field: some_value\n") + with pytest.raises(InvalidLinkMLSchemaError): + apply_yaml_deep_merge(schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file) + class TestRemoveSchemaKeyDuplication: def test_classes_name_removed(self): @@ -910,6 +933,24 @@ def test_permissible_values_text_removed(self): assert "text" not in pv assert 
pv["description"] == "Currently active" + def test_prefixes_prefix_prefix_removed(self): + schema = { + "prefixes": { + "linkml": { + "prefix_prefix": "linkml", + "prefix_reference": "https://w3id.org/linkml/", + }, + "ex": { + "prefix_prefix": "ex", + "prefix_reference": "https://example.org/", + }, + } + } + result = yaml.safe_load(remove_schema_key_duplication(yaml.dump(schema))) + for prefix_entry in result["prefixes"].values(): + assert "prefix_prefix" not in prefix_entry + assert "prefix_reference" in prefix_entry + def test_missing_sections_no_error(self): schema = {"id": "https://example.com/test", "name": "test-schema"} result = yaml.safe_load(remove_schema_key_duplication(yaml.dump(schema))) @@ -926,6 +967,8 @@ def test_round_trip(self): result_yml = remove_schema_key_duplication(raw_yml) result = yaml.safe_load(result_yml) + for prefix_entry in result.get("prefixes", {}).values(): + assert "prefix_prefix" not in prefix_entry for cls in result.get("classes", {}).values(): assert "name" not in cls for su in cls.get("slot_usage", {}).values(): From 1e774f65d0c5663a90f2185b4f78483f53623e50 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 25 Mar 2026 23:23:27 -0700 Subject: [PATCH 2/2] feat: validate schema against LinkML meta schema after round-trip Add `_get_meta_schema_validator()` (lazily initialized, cached via `functools.cache`) and extend `canonicalize_schema_yml` to validate the canonical output against the LinkML meta schema using `linkml.validator.Validator` with `JsonschemaValidationPlugin(closed=True)`. This catches unknown field names and wrong-type values that the `yaml_loader` round-trip alone does not detect. The two detection paths now produce distinct `InvalidLinkMLSchemaError` messages: "Unknown field in schema:" for `TypeError` from `yaml_loader`, and "Schema validation failed:" for violations found by the meta-schema validator. CLI `BadParameter` messages and all documentation are updated accordingly. 
Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 24 ++++--- README.md | 4 +- src/pydantic2linkml/cli/__init__.py | 12 ++-- src/pydantic2linkml/exceptions.py | 3 +- src/pydantic2linkml/tools.py | 101 +++++++++++++++++----------- tests/test_tools.py | 27 ++++++-- 6 files changed, 107 insertions(+), 64 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7bd0275d..7ae69de1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -69,10 +69,11 @@ Options: - `--output-file`/`-o` (path) — write output to a file instead of stdout - `--merge-file`/`-M` (path) — deep-merge a YAML file into the generated - schema; values from the file win on conflict; unknown field names raise - an error + schema; values from the file win on conflict; the result is validated + against the LinkML meta schema - `--overlay-file`/`-O` (path) — shallow-merge a YAML file into the - generated schema; unknown field names raise an error + generated schema; the result is validated against the LinkML meta + schema - `--log-level`/`-l` (default: WARNING) ## Architecture @@ -92,15 +93,17 @@ Options: - `resolve_ref_schema()` — resolves `definition-ref` and `definitions` schema types to concrete schemas - `canonicalize_schema_yml(yml)` — round-trips a YAML string through - `SchemaDefinition` to produce canonically ordered output and detect - unknown field names (raises `InvalidLinkMLSchemaError`); note that - wrong-type values for known fields are generally not detected + `SchemaDefinition` for canonical key ordering, then validates the + result against the LinkML meta schema via `linkml.validator` + (raises `InvalidLinkMLSchemaError` on unknown fields or wrong-type + values); the meta-schema validator is lazily initialized and cached + via `_get_meta_schema_validator()` - `apply_schema_overlay(schema_yml, overlay_file)` — shallow-merges a YAML file into a schema YAML string; no field filtering; calls - `canonicalize_schema_yml` to reorder keys and detect unknown fields + `canonicalize_schema_yml` to reorder keys and 
validate the result - `apply_yaml_deep_merge(schema_yml, merge_file)` — deep-merges a YAML file into a schema YAML string using `deepmerge`; calls - `canonicalize_schema_yml` to reorder keys and detect unknown fields + `canonicalize_schema_yml` to reorder keys and validate the result - `remove_schema_key_duplication(yml)` — strips redundant `name`/`text`/ `prefix_prefix` fields from serialized LinkML YAML - `add_section_breaks(yml)` — inserts blank lines before top-level @@ -125,8 +128,9 @@ Options: via slot_usage - `YAMLContentError` — YAML file content is not what is expected (e.g., not a mapping) - - `InvalidLinkMLSchemaError` — YAML string contains field names unknown - to LinkML (raised by `canonicalize_schema_yml`) + - `InvalidLinkMLSchemaError` — schema does not conform to the LinkML + meta schema (unknown fields, wrong-type values, etc.); raised by + `canonicalize_schema_yml` ### Key Design Patterns diff --git a/README.md b/README.md index cb5263ed..408a8530 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,6 @@ pydantic2linkml -o o.yml -l INFO dandischema.models | Flag | Description | |------|-------------| | `-o` / `--output-file` | Write output to a file (default: stdout) | -| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. Values from the file win on conflict; unknown field names raise an error. | -| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. Unknown field names raise an error. | +| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. Values from the file win on conflict; the result is validated against the LinkML meta schema. | +| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. The result is validated against the LinkML meta schema. 
| | `-l` / `--log-level` | Log level (default: `WARNING`) | diff --git a/src/pydantic2linkml/cli/__init__.py b/src/pydantic2linkml/cli/__init__.py index 517c2ec8..e3f7b0d1 100644 --- a/src/pydantic2linkml/cli/__init__.py +++ b/src/pydantic2linkml/cli/__init__.py @@ -31,9 +31,7 @@ def main( "-M", help="A YAML file whose contents are deep-merged into the generated " "schema. Values from this file win on conflict. The result is " - "always a valid YAML file but may not be a valid LinkML schema — " - "it is the user's responsibility to supply a merge file that " - "produces a valid schema.", + "validated against the LinkML meta schema.", ), ] = None, overlay_file: Annotated[ @@ -43,9 +41,7 @@ def main( "-O", help="An overlay file specifying a partial schema to be applied on top of " "the generated schema. The overlay is merged into the serialized YAML " - "output, so the result is always a valid YAML file but may not be a " - "valid LinkML schema — it is the user's responsibility to supply an " - "overlay that produces a valid schema. Unknown keys raise an error.", + "output. 
The result is validated against the LinkML meta schema.", ), ] = None, output_file: Annotated[Optional[Path], typer.Option("--output-file", "-o")] = None, @@ -80,7 +76,7 @@ def main( ) from e except InvalidLinkMLSchemaError as e: raise typer.BadParameter( - f"The merge file introduces field names unknown to LinkML: {e}", + f"The merge file produces an invalid schema: {e}", param_hint="'--merge-file'", ) from e if overlay_file is not None: @@ -99,7 +95,7 @@ def main( ) from e except InvalidLinkMLSchemaError as e: raise typer.BadParameter( - f"The overlay file introduces field names unknown to LinkML: {e}", + f"The overlay file produces an invalid schema: {e}", param_hint="'--overlay-file'", ) from e yml = remove_schema_key_duplication(yml) diff --git a/src/pydantic2linkml/exceptions.py b/src/pydantic2linkml/exceptions.py index 49a89923..4decdcd0 100644 --- a/src/pydantic2linkml/exceptions.py +++ b/src/pydantic2linkml/exceptions.py @@ -114,5 +114,6 @@ class YAMLContentError(ValueError): class InvalidLinkMLSchemaError(ValueError): """ - Raised when a YAML string contains field names unknown to LinkML + Raised when a YAML string does not conform to the LinkML meta schema + (e.g. 
unknown field names or wrong-type values) """ diff --git a/src/pydantic2linkml/tools.py b/src/pydantic2linkml/tools.py index 6c9e93bc..0fcb22fb 100644 --- a/src/pydantic2linkml/tools.py +++ b/src/pydantic2linkml/tools.py @@ -1,3 +1,4 @@ +import functools import importlib import inspect import logging @@ -7,6 +8,7 @@ from collections.abc import Callable, Iterable from dataclasses import fields from enum import Enum +from importlib.resources import files as resource_files from operator import attrgetter, itemgetter from types import ModuleType from typing import Any, NamedTuple, Optional, TypeVar, cast @@ -204,9 +206,9 @@ def get_model_schema(model: type[BaseModel]) -> core_schema.ModelSchema: else: model_schema = inner_schema - assert ( - model_schema["type"] == "model" - ), "Assumption about how model schema is stored is wrong." + assert model_schema["type"] == "model", ( + "Assumption about how model schema is stored is wrong." + ) return cast(core_schema.ModelSchema, model_schema) @@ -537,31 +539,63 @@ def get_slot_usage_entry( ) +@functools.cache +def _get_meta_schema_validator(): + """Return a cached LinkML meta-schema validator. + + The validator is initialized lazily on first call (importing + ``linkml.validator`` is slow) and then cached for reuse. + ``closed=True`` adds ``additionalProperties: false`` to every object + type in the generated JSON Schema, so unknown field names are caught + as validation errors. + """ + from linkml.validator import Validator + from linkml.validator.plugins import JsonschemaValidationPlugin + + meta_schema_path = str( + resource_files("linkml_runtime.linkml_model.model.schema").joinpath("meta.yaml") + ) + return Validator( + meta_schema_path, + validation_plugins=[JsonschemaValidationPlugin(closed=True)], + ) + + def canonicalize_schema_yml(yml: str) -> str: """Canonicalize a YAML string as a LinkML schema via a round-trip. - Deserializes ``yml`` into a ``SchemaDefinition`` object and - re-serializes it. 
The round-trip serves two purposes: + Deserializes ``yml`` into a ``SchemaDefinition`` object, + re-serializes it to canonical YAML, then validates the canonical + output against the LinkML meta schema. This serves two purposes: - * **Unknown-field detection** — ``yaml_loader`` raises ``TypeError`` - for any field name unknown to ``SchemaDefinition`` or its nested - objects; this function catches that and re-raises it as - ``InvalidLinkMLSchemaError``. Note: wrong-type values for known - fields are generally not detected. * **Canonical ordering** — the output keys follow the same order produced by serializing a freshly constructed ``SchemaDefinition``. + * **Validation** — the canonical YAML is validated against the + LinkML meta schema. Unknown field names and wrong-type values for + known fields are caught and re-raised as ``InvalidLinkMLSchemaError``. :param yml: A YAML string to canonicalize as a LinkML schema. - :return: Canonically ordered YAML string representing the schema. - :raises InvalidLinkMLSchemaError: If ``yml`` contains field names - unknown to ``SchemaDefinition`` or any of its nested objects. + :return: Canonically ordered, validated YAML string representing the + schema. + :raises InvalidLinkMLSchemaError: If the resulting schema does not + conform to the LinkML meta schema (unknown field names, + wrong-type values, etc.). 
""" try: sd = yaml_loader.loads(yml, target_class=SchemaDefinition) except TypeError as e: - raise InvalidLinkMLSchemaError(str(e)) from e + raise InvalidLinkMLSchemaError(f"Unknown field in schema: {e}") from e - return yaml_dumper.dumps(sd) + canonical = yaml_dumper.dumps(sd) + + validator = _get_meta_schema_validator() + report = validator.validate(yaml.safe_load(canonical), "schema_definition") + if report.results: + raise InvalidLinkMLSchemaError( + "Schema validation failed: " + "; ".join(r.message for r in report.results) + ) + + return canonical @validate_call @@ -569,14 +603,9 @@ def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: """Apply an overlay YAML file onto a serialized schema YAML string. All keys from the overlay are applied without filtering. The result - is then round-tripped through ``SchemaDefinition`` via - ``canonicalize_schema_yml``, which reorders keys canonically and - raises ``InvalidLinkMLSchemaError`` for any field names unknown to - ``SchemaDefinition`` or its nested objects. - - Note: The result is always a valid YAML file but may not be a valid - LinkML schema — it is the user's responsibility to supply an overlay - that produces a valid schema. + is then passed through ``canonicalize_schema_yml``, which reorders + keys canonically and validates the output against the LinkML meta + schema. 
:param schema_yml: YAML string of a valid LinkML schema :param overlay_file: Path to an existing overlay YAML file @@ -585,8 +614,8 @@ def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: :raises ValueError: If ``schema_yml`` does not deserialize to a dict :raises YAMLContentError: If the overlay file does not contain a YAML mapping - :raises InvalidLinkMLSchemaError: If the overlay introduces field - names unknown to ``SchemaDefinition`` or its nested objects + :raises InvalidLinkMLSchemaError: If the result does not conform to + the LinkML meta schema """ schema_dict = yaml.safe_load(schema_yml) if not isinstance(schema_dict, dict): @@ -613,24 +642,20 @@ def apply_yaml_deep_merge(schema_yml: str, merge_file: FilePath) -> str: """Deep-merge a YAML file into a serialized schema YAML string. Values from the merge file win on conflict. The merge is unrestricted — - no field filtering is applied. The result is then round-tripped - through ``SchemaDefinition`` via ``canonicalize_schema_yml``, which - reorders keys canonically and raises ``InvalidLinkMLSchemaError`` for - any field names unknown to ``SchemaDefinition`` or its nested objects. - - Note: The result is always a valid YAML file but may not be a valid - LinkML schema — it is the user's responsibility to supply a merge file - that produces a valid schema. + no field filtering is applied. The result is then passed through + ``canonicalize_schema_yml``, which reorders keys canonically and + validates the output against the LinkML meta schema. 
:param schema_yml: YAML string of a valid LinkML schema :param merge_file: Path to an existing YAML file containing a mapping :return: Canonical YAML string with the deep merge applied - :raises ValueError: If ``schema_yml`` does not contain valid YAML or does - not deserialize to a dict + :raises ValueError: If ``schema_yml`` does not contain valid YAML or + does not deserialize to a dict :raises yaml.YAMLError: If the merge file does not contain valid YAML - :raises YAMLContentError: If the merge file does not contain a YAML mapping - :raises InvalidLinkMLSchemaError: If the merge introduces field names - unknown to ``SchemaDefinition`` or its nested objects + :raises YAMLContentError: If the merge file does not contain a YAML + mapping + :raises InvalidLinkMLSchemaError: If the result does not conform to + the LinkML meta schema """ from deepmerge import always_merger diff --git a/tests/test_tools.py b/tests/test_tools.py index a19fb739..f5babbb4 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -21,6 +21,7 @@ apply_schema_overlay, apply_yaml_deep_merge, bucketize, + canonicalize_schema_yml, ensure_unique_names, fetch_defs, force_to_set, @@ -649,6 +650,24 @@ def test_get_slot_usage_entry( assert get_slot_usage_entry(base, target) == expected_return +class TestCanonicalizeSchemaYml: + def test_wrong_type_raises_invalid_schema_error(self, mocker): + # yaml_loader coerces scalar wrong-type values (e.g. integer -> string) + # during the round-trip, so they never reach the meta-schema validator. + # We mock yaml_dumper.dumps to inject a canonical YAML with an integer + # value for the string field `title`, simulating the validator being + # given wrong-type data. The "Schema validation failed:" prefix in the + # error message confirms the error comes from the meta-schema validator + # step, not from the yaml_loader TypeError step. 
+ from linkml_runtime.dumpers import yaml_dumper + + wrong_canonical = "id: https://example.com/test\nname: test-name\ntitle: 123\n" + mocker.patch.object(yaml_dumper, "dumps", return_value=wrong_canonical) + + with pytest.raises(InvalidLinkMLSchemaError, match="Schema validation failed:"): + canonicalize_schema_yml("id: https://example.com/test\nname: test-name\n") + + class TestApplySchemaOverlay: @pytest.mark.parametrize( "overlay_content, expected_overrides", @@ -717,7 +736,7 @@ def test_schema_yml_not_dict_raises_value_error(self, tmp_path: Path): def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path): overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text("not_a_field: some_value\n") - with pytest.raises(InvalidLinkMLSchemaError): + with pytest.raises(InvalidLinkMLSchemaError, match="Unknown field in schema:"): apply_schema_overlay( schema_yml=SAMPLE_SCHEMA_YML, overlay_file=overlay_file ) @@ -725,9 +744,7 @@ def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path): def test_output_follows_schema_definition_field_order(self, tmp_path: Path): # description comes after name in SchemaDefinition; supply them reversed schema_yml = ( - "description: some desc\n" - "name: test-name\n" - "id: https://example.com/test\n" + "description: some desc\nname: test-name\nid: https://example.com/test\n" ) overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text("title: My Title\n") @@ -882,7 +899,7 @@ def test_unicode_content_preserved(self, tmp_path: Path): def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path): merge_file = tmp_path / "merge.yaml" merge_file.write_text("not_a_field: some_value\n") - with pytest.raises(InvalidLinkMLSchemaError): + with pytest.raises(InvalidLinkMLSchemaError, match="Unknown field in schema:"): apply_yaml_deep_merge(schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file)