diff --git a/CLAUDE.md b/CLAUDE.md
index 5e8f5020..7ae69de1 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -69,10 +69,11 @@ Options:
 - `--output-file`/`-o` (path) — write output to a file instead of stdout
 - `--merge-file`/`-M` (path) — deep-merge a YAML file into the generated
-  schema; values from the file win on conflict; no field filtering applied
+  schema; values from the file win on conflict; the result is validated
+  against the LinkML meta schema
 - `--overlay-file`/`-O` (path) — shallow-merge a YAML file into the
-  generated schema; only `SchemaDefinition` fields are applied; unknown
-  keys are skipped with a warning
+  generated schema; the result is validated against the LinkML meta
+  schema
 - `--log-level`/`-l` (default: WARNING)
 
 ## Architecture
 
@@ -91,13 +92,20 @@ Options:
     resolution context, field name, `FieldInfo`, and owning model
   - `resolve_ref_schema()` — resolves `definition-ref` and `definitions`
     schema types to concrete schemas
+  - `canonicalize_schema_yml(yml)` — round-trips a YAML string through
+    `SchemaDefinition` for canonical key ordering, then validates the
+    result against the LinkML meta schema via `linkml.validator`
+    (raises `InvalidLinkMLSchemaError` on unknown fields or wrong-type
+    values); the meta-schema validator is lazily initialized and cached
+    via `_get_meta_schema_validator()`
   - `apply_schema_overlay(schema_yml, overlay_file)` — shallow-merges a
-    YAML file into a schema YAML string; restricts keys to
-    `SchemaDefinition` fields
+    YAML file into a schema YAML string; no field filtering; calls
+    `canonicalize_schema_yml` to reorder keys and validate the result
   - `apply_yaml_deep_merge(schema_yml, merge_file)` — deep-merges a YAML
-    file into a schema YAML string using `deepmerge`; no field filtering
-  - `remove_schema_key_duplication(yml)` — strips redundant `name`/`text`
-    fields from serialized LinkML YAML
+    file into a schema YAML string using `deepmerge`; calls
+    `canonicalize_schema_yml` to reorder keys and validate the result
+  - `remove_schema_key_duplication(yml)` — strips redundant `name`/`text`/
+    `prefix_prefix` fields from serialized LinkML YAML
   - `add_section_breaks(yml)` — inserts blank lines before top-level
     sections
 
@@ -109,8 +117,8 @@ Options:
 
 3. **`cli/`** — Typer-based CLI wrapping `translate_defs`; `cli/__init__.py`
    defines the `app` and `main` command. After translation the pipeline is:
-   dump YAML → `remove_schema_key_duplication` → optional `-M` deep merge
-   → optional `-O` overlay → `add_section_breaks` → output.
+   dump YAML → optional `-M` deep merge → optional `-O` overlay →
+   `remove_schema_key_duplication` → `add_section_breaks` → output.
 
 4. **`exceptions.py`** — Custom exceptions:
    - `NameCollisionError` — duplicate class/enum names across modules
@@ -120,6 +128,9 @@ Options:
      via slot_usage
    - `YAMLContentError` — YAML file content is not what is expected (e.g.,
      not a mapping)
+   - `InvalidLinkMLSchemaError` — schema does not conform to the LinkML
+     meta schema (unknown fields, wrong-type values, etc.); raised by
+     `canonicalize_schema_yml`
 
 ### Key Design Patterns
 
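The reordered pipeline reads naturally as a small driver. The sketch below is
illustrative only: `build_schema_yml` is a hypothetical name, not part of the
codebase, and it simply wires together the real helpers in the order the
CLAUDE.md section above now documents.

    from pathlib import Path
    from typing import Optional

    from linkml_runtime.dumpers import yaml_dumper

    from pydantic2linkml.gen_linkml import translate_defs
    from pydantic2linkml.tools import (
        add_section_breaks,
        apply_schema_overlay,
        apply_yaml_deep_merge,
        remove_schema_key_duplication,
    )


    def build_schema_yml(
        module_names: list[str],
        merge_file: Optional[Path] = None,
        overlay_file: Optional[Path] = None,
    ) -> str:
        # dump YAML -> optional -M deep merge -> optional -O overlay
        # -> remove_schema_key_duplication -> add_section_breaks
        yml = yaml_dumper.dumps(translate_defs(module_names))
        if merge_file is not None:
            yml = apply_yaml_deep_merge(yml, merge_file)
        if overlay_file is not None:
            yml = apply_schema_overlay(yml, overlay_file)
        return add_section_breaks(remove_schema_key_duplication(yml))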
diff --git a/README.md b/README.md
index 1ebfc619..408a8530 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,6 @@ pydantic2linkml -o o.yml -l INFO dandischema.models
 | Flag | Description |
 |------|-------------|
 | `-o` / `--output-file` | Write output to a file (default: stdout) |
-| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. Values from the file win on conflict; no field filtering is applied. |
-| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. Only `SchemaDefinition` fields are applied; unknown keys are skipped with a warning. |
+| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. Values from the file win on conflict; the result is validated against the LinkML meta schema. |
+| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. The result is validated against the LinkML meta schema. |
 | `-l` / `--log-level` | Log level (default: `WARNING`) |
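To see the new `-M` contract end to end, here is a sketch using Typer's test
runner (the same mechanism tests/test_cli.py uses); the file name and the
`title` override are invented for illustration.

    import tempfile
    from pathlib import Path

    from typer.testing import CliRunner

    from pydantic2linkml.cli import app

    runner = CliRunner()
    with tempfile.TemporaryDirectory() as tmp:
        merge_file = Path(tmp) / "merge.yaml"
        # `title` is a valid SchemaDefinition field, so the merged result
        # passes meta-schema validation; an unknown key would exit with
        # code 2 instead of being silently passed through.
        merge_file.write_text("title: My Overridden Title\n")
        result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)])
    assert result.exit_code == 0
    assert "My Overridden Title" in result.output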
diff --git a/src/pydantic2linkml/cli/__init__.py b/src/pydantic2linkml/cli/__init__.py
index bbcb063e..e3f7b0d1 100644
--- a/src/pydantic2linkml/cli/__init__.py
+++ b/src/pydantic2linkml/cli/__init__.py
@@ -8,7 +8,7 @@
 from pydantic import ValidationError
 
 from pydantic2linkml.cli.tools import LogLevel
-from pydantic2linkml.exceptions import YAMLContentError
+from pydantic2linkml.exceptions import InvalidLinkMLSchemaError, YAMLContentError
 from pydantic2linkml.gen_linkml import translate_defs
 from pydantic2linkml.tools import (
     add_section_breaks,
@@ -31,9 +31,7 @@ def main(
             "-M",
             help="A YAML file whose contents are deep-merged into the generated "
             "schema. Values from this file win on conflict. The result is "
-            "always a valid YAML file but may not be a valid LinkML schema — "
-            "it is the user's responsibility to supply a merge file that "
-            "produces a valid schema.",
+            "validated against the LinkML meta schema.",
         ),
     ] = None,
     overlay_file: Annotated[
@@ -43,10 +41,7 @@
             "-O",
             help="An overlay file specifying a partial schema to be applied on top of "
             "the generated schema. The overlay is merged into the serialized YAML "
-            "output, so the result is always a valid YAML file but may not be a "
-            "valid LinkML schema — it is the user's responsibility to supply an "
-            "overlay that produces a valid schema. Overlay keys that do not "
-            "correspond to a field of SchemaDefinition are skipped.",
+            "output. The result is validated against the LinkML meta schema.",
         ),
     ] = None,
     output_file: Annotated[Optional[Path], typer.Option("--output-file", "-o")] = None,
@@ -59,7 +54,7 @@
     schema = translate_defs(module_names)
 
     logger.info("Dumping schema")
-    yml = remove_schema_key_duplication(yaml_dumper.dumps(schema))
+    yml = yaml_dumper.dumps(schema)
     if merge_file is not None:
         logger.info("Applying deep merge from %s", merge_file)
         try:
@@ -79,6 +74,11 @@
                 f"The merge file does not contain a valid YAML mapping: {e}",
                 param_hint="'--merge-file'",
             ) from e
+        except InvalidLinkMLSchemaError as e:
+            raise typer.BadParameter(
+                f"The merge file produces an invalid schema: {e}",
+                param_hint="'--merge-file'",
+            ) from e
     if overlay_file is not None:
         logger.info("Applying overlay from %s", overlay_file)
         try:
@@ -93,6 +93,12 @@
                 f"The overlay file does not contain a valid YAML mapping: {e}",
                 param_hint="'--overlay-file'",
             ) from e
+        except InvalidLinkMLSchemaError as e:
+            raise typer.BadParameter(
+                f"The overlay file produces an invalid schema: {e}",
+                param_hint="'--overlay-file'",
+            ) from e
+    yml = remove_schema_key_duplication(yml)
     yml = add_section_breaks(yml)
     if not output_file:
         print(yml, end="")  # noqa: T201
diff --git a/src/pydantic2linkml/exceptions.py b/src/pydantic2linkml/exceptions.py
index f570ce9d..4decdcd0 100644
--- a/src/pydantic2linkml/exceptions.py
+++ b/src/pydantic2linkml/exceptions.py
@@ -110,3 +110,10 @@ class YAMLContentError(ValueError):
     """
     Raise when the content of a YAML file is not what is expected
     """
+
+
+class InvalidLinkMLSchemaError(ValueError):
+    """
+    Raised when a YAML string does not conform to the LinkML meta schema
+    (e.g. unknown field names or wrong-type values)
+    """
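Library users (as opposed to CLI users) can catch the new exception directly.
A minimal sketch, assuming the behavior documented in the tools.py diff that
follows; the schema string is intentionally invalid.

    from pydantic2linkml.exceptions import InvalidLinkMLSchemaError
    from pydantic2linkml.tools import canonicalize_schema_yml

    bad_yml = "id: https://example.com/test\nname: test\nnot_a_field: x\n"
    try:
        canonicalize_schema_yml(bad_yml)
    except InvalidLinkMLSchemaError as e:
        # Unknown fields surface from the yaml_loader round-trip
        # ("Unknown field in schema: ..."); wrong-type values for known
        # fields surface from the meta-schema validation step
        # ("Schema validation failed: ...").
        print(f"rejected: {e}")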
diff --git a/src/pydantic2linkml/tools.py b/src/pydantic2linkml/tools.py
index cc0e0fe5..0fcb22fb 100644
--- a/src/pydantic2linkml/tools.py
+++ b/src/pydantic2linkml/tools.py
@@ -1,3 +1,4 @@
+import functools
 import importlib
 import inspect
 import logging
@@ -7,12 +8,15 @@
 from collections.abc import Callable, Iterable
 from dataclasses import fields
 from enum import Enum
+from importlib.resources import files as resource_files
 from operator import attrgetter, itemgetter
 from types import ModuleType
 from typing import Any, NamedTuple, Optional, TypeVar, cast
 
 import yaml
+from linkml_runtime.dumpers import yaml_dumper
 from linkml_runtime.linkml_model import SchemaDefinition, SlotDefinition
+from linkml_runtime.loaders import yaml_loader
 from linkml_runtime.utils.formatutils import is_empty
 from pydantic import BaseModel, FilePath, RootModel, validate_call
 
@@ -22,9 +26,10 @@
 from pydantic_core import core_schema
 
 from pydantic2linkml.exceptions import (
+    InvalidLinkMLSchemaError,
     NameCollisionError,
-    YAMLContentError,
     SlotExtensionError,
+    YAMLContentError,
 )
 
 logger = logging.getLogger(__name__)
@@ -534,17 +539,83 @@ def get_slot_usage_entry(
     )
 
 
+@functools.cache
+def _get_meta_schema_validator():
+    """Return a cached LinkML meta-schema validator.
+
+    The validator is initialized lazily on first call (importing
+    ``linkml.validator`` is slow) and then cached for reuse.
+    ``closed=True`` adds ``additionalProperties: false`` to every object
+    type in the generated JSON Schema, so unknown field names are caught
+    as validation errors.
+    """
+    from linkml.validator import Validator
+    from linkml.validator.plugins import JsonschemaValidationPlugin
+
+    meta_schema_path = str(
+        resource_files("linkml_runtime.linkml_model.model.schema").joinpath("meta.yaml")
+    )
+    return Validator(
+        meta_schema_path,
+        validation_plugins=[JsonschemaValidationPlugin(closed=True)],
+    )
+
+
+def canonicalize_schema_yml(yml: str) -> str:
+    """Canonicalize a YAML string as a LinkML schema via a round-trip.
+
+    Deserializes ``yml`` into a ``SchemaDefinition`` object,
+    re-serializes it to canonical YAML, then validates the canonical
+    output against the LinkML meta schema. This serves two purposes:
+
+    * **Canonical ordering** — the output keys follow the same order
+      produced by serializing a freshly constructed ``SchemaDefinition``.
+    * **Validation** — the canonical YAML is validated against the
+      LinkML meta schema. Unknown field names and wrong-type values for
+      known fields are caught and re-raised as ``InvalidLinkMLSchemaError``.
+
+    :param yml: A YAML string to canonicalize as a LinkML schema.
+    :return: Canonically ordered, validated YAML string representing the
+        schema.
+    :raises InvalidLinkMLSchemaError: If the resulting schema does not
+        conform to the LinkML meta schema (unknown field names,
+        wrong-type values, etc.).
+    """
+    try:
+        sd = yaml_loader.loads(yml, target_class=SchemaDefinition)
+    except TypeError as e:
+        raise InvalidLinkMLSchemaError(f"Unknown field in schema: {e}") from e
+
+    canonical = yaml_dumper.dumps(sd)
+
+    validator = _get_meta_schema_validator()
+    report = validator.validate(yaml.safe_load(canonical), "schema_definition")
+    if report.results:
+        raise InvalidLinkMLSchemaError(
+            "Schema validation failed: " + "; ".join(r.message for r in report.results)
+        )
+
+    return canonical
+
+
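As a quick editorial aside (not part of the patch), the canonicalization
contract can be exercised directly; the schema text below is made up, and the
assertion mirrors the repo's own field-order test.

    from pydantic2linkml.tools import canonicalize_schema_yml

    messy = "description: out of order\nname: test-name\nid: https://example.com/test\n"
    canonical = canonicalize_schema_yml(messy)
    # `name` precedes `description` in SchemaDefinition field order, so the
    # round-trip moves it up front.
    assert canonical.index("name:") < canonical.index("description:")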
Skipping.", - k, - ) - else: - schema_dict[k] = v - - # Rebuild dict in SchemaDefinition field order - ordered = {k: schema_dict[k] for k in sd_field_names if k in schema_dict} - - return yaml.dump(ordered, allow_unicode=True, sort_keys=False) + schema_dict.update(overlay) + return canonicalize_schema_yml( + yaml.dump(schema_dict, allow_unicode=True, sort_keys=False) + ) @validate_call @@ -585,15 +642,20 @@ def apply_yaml_deep_merge(schema_yml: str, merge_file: FilePath) -> str: """Deep-merge a YAML file into a serialized schema YAML string. Values from the merge file win on conflict. The merge is unrestricted — - no field filtering is applied. + no field filtering is applied. The result is then passed through + ``canonicalize_schema_yml``, which reorders keys canonically and + validates the output against the LinkML meta schema. :param schema_yml: YAML string of a valid LinkML schema :param merge_file: Path to an existing YAML file containing a mapping - :return: YAML string with the deep merge applied - :raises ValueError: If ``schema_yml`` does not contain valid YAML or does - not deserialize to a dict + :return: Canonical YAML string with the deep merge applied + :raises ValueError: If ``schema_yml`` does not contain valid YAML or + does not deserialize to a dict :raises yaml.YAMLError: If the merge file does not contain valid YAML - :raises YAMLContentError: If the merge file does not contain a YAML mapping + :raises YAMLContentError: If the merge file does not contain a YAML + mapping + :raises InvalidLinkMLSchemaError: If the result does not conform to + the LinkML meta schema """ from deepmerge import always_merger @@ -613,26 +675,32 @@ def apply_yaml_deep_merge(schema_yml: str, merge_file: FilePath) -> str: if not isinstance(merge_dict, dict): raise YAMLContentError(f"Merge file {merge_file} must contain a YAML mapping") - return yaml.dump( - always_merger.merge(schema_dict, merge_dict), - allow_unicode=True, - sort_keys=False, + return canonicalize_schema_yml( + yaml.dump( + always_merger.merge(schema_dict, merge_dict), + allow_unicode=True, + sort_keys=False, + ) ) def remove_schema_key_duplication(yml: str) -> str: - """Remove redundant name/text fields from a valid serialized LinkML schema. + """Remove redundant name/text/prefix_prefix fields from a serialized + LinkML schema. In LinkML's serialized YAML, dictionary keys already serve as - identifiers for classes, slots, enums, slot_usage entries, and - permissible values. This function strips the redundant ``name`` and - ``text`` fields that the linkml-runtime YAML dumper includes alongside - those keys. + identifiers for classes, slots, enums, slot_usage entries, + permissible values, and prefixes. This function strips the redundant + ``name``, ``text``, and ``prefix_prefix`` fields that the + linkml-runtime YAML dumper includes alongside those keys. :param yml: A YAML string representing a **valid** LinkML schema. 
""" schema = yaml.safe_load(yml) + for prefix in schema.get("prefixes", {}).values(): + prefix.pop("prefix_prefix", None) + for cls in schema.get("classes", {}).values(): cls.pop("name", None) for su in cls.get("slot_usage", {}).values(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 1abb8d43..d16f853f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -56,12 +56,12 @@ def test_non_mapping(self, tmp_path: Path): assert result.exit_code == 2 assert "does not contain a" in result.output.lower() - def test_unknown_key(self, tmp_path: Path): + def test_unknown_field_raises_bad_parameter(self, tmp_path: Path): overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text("not_a_field: some_value\n") result = runner.invoke(app, ["dandischema.models", "-O", str(overlay_file)]) - assert result.exit_code == 0 - assert "not_a_field" not in result.output + assert result.exit_code == 2 + assert "not_a_field" in result.output class TestCliDeepMerge: @@ -106,3 +106,10 @@ def test_invalid_yaml(self, tmp_path: Path): result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) assert result.exit_code == 2 assert "does not contain valid YAML" in result.output + + def test_unknown_field_raises_bad_parameter(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("not_a_field: some_value\n") + result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) + assert result.exit_code == 2 + assert "not_a_field" in result.output diff --git a/tests/test_tools.py b/tests/test_tools.py index d42d42b6..f5babbb4 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1,4 +1,3 @@ -import logging import re from enum import Enum, auto from operator import itemgetter @@ -12,15 +11,17 @@ from pydantic_core import core_schema from pydantic2linkml.exceptions import ( + InvalidLinkMLSchemaError, NameCollisionError, - YAMLContentError, SlotExtensionError, + YAMLContentError, ) from pydantic2linkml.tools import ( add_section_breaks, apply_schema_overlay, apply_yaml_deep_merge, bucketize, + canonicalize_schema_yml, ensure_unique_names, fetch_defs, force_to_set, @@ -37,14 +38,18 @@ ) # A minimal YAML dict suitable as schema_yml input for apply_schema_overlay -# and apply_yaml_deep_merge tests +# and apply_yaml_deep_merge tests. Written in the canonical form produced by +# yaml_dumper.dumps so that round-tripping through SchemaDefinition leaves the +# content unchanged (aside from key reordering, which dict equality ignores). SAMPLE_SCHEMA_YML = ( "id: https://example.com/test\n" "name: original-name\n" + "default_prefix: https://example.com/test/\n" "imports:\n" " - linkml:types\n" "classes:\n" " Foo:\n" + " name: Foo\n" " description: original description\n" ) @@ -645,6 +650,24 @@ def test_get_slot_usage_entry( assert get_slot_usage_entry(base, target) == expected_return +class TestCanonicalizeSchemaYml: + def test_wrong_type_raises_invalid_schema_error(self, mocker): + # yaml_loader coerces scalar wrong-type values (e.g. integer -> string) + # during the round-trip, so they never reach the meta-schema validator. + # We mock yaml_dumper.dumps to inject a canonical YAML with an integer + # value for the string field `title`, simulating the validator being + # given wrong-type data. The "Schema validation failed:" prefix in the + # error message confirms the error comes from the meta-schema validator + # step, not from the yaml_loader TypeError step. 
+        from linkml_runtime.dumpers import yaml_dumper
+
+        wrong_canonical = "id: https://example.com/test\nname: test-name\ntitle: 123\n"
+        mocker.patch.object(yaml_dumper, "dumps", return_value=wrong_canonical)
+
+        with pytest.raises(InvalidLinkMLSchemaError, match="Schema validation failed:"):
+            canonicalize_schema_yml("id: https://example.com/test\nname: test-name\n")
+
+
 class TestApplySchemaOverlay:
     @pytest.mark.parametrize(
         "overlay_content, expected_overrides",
@@ -710,19 +733,19 @@ def test_schema_yml_not_dict_raises_value_error(self, tmp_path: Path):
                 schema_yml="- item1\n- item2\n", overlay_file=overlay_file
             )
 
-    def test_unknown_key_logged_and_skipped(self, tmp_path: Path, caplog):
+    def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path):
         overlay_file = tmp_path / "overlay.yaml"
         overlay_file.write_text("not_a_field: some_value\n")
-        with caplog.at_level(logging.WARNING, logger="pydantic2linkml.tools"):
-            result = apply_schema_overlay(
+        with pytest.raises(InvalidLinkMLSchemaError, match="Unknown field in schema:"):
+            apply_schema_overlay(
                 schema_yml=SAMPLE_SCHEMA_YML, overlay_file=overlay_file
             )
-        assert "not_a_field" in caplog.text
-        assert "not_a_field" not in yaml.safe_load(result)
 
     def test_output_follows_schema_definition_field_order(self, tmp_path: Path):
         # description comes after name in SchemaDefinition; supply them reversed
-        schema_yml = "description: some desc\nname: test-name\n"
+        schema_yml = (
+            "description: some desc\nname: test-name\nid: https://example.com/test\n"
+        )
         overlay_file = tmp_path / "overlay.yaml"
         overlay_file.write_text("title: My Title\n")
         result = apply_schema_overlay(schema_yml=schema_yml, overlay_file=overlay_file)
@@ -747,8 +770,11 @@ class TestApplyYamlDeepMerge:
                 {
                     "id": "https://example.com/test",
                     "name": "new-name",
+                    "default_prefix": "https://example.com/test/",
                     "imports": ["linkml:types"],
-                    "classes": {"Foo": {"description": "original description"}},
+                    "classes": {
+                        "Foo": {"name": "Foo", "description": "original description"}
+                    },
                 },
                 id="top_level_scalar_override",
             ),
@@ -763,13 +789,15 @@ class TestApplyYamlDeepMerge:
                 {
                     "id": "https://example.com/test",
                     "name": "original-name",
+                    "default_prefix": "https://example.com/test/",
                     "imports": ["linkml:types"],
                     "classes": {
                         "Foo": {
+                            "name": "Foo",
                             "description": "original description",
                             "title": "new title",
                         },
-                        "Bar": {"description": "bar desc"},
+                        "Bar": {"name": "Bar", "description": "bar desc"},
                     },
                 },
                 id="nested_dict_merge",
@@ -780,8 +808,11 @@ class TestApplyYamlDeepMerge:
                 {
                     "id": "https://example.com/test",
                     "name": "original-name",
+                    "default_prefix": "https://example.com/test/",
                     "imports": ["linkml:types"],
-                    "classes": {"Foo": {"description": "new description"}},
+                    "classes": {
+                        "Foo": {"name": "Foo", "description": "new description"}
+                    },
                 },
                 id="nested_dict_override",
             ),
@@ -791,8 +822,11 @@ class TestApplyYamlDeepMerge:
                 {
                     "id": "https://example.com/test",
                     "name": "original-name",
+                    "default_prefix": "https://example.com/test/",
                     "imports": ["linkml:types", "linkml:extra"],
-                    "classes": {"Foo": {"description": "original description"}},
+                    "classes": {
+                        "Foo": {"name": "Foo", "description": "original description"}
+                    },
                 },
                 id="append_to_list",
             ),
@@ -862,6 +896,12 @@ def test_unicode_content_preserved(self, tmp_path: Path):
         )
         assert yaml.safe_load(result)["title"] == "\u00dc n\u00ef c\u00f6d\u00e9"
 
+    def test_unknown_field_raises_invalid_schema_error(self, tmp_path: Path):
+        merge_file = tmp_path / "merge.yaml"
+        merge_file.write_text("not_a_field: some_value\n")
+        with pytest.raises(InvalidLinkMLSchemaError, match="Unknown field in schema:"):
+            apply_yaml_deep_merge(schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file)
+
 
 class TestRemoveSchemaKeyDuplication:
     def test_classes_name_removed(self):
@@ -910,6 +950,24 @@ def test_permissible_values_text_removed(self):
         assert "text" not in pv
         assert pv["description"] == "Currently active"
 
+    def test_prefixes_prefix_prefix_removed(self):
+        schema = {
+            "prefixes": {
+                "linkml": {
+                    "prefix_prefix": "linkml",
+                    "prefix_reference": "https://w3id.org/linkml/",
+                },
+                "ex": {
+                    "prefix_prefix": "ex",
+                    "prefix_reference": "https://example.org/",
+                },
+            }
+        }
+        result = yaml.safe_load(remove_schema_key_duplication(yaml.dump(schema)))
+        for prefix_entry in result["prefixes"].values():
+            assert "prefix_prefix" not in prefix_entry
+            assert "prefix_reference" in prefix_entry
+
     def test_missing_sections_no_error(self):
         schema = {"id": "https://example.com/test", "name": "test-schema"}
         result = yaml.safe_load(remove_schema_key_duplication(yaml.dump(schema)))
@@ -926,6 +984,8 @@ def test_round_trip(self):
         result_yml = remove_schema_key_duplication(raw_yml)
         result = yaml.safe_load(result_yml)
 
+        for prefix_entry in result.get("prefixes", {}).values():
+            assert "prefix_prefix" not in prefix_entry
         for cls in result.get("classes", {}).values():
             assert "name" not in cls
             for su in cls.get("slot_usage", {}).values():