diff --git a/CLAUDE.md b/CLAUDE.md index 15b7f56f..5e8f5020 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -65,18 +65,41 @@ pydantic2linkml [OPTIONS] MODULE_NAMES... pydantic2linkml -o output.yml -l INFO dandischema.models ``` -Options: `--output-file`/`-o` (path), `--log-level`/`-l` (default: WARNING). +Options: + +- `--output-file`/`-o` (path) — write output to a file instead of stdout +- `--merge-file`/`-M` (path) — deep-merge a YAML file into the generated + schema; file values win on conflict (lists appended); no field filtering +- `--overlay-file`/`-O` (path) — shallow-merge a YAML file into the + generated schema; only `SchemaDefinition` fields are applied; unknown + keys are skipped with a warning +- `--log-level`/`-l` (default: WARNING) ## Architecture ### Core Translation Pipeline -1. **`tools.py`** — Low-level utilities for introspecting Pydantic internals: +1. **`tools.py`** — Low-level utilities for introspecting Pydantic internals + and post-processing the generated schema YAML: - `get_all_modules()` — imports modules and collects them with submodules - - `fetch_defs()` — extracts `BaseModel` subclasses and `Enum` subclasses from modules - - `get_field_schema()` / `get_locally_defined_fields()` — extracts resolved `pydantic_core.CoreSchema` objects for fields, distinguishing newly defined vs. overriding fields - - `FieldSchema` (NamedTuple) — bundles a field's core schema, its resolution context, field name, `FieldInfo`, and owning model - - `resolve_ref_schema()` — resolves `definition-ref` and `definitions` schema types to concrete schemas + - `fetch_defs()` — extracts `BaseModel` subclasses and `Enum` subclasses + from modules + - `get_field_schema()` / `get_locally_defined_fields()` — extracts + resolved `pydantic_core.CoreSchema` objects for fields, distinguishing + newly defined vs. 
overriding fields + - `FieldSchema` (NamedTuple) — bundles a field's core schema, its + resolution context, field name, `FieldInfo`, and owning model + - `resolve_ref_schema()` — resolves `definition-ref` and `definitions` + schema types to concrete schemas + - `apply_schema_overlay(schema_yml, overlay_file)` — shallow-merges a + YAML file into a schema YAML string; restricts keys to + `SchemaDefinition` fields + - `apply_yaml_deep_merge(schema_yml, merge_file)` — deep-merges a YAML + file into a schema YAML string using `deepmerge`; no field filtering + - `remove_schema_key_duplication(yml)` — strips redundant `name`/`text` + fields from serialized LinkML YAML + - `add_section_breaks(yml)` — inserts blank lines before top-level + sections 2. **`gen_linkml.py`** — Main translation logic: - `translate_defs(module_names)` — top-level entry point; loads modules, fetches defs, runs `LinkmlGenerator` @@ -84,13 +107,19 @@ Options: `--output-file`/`-o` (path), `--log-level`/`-l` (default: WARNING). - `SlotGenerator` — single-use class; translates a single Pydantic `CoreSchema` into a `SlotDefinition`. Dispatches on schema `type` strings via handler methods. Handles nesting, optionality, lists, unions, literals, UUIDs, dates, etc. - `any_class_def` — module-level `ClassDefinition` constant for the LinkML `Any` type -3. **`cli/`** — Typer-based CLI wrapping `translate_defs`; `cli/__init__.py` defines the `app` and `main` command. +3. **`cli/`** — Typer-based CLI wrapping `translate_defs`; `cli/__init__.py` + defines the `app` and `main` command. After translation the pipeline is: + dump YAML → `remove_schema_key_duplication` → optional `-M` deep merge + → optional `-O` overlay → `add_section_breaks` → output. 4. 
**`exceptions.py`** — Custom exceptions: - `NameCollisionError` — duplicate class/enum names across modules - `GeneratorReuseError` — attempting to reuse a single-use generator - `TranslationNotImplementedError` — schema type not yet handled - - `SlotExtensionError` — cannot extend a base slot to match a target via slot_usage + - `SlotExtensionError` — cannot extend a base slot to match a target + via slot_usage + - `YAMLContentError` — YAML file content is not what is expected (e.g., + not a mapping) ### Key Design Patterns diff --git a/README.md b/README.md index 2d8366ef..1ebfc619 100644 --- a/README.md +++ b/README.md @@ -10,3 +10,12 @@ A tool for translating models expressed in Pydantic to LinkML ```console pydantic2linkml -o o.yml -l INFO dandischema.models ``` + +### Options + +| Flag | Description | +|------|-------------| +| `-o` / `--output-file` | Write output to a file (default: stdout) | +| `-M` / `--merge-file` | Deep-merge a YAML file into the generated schema. Values from the file win on conflict, except lists, which are concatenated (items from the file are appended); no field filtering is applied. | +| `-O` / `--overlay-file` | Shallow-merge a YAML file into the generated schema. Only `SchemaDefinition` fields are applied; unknown keys are skipped with a warning. 
| +| `-l` / `--log-level` | Log level (default: `WARNING`) | diff --git a/pyproject.toml b/pyproject.toml index b087ed1d..54479253 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", ] dependencies = [ + "deepmerge", "linkml", "pydantic~=2.7,<2.11", "PyYAML", diff --git a/src/pydantic2linkml/cli/__init__.py b/src/pydantic2linkml/cli/__init__.py index 36c54672..bbcb063e 100644 --- a/src/pydantic2linkml/cli/__init__.py +++ b/src/pydantic2linkml/cli/__init__.py @@ -3,15 +3,17 @@ from typing import Annotated, Optional import typer +import yaml from linkml_runtime.dumpers import yaml_dumper from pydantic import ValidationError from pydantic2linkml.cli.tools import LogLevel -from pydantic2linkml.exceptions import OverlayContentError +from pydantic2linkml.exceptions import YAMLContentError from pydantic2linkml.gen_linkml import translate_defs from pydantic2linkml.tools import ( add_section_breaks, apply_schema_overlay, + apply_yaml_deep_merge, remove_schema_key_duplication, ) @@ -22,6 +24,18 @@ @app.command() def main( module_names: list[str], + merge_file: Annotated[ + Optional[Path], + typer.Option( + "--merge-file", + "-M", + help="A YAML file whose contents are deep-merged into the generated " + "schema. Values from this file win on conflict. 
The result is " + "always a valid YAML file but may not be a valid LinkML schema — " + "it is the user's responsibility to supply a merge file that " + "produces a valid schema.", + ), + ] = None, overlay_file: Annotated[ Optional[Path], typer.Option( @@ -46,6 +60,25 @@ def main( schema = translate_defs(module_names) logger.info("Dumping schema") yml = remove_schema_key_duplication(yaml_dumper.dumps(schema)) + if merge_file is not None: + logger.info("Applying deep merge from %s", merge_file) + try: + yml = apply_yaml_deep_merge(schema_yml=yml, merge_file=merge_file) + except ValidationError as e: + raise typer.BadParameter( + f"The merge file path is invalid: {e}", + param_hint="'--merge-file'", + ) from e + except yaml.YAMLError as e: + raise typer.BadParameter( + f"The merge file does not contain valid YAML: {e}", + param_hint="'--merge-file'", + ) from e + except YAMLContentError as e: + raise typer.BadParameter( + f"The merge file does not contain a valid YAML mapping: {e}", + param_hint="'--merge-file'", + ) from e if overlay_file is not None: logger.info("Applying overlay from %s", overlay_file) try: @@ -55,14 +88,14 @@ def main( f"The overlay file path is invalid: {e}", param_hint="'--overlay-file'", ) from e - except OverlayContentError as e: + except YAMLContentError as e: raise typer.BadParameter( f"The overlay file does not contain a valid YAML mapping: {e}", param_hint="'--overlay-file'", ) from e yml = add_section_breaks(yml) if not output_file: - print(yml, end='') # noqa: T201 + print(yml, end="") # noqa: T201 else: with output_file.open("w") as f: f.write(yml) diff --git a/src/pydantic2linkml/exceptions.py b/src/pydantic2linkml/exceptions.py index f8166357..f570ce9d 100644 --- a/src/pydantic2linkml/exceptions.py +++ b/src/pydantic2linkml/exceptions.py @@ -106,7 +106,7 @@ def __repr__(self): ) -class OverlayContentError(ValueError): +class YAMLContentError(ValueError): """ - Raise when the content of an overlay file is not a valid YAML mapping + 
Raise when the content of a YAML file is not what is expected """ diff --git a/src/pydantic2linkml/tools.py b/src/pydantic2linkml/tools.py index 138069aa..cc0e0fe5 100644 --- a/src/pydantic2linkml/tools.py +++ b/src/pydantic2linkml/tools.py @@ -23,7 +23,7 @@ from pydantic2linkml.exceptions import ( NameCollisionError, - OverlayContentError, + YAMLContentError, SlotExtensionError, ) @@ -543,7 +543,7 @@ def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: :return: YAML string with the overlay applied, keys ordered to match SchemaDefinition field order :raises ValueError: If ``schema_yml`` does not deserialize to a dict - :raises OverlayContentError: If the overlay file does not contain a YAML + :raises YAMLContentError: If the overlay file does not contain a YAML mapping """ schema_dict = yaml.safe_load(schema_yml) @@ -556,7 +556,7 @@ def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: overlay = yaml.safe_load(f) if not isinstance(overlay, dict): - raise OverlayContentError( + raise YAMLContentError( f"Overlay file {overlay_file} must contain a YAML mapping" ) @@ -580,6 +580,46 @@ def apply_schema_overlay(schema_yml: str, overlay_file: FilePath) -> str: return yaml.dump(ordered, allow_unicode=True, sort_keys=False) +@validate_call +def apply_yaml_deep_merge(schema_yml: str, merge_file: FilePath) -> str: + """Deep-merge a YAML file into a serialized schema YAML string. + + Values from the merge file win on conflict. The merge is unrestricted — + no field filtering is applied. 
+ + :param schema_yml: YAML string of a valid LinkML schema + :param merge_file: Path to an existing YAML file containing a mapping + :return: YAML string with the deep merge applied + :raises ValueError: If ``schema_yml`` does not contain valid YAML or does + not deserialize to a dict + :raises yaml.YAMLError: If the merge file does not contain valid YAML + :raises YAMLContentError: If the merge file does not contain a YAML mapping + """ + from deepmerge import always_merger + + try: + schema_dict = yaml.safe_load(schema_yml) + except yaml.YAMLError as e: + raise ValueError(f"schema_yml does not contain valid YAML: {e}") from e + + if not isinstance(schema_dict, dict): + raise ValueError( + f"schema_yml did not deserialize to a dict: {type(schema_dict)}" + ) + + with merge_file.open() as f: + merge_dict = yaml.safe_load(f) # raises yaml.YAMLError on invalid YAML + + if not isinstance(merge_dict, dict): + raise YAMLContentError(f"Merge file {merge_file} must contain a YAML mapping") + + return yaml.dump( + always_merger.merge(schema_dict, merge_dict), + allow_unicode=True, + sort_keys=False, + ) + + def remove_schema_key_duplication(yml: str) -> str: """Remove redundant name/text fields from a valid serialized LinkML schema. diff --git a/tests/test_cli.py b/tests/test_cli.py index 09124944..1abb8d43 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,11 @@ from pydantic2linkml.cli import app, main -runner = CliRunner() +# Use a wide terminal so Typer's Rich error boxes are never wrapped across lines. +# terminal_width kwarg is not sufficient because Typer's Rich-based error +# formatting reads terminal width from shutil.get_terminal_size(), which +# respects the COLUMNS environment variable. 
+runner = CliRunner(env={"COLUMNS": "200"}) _MOCK_SCHEMA = SchemaDefinition(id="https://example.com/test", name="test-schema") @@ -58,3 +62,47 @@ def test_unknown_key(self, tmp_path: Path): result = runner.invoke(app, ["dandischema.models", "-O", str(overlay_file)]) assert result.exit_code == 0 assert "not_a_field" not in result.output + + +class TestCliDeepMerge: + @pytest.fixture(autouse=True) + def mock_translate_defs(self, mocker): + mocker.patch("pydantic2linkml.cli.translate_defs", return_value=_MOCK_SCHEMA) + + def test_valid_field(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("name: my-name\n") + result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) + assert result.exit_code == 0 + assert "name: my-name" in result.output + + def test_nested_merge(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("classes:\n Foo:\n description: test-desc\n") + result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) + assert result.exit_code == 0 + assert "description: test-desc" in result.output + # Original top-level fields are preserved + assert "id: https://example.com/test" in result.output + + def test_nonexistent_file(self, tmp_path: Path): + result = runner.invoke( + app, + ["dandischema.models", "-M", str(tmp_path / "no-such-file.yaml")], + ) + assert result.exit_code == 2 + assert "merge file path is invalid" in result.output + + def test_non_mapping(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("- item1\n") + result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) + assert result.exit_code == 2 + assert "does not contain a valid YAML mapping" in result.output + + def test_invalid_yaml(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("key: [unclosed\n") + result = runner.invoke(app, ["dandischema.models", "-M", str(merge_file)]) + assert result.exit_code == 
2 + assert "does not contain valid YAML" in result.output diff --git a/tests/test_tools.py b/tests/test_tools.py index 3cac5db8..d42d42b6 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -13,12 +13,13 @@ from pydantic2linkml.exceptions import ( NameCollisionError, - OverlayContentError, + YAMLContentError, SlotExtensionError, ) from pydantic2linkml.tools import ( add_section_breaks, apply_schema_overlay, + apply_yaml_deep_merge, bucketize, ensure_unique_names, fetch_defs, @@ -35,8 +36,17 @@ sort_dict, ) -# A minimal YAML dict suitable as schema_yml input for apply_schema_overlay tests -SAMPLE_SCHEMA_YML = "id: https://example.com/test\nname: original-name\n" +# A minimal YAML dict suitable as schema_yml input for apply_schema_overlay +# and apply_yaml_deep_merge tests +SAMPLE_SCHEMA_YML = ( + "id: https://example.com/test\n" + "name: original-name\n" + "imports:\n" + " - linkml:types\n" + "classes:\n" + " Foo:\n" + " description: original description\n" +) def test_get_parent_models(): @@ -687,7 +697,7 @@ def test_non_dict_overlay_raises_overlay_content_error( ): overlay_file = tmp_path / "overlay.yaml" overlay_file.write_text(overlay_content) - with pytest.raises(OverlayContentError): + with pytest.raises(YAMLContentError): apply_schema_overlay( schema_yml=SAMPLE_SCHEMA_YML, overlay_file=overlay_file ) @@ -728,6 +738,131 @@ def test_unicode_content_preserved(self, tmp_path: Path): assert yaml.safe_load(result)["title"] == "\u00dc n\u00ef c\u00f6d\u00e9" +class TestApplyYamlDeepMerge: + @pytest.mark.parametrize( + "merge_content, expected", + [ + pytest.param( + "name: new-name\n", + { + "id": "https://example.com/test", + "name": "new-name", + "imports": ["linkml:types"], + "classes": {"Foo": {"description": "original description"}}, + }, + id="top_level_scalar_override", + ), + pytest.param( + # Nested dict merge: Foo.title added, Foo.description preserved, + # Bar added alongside Foo + "classes:\n" + " Foo:\n" + " title: new title\n" + " Bar:\n" + " 
description: bar desc\n", + { + "id": "https://example.com/test", + "name": "original-name", + "imports": ["linkml:types"], + "classes": { + "Foo": { + "description": "original description", + "title": "new title", + }, + "Bar": {"description": "bar desc"}, + }, + }, + id="nested_dict_merge", + ), + pytest.param( + # Nested dict override: Foo.description replaced + "classes:\n Foo:\n description: new description\n", + { + "id": "https://example.com/test", + "name": "original-name", + "imports": ["linkml:types"], + "classes": {"Foo": {"description": "new description"}}, + }, + id="nested_dict_override", + ), + pytest.param( + # Appending to list: always_merger appends elements to lists + "imports:\n - linkml:extra\n", + { + "id": "https://example.com/test", + "name": "original-name", + "imports": ["linkml:types", "linkml:extra"], + "classes": {"Foo": {"description": "original description"}}, + }, + id="append_to_list", + ), + ], + ) + def test_merge_applied(self, tmp_path: Path, merge_content, expected): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text(merge_content) + result = apply_yaml_deep_merge( + schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file + ) + assert yaml.safe_load(result) == expected + + @pytest.mark.parametrize( + "get_path", + [ + pytest.param(lambda p: p / "no-such-file.yaml", id="nonexistent_file"), + pytest.param(lambda p: p, id="directory"), + ], + ) + def test_invalid_merge_file_raises_validation_error(self, tmp_path: Path, get_path): + with pytest.raises(ValidationError): + apply_yaml_deep_merge( + schema_yml=SAMPLE_SCHEMA_YML, merge_file=get_path(tmp_path) + ) + + @pytest.mark.parametrize( + "merge_content", + [ + pytest.param("- item1\n- item2\n", id="list"), + pytest.param("", id="null"), + ], + ) + def test_non_dict_merge_raises_yaml_content_error( + self, tmp_path: Path, merge_content + ): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text(merge_content) + with pytest.raises(YAMLContentError): + 
apply_yaml_deep_merge(schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file) + + def test_invalid_yaml_in_merge_file_raises_yaml_error(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("key: [unclosed\n") + with pytest.raises(yaml.YAMLError): + apply_yaml_deep_merge(schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file) + + def test_schema_yml_not_dict_raises_value_error(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("name: new-name\n") + with pytest.raises(ValueError): + apply_yaml_deep_merge( + schema_yml="- item1\n- item2\n", merge_file=merge_file + ) + + def test_schema_yml_invalid_yaml_raises_value_error(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("name: new-name\n") + with pytest.raises(ValueError): + apply_yaml_deep_merge(schema_yml="key: [unclosed\n", merge_file=merge_file) + + def test_unicode_content_preserved(self, tmp_path: Path): + merge_file = tmp_path / "merge.yaml" + merge_file.write_text("title: \u00dc n\u00ef c\u00f6d\u00e9\n") + result = apply_yaml_deep_merge( + schema_yml=SAMPLE_SCHEMA_YML, merge_file=merge_file + ) + assert yaml.safe_load(result)["title"] == "\u00dc n\u00ef c\u00f6d\u00e9" + + class TestRemoveSchemaKeyDuplication: def test_classes_name_removed(self): schema = {"classes": {"Person": {"name": "Person", "description": "A person"}}}