diff --git a/README.md b/README.md index 5c6a173..28a7637 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ uv add yaml-reference ``` ## Spec -![Spec Status](https://img.shields.io/badge/spec%20v0.2.6--3-passing-brightgreen?link=https%3A%2F%2Fgithub.com%2Fdsillman2000%2Fyaml-reference-specs%2Ftree%2Fv0.2.6-3) +![Spec Status](https://img.shields.io/badge/spec%20v0.2.6--4-passing-brightgreen?link=https%3A%2F%2Fgithub.com%2Fdsillman2000%2Fyaml-reference-specs%2Ftree%2Fv0.2.6-4) This Python library implements the YAML specification for cross-file references and YAML composition in YAML files using tags `!reference`, `!reference-all`, `!flatten`, and `!merge` as defined in the [yaml-reference-specs project](https://github.com/dsillman2000/yaml-reference-specs). @@ -260,6 +260,21 @@ yaml-reference compile input.yml --allow /allowed/path1 --allow /allowed/path2 Whether or not `allow_paths` is specified, the default behavior is to allow references to files in the same directory as the source YAML file (or subdirectories). "Back-navigating" out of a the root directory is not allowed (".." local references in a root YAML file). This provides a secure baseline to prevent unsafe access which is not explicitly allowed. +### Glob matching behavior for `!reference-all` + +`!reference-all` applies **silent-omission semantics** when individual glob matches fall outside the allowed path set. Disallowed paths are filtered out *before* any file is opened (security invariant: disallowed file contents are never loaded into memory). The result is the subset of glob matches that are both reachable and allowed: + +| Scenario | Behaviour | Exit | +|---|---|---| +| Glob matches zero files | Returns `[]` | `rc=0` | +| Some matched files are outside `allow_paths` | Disallowed files are silently dropped; remaining files are returned | `rc=0` | +| All matched files are outside `allow_paths` | Returns `[]` | `rc=0` | +| Glob pattern is absolute (starts with `/`) | Hard error – `ValueError` raised | `rc=1` | +| A matched file is the calling file (self-reference) | Hard error – circularity `ValueError` raised | `rc=1` | +| A matched file transitively references the caller | Hard error – circularity `ValueError` raised | `rc=1` | + +This design keeps `!reference-all` resilient against partially-populated directory trees while still enforcing absolute-path and circularity invariants as hard failures. + ### Absolute path restrictions References using absolute paths (e.g., `/tmp/file.yml`) are explicitly rejected with a `ValueError`. All reference paths must be relative to the source file's directory. If you absolutely must reference an absolute path, relative paths to symlinks can be used. Note that their target directories must be explicitly allowed to avoid permission errors (see the above section about "Path restriction and `allow_paths`"). diff --git a/tests/unit/test_reference.py b/tests/unit/test_reference.py index 99376cb..2cc476a 100644 --- a/tests/unit/test_reference.py +++ b/tests/unit/test_reference.py @@ -155,11 +155,73 @@ def test_allow_paths_load_yaml_with_references(stage_files): ) assert data["all"][0]["outside"] == "outside_value" - # Test with allow_paths that doesn't include the referenced file (should fail, !reference-all) - with pytest.raises(PermissionError): - load_yaml_with_references( - stg / "inner/with_all.yml", allow_paths=[stg / "some"] - ) + # Test with allow_paths that doesn't include the referenced file (!reference-all): + # disallowed files are silently omitted, so the result is an empty list. + data = load_yaml_with_references( + stg / "inner/with_all.yml", allow_paths=[stg / "some"] + ) + assert data["all"] == [] + + +def test_reference_all_empty_glob(stage_files): + """Test that !reference-all returns [] without error when glob matches no files.""" + files = { + "test.yml": "data: !reference-all { glob: ./nonexistent/*.yml }", + } + stg = stage_files(files) + data = load_yaml_with_references(stg / "test.yml") + assert data["data"] == [] + + +def test_reference_all_silently_omits_disallowed_paths(stage_files): + """Test that !reference-all silently omits files outside the allowed paths. + + Relative-path violations (glob matches beyond the allowed directory tree) must be + dropped *before* reading the file (security invariant) and must NOT trigger an + error; the result is simply the subset of paths that are allowed. + """ + files = { + # test.yml lives in sub/; globs into ../chapters/ which is outside sub/ + "sub/test.yml": "data: !reference-all { glob: ../chapters/*.yml }", + "chapters/file1.yml": "val: 1", + "chapters/file2.yml": "val: 2", + } + stg = stage_files(files) + + # allow_paths=[stg/"other"] => effective = [stg/"other", stg/"sub"]. + # Neither covers stg/"chapters", so all matches are silently omitted. + data = load_yaml_with_references(stg / "sub/test.yml", allow_paths=[stg / "other"]) + assert data["data"] == [] + + # When we explicitly allow chapters/, both files are included. + data = load_yaml_with_references( + stg / "sub/test.yml", allow_paths=[stg / "chapters"] + ) + assert len(data["data"]) == 2 + assert {"val": 1} in data["data"] + assert {"val": 2} in data["data"] + + +def test_reference_all_partial_disallow(stage_files): + """Test that !reference-all includes only the allowed subset of glob matches. + + When a glob expands to paths spanning multiple directories and only some of those + directories are in allow_paths, the disallowed paths are silently omitted while the + allowed paths are still included in the result. + """ + files = { + "sub/test.yml": "data: !reference-all { glob: ../*/file.yml }", + "allowed/file.yml": "kind: allowed", + "blocked/file.yml": "kind: blocked", + } + stg = stage_files(files) + + # Only stg/"allowed" is in allow_paths (plus the default stg/"sub"). + # stg/"blocked"/file.yml is not covered, so it is silently omitted. + data = load_yaml_with_references( + stg / "sub/test.yml", allow_paths=[stg / "allowed"] + ) + assert data["data"] == [{"kind": "allowed"}] @pytest.mark.parametrize( diff --git a/yaml_reference/__init__.py b/yaml_reference/__init__.py index 40771ee..3180d84 100644 --- a/yaml_reference/__init__.py +++ b/yaml_reference/__init__.py @@ -400,6 +400,32 @@ def _recursively_attribute_location_to_references(data: Any, base_path: Path): return data +def _is_path_allowed(path: Path, allow_paths: Sequence[Path]) -> bool: + """Check whether a resolved path is accessible given the allow_paths configuration. + + Unlike `_check_file_path`, this never raises; it returns `False` for paths that + do not exist, are not regular files, or fall outside every entry in *allow_paths*. + An empty *allow_paths* sequence means "no directory restrictions" (all existing files + are considered allowed). + + Args: + path: Resolved, absolute path to check. + allow_paths: List of allowed directory paths. + + Returns: + True if the path is an accessible file within an allowed directory (or no + restrictions are in place). False otherwise. + """ + if not path.exists() or not path.is_file(): + return False + if not allow_paths: + return True + for allow_path in allow_paths: + if path.is_relative_to(allow_path): + return True + return False + + def _check_and_track_path(path: Path, visited_paths: set[Path]) -> None: """ Check for circular reference and add path to visited set. @@ -481,12 +507,26 @@ def _recursively_resolve_references( elif isinstance(data, ReferenceAll): glob_results = Path(data.location).parent.glob(data.glob) abs_paths = [path.resolve() for path in glob_results] + + # Empty glob match -> silent omission, return empty list. if not abs_paths: - raise FileNotFoundError( - f'No files found matching glob pattern "{data.glob}" in directory "{Path(data.location).parent}"' - ) - abs_paths = sorted(abs_paths, key=lambda x: str(x)) + return [] + + # Precompute allowed paths sequence once to avoid repeated list() + # construction in the comprehension below. + allowed_paths_seq = list(allow_paths) + # Security invariant: filter out disallowed / nonexistent paths *before* + # opening any file. Relative-path violations are silently omitted here; + # absolute-path violations are caught earlier in ReferenceAll.__init__. + abs_paths = [p for p in abs_paths if _is_path_allowed(p, allowed_paths_seq)] + + # All matched paths were disallowed → silent omission, return empty list. + if not abs_paths: + return [] + + # Sort only the allowed paths to avoid sorting entries that will be dropped. + abs_paths = sorted(abs_paths, key=lambda x: str(x)) resolved_items = [] for path in abs_paths: # Check for circular reference and track path