Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ uv add yaml-reference
```

## Spec
![Spec Status](https://img.shields.io/badge/spec%20v0.2.6--3-passing-brightgreen?link=https%3A%2F%2Fgithub.com%2Fdsillman2000%2Fyaml-reference-specs%2Ftree%2Fv0.2.6-3)
![Spec Status](https://img.shields.io/badge/spec%20v0.2.6--4-passing-brightgreen?link=https%3A%2F%2Fgithub.com%2Fdsillman2000%2Fyaml-reference-specs%2Ftree%2Fv0.2.6-4)

This Python library implements the YAML specification for cross-file references and YAML composition in YAML files using tags `!reference`, `!reference-all`, `!flatten`, and `!merge` as defined in the [yaml-reference-specs project](https://github.com/dsillman2000/yaml-reference-specs).

Expand Down Expand Up @@ -260,6 +260,21 @@ yaml-reference compile input.yml --allow /allowed/path1 --allow /allowed/path2

Whether or not `allow_paths` is specified, the default behavior is to allow references to files in the same directory as the source YAML file (or subdirectories). "Back-navigating" out of a the root directory is not allowed (".." local references in a root YAML file). This provides a secure baseline to prevent unsafe access which is not explicitly allowed.

### Glob matching behavior for `!reference-all`

`!reference-all` applies **silent-omission semantics** when individual glob matches fall outside the allowed path set. Disallowed paths are filtered out *before* any file is opened (security invariant: disallowed file contents are never loaded into memory). The result is the subset of glob matches that are both reachable and allowed:

| Scenario | Behaviour | Exit |
|---|---|---|
| Glob matches zero files | Returns `[]` | `rc=0` |
| Some matched files are outside `allow_paths` | Disallowed files are silently dropped; remaining files are returned | `rc=0` |
| All matched files are outside `allow_paths` | Returns `[]` | `rc=0` |
| Glob pattern is absolute (starts with `/`) | Hard error – `ValueError` raised | `rc=1` |
| A matched file is the calling file (self-reference) | Hard error – circularity `ValueError` raised | `rc=1` |
| A matched file transitively references the caller | Hard error – circularity `ValueError` raised | `rc=1` |

This design keeps `!reference-all` resilient against partially-populated directory trees while still enforcing absolute-path and circularity invariants as hard failures.

### Absolute path restrictions

References using absolute paths (e.g., `/tmp/file.yml`) are explicitly rejected with a `ValueError`. All reference paths must be relative to the source file's directory. If you absolutely must reference an absolute path, relative paths to symlinks can be used. Note that their target directories must be explicitly allowed to avoid permission errors (see the above section about "Path restriction and `allow_paths`").
Expand Down
72 changes: 67 additions & 5 deletions tests/unit/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,73 @@ def test_allow_paths_load_yaml_with_references(stage_files):
)
assert data["all"][0]["outside"] == "outside_value"

# Test with allow_paths that doesn't include the referenced file (should fail, !reference-all)
with pytest.raises(PermissionError):
load_yaml_with_references(
stg / "inner/with_all.yml", allow_paths=[stg / "some"]
)
# Test with allow_paths that doesn't include the referenced file (!reference-all):
# disallowed files are silently omitted, so the result is an empty list.
data = load_yaml_with_references(
stg / "inner/with_all.yml", allow_paths=[stg / "some"]
)
assert data["all"] == []


def test_reference_all_empty_glob(stage_files):
"""Test that !reference-all returns [] without error when glob matches no files."""
files = {
"test.yml": "data: !reference-all { glob: ./nonexistent/*.yml }",
}
stg = stage_files(files)
data = load_yaml_with_references(stg / "test.yml")
assert data["data"] == []


def test_reference_all_silently_omits_disallowed_paths(stage_files):
"""Test that !reference-all silently omits files outside the allowed paths.

Relative-path violations (glob matches beyond the allowed directory tree) must be
dropped *before* reading the file (security invariant) and must NOT trigger an
error; the result is simply the subset of paths that are allowed.
"""
files = {
# test.yml lives in sub/; globs into ../chapters/ which is outside sub/
"sub/test.yml": "data: !reference-all { glob: ../chapters/*.yml }",
"chapters/file1.yml": "val: 1",
"chapters/file2.yml": "val: 2",
}
stg = stage_files(files)

# allow_paths=[stg/"other"] => effective = [stg/"other", stg/"sub"].
# Neither covers stg/"chapters", so all matches are silently omitted.
data = load_yaml_with_references(stg / "sub/test.yml", allow_paths=[stg / "other"])
assert data["data"] == []

# When we explicitly allow chapters/, both files are included.
data = load_yaml_with_references(
stg / "sub/test.yml", allow_paths=[stg / "chapters"]
)
assert len(data["data"]) == 2
assert {"val": 1} in data["data"]
assert {"val": 2} in data["data"]


def test_reference_all_partial_disallow(stage_files):
"""Test that !reference-all includes only the allowed subset of glob matches.

When a glob expands to paths spanning multiple directories and only some of those
directories are in allow_paths, the disallowed paths are silently omitted while the
allowed paths are still included in the result.
"""
files = {
"sub/test.yml": "data: !reference-all { glob: ../*/file.yml }",
"allowed/file.yml": "kind: allowed",
"blocked/file.yml": "kind: blocked",
}
stg = stage_files(files)

# Only stg/"allowed" is in allow_paths (plus the default stg/"sub").
# stg/"blocked"/file.yml is not covered, so it is silently omitted.
data = load_yaml_with_references(
stg / "sub/test.yml", allow_paths=[stg / "allowed"]
)
assert data["data"] == [{"kind": "allowed"}]


@pytest.mark.parametrize(
Expand Down
48 changes: 44 additions & 4 deletions yaml_reference/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,32 @@ def _recursively_attribute_location_to_references(data: Any, base_path: Path):
return data


def _is_path_allowed(path: Path, allow_paths: Sequence[Path]) -> bool:
"""Check whether a resolved path is accessible given the allow_paths configuration.

Unlike `_check_file_path`, this never raises; it returns `False` for paths that
do not exist, are not regular files, or fall outside every entry in *allow_paths*.
An empty *allow_paths* sequence means "no directory restrictions" (all existing files
are considered allowed).

Args:
path: Resolved, absolute path to check.
allow_paths: List of allowed directory paths.

Returns:
True if the path is an accessible file within an allowed directory (or no
restrictions are in place). False otherwise.
"""
if not path.exists() or not path.is_file():
return False
if not allow_paths:
return True
for allow_path in allow_paths:
if path.is_relative_to(allow_path):
return True
return False


def _check_and_track_path(path: Path, visited_paths: set[Path]) -> None:
"""
Check for circular reference and add path to visited set.
Expand Down Expand Up @@ -481,12 +507,26 @@ def _recursively_resolve_references(
elif isinstance(data, ReferenceAll):
glob_results = Path(data.location).parent.glob(data.glob)
abs_paths = [path.resolve() for path in glob_results]

# Empty glob match -> silent omission, return empty list.
if not abs_paths:
raise FileNotFoundError(
f'No files found matching glob pattern "{data.glob}" in directory "{Path(data.location).parent}"'
)
abs_paths = sorted(abs_paths, key=lambda x: str(x))
return []

# Precompute allowed paths sequence once to avoid repeated list()
# construction in the comprehension below.
allowed_paths_seq = list(allow_paths)

# Security invariant: filter out disallowed / nonexistent paths *before*
# opening any file. Relative-path violations are silently omitted here;
# absolute-path violations are caught earlier in ReferenceAll.__init__.
abs_paths = [p for p in abs_paths if _is_path_allowed(p, allowed_paths_seq)]

# All matched paths were disallowed → silent omission, return empty list.
if not abs_paths:
return []

# Sort only the allowed paths to avoid sorting entries that will be dropped.
abs_paths = sorted(abs_paths, key=lambda x: str(x))
resolved_items = []
for path in abs_paths:
# Check for circular reference and track path
Expand Down