Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions src/cloudai/_core/installables.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

from pydantic import BaseModel, ConfigDict

from cloudai._core.types import DockerURL


class Installable(ABC):
"""Installable object."""
Expand Down Expand Up @@ -73,11 +75,20 @@ def cache_filename(self) -> str:
return f"{img_name}__{tag}.sqsh"

@property
def installed_path(self) -> Union[str, Path]:
"""Return the cached path or URL of the docker image."""
def installed_path(self) -> Union[DockerURL, Path]:
"""
Return the cached path or URL of the docker image.

Returns:
Path: Local .sqsh file path (when cached locally)
DockerURL: Registry URL (when not cached, pyxis pulls from registry)

Downstream code should check isinstance(result, Path) vs isinstance(result, DockerURL)
to determine how to handle the value. Do NOT call Path().absolute() on DockerURL.
"""
if self._installed_path:
return self._installed_path.absolute() if isinstance(self._installed_path, Path) else self._installed_path
return self.url
return DockerURL(self.url)

@installed_path.setter
def installed_path(self, value: Union[str, Path]) -> None:
Expand Down
34 changes: 34 additions & 0 deletions src/cloudai/_core/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Custom types for CloudAI."""


class DockerURL(str):
    """
    String subtype that tags a value as a docker registry reference.

    A plain ``str`` cannot tell a registry URL apart from a filesystem
    location, so this class exists purely as a marker: it adds no methods
    and no state on top of ``str``. Callers are expected to branch on
    ``isinstance(value, DockerURL)`` (registry URL, handed to pyxis as-is)
    versus ``isinstance(value, Path)`` (local .sqsh file).

    Examples:
        - "nvcr.io/nvidian/nemo:26.04.rc2"
        - "docker://nvcr.io/nvidian/nemo:26.04.rc2"
        - "docker.io/library/ubuntu:22.04"
    """
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,10 @@ def _installed_container_path() -> str:
"(docker_image.installed_path is empty). Please run `cloudai install` first, or provide "
"a valid local .sqsh path in cmd_args.container_image."
)
return str(Path(installed).absolute())
if isinstance(installed, Path):
return str(installed.absolute())
# DockerURL - pass directly to pyxis (it will pull from registry)
return str(installed)

ci = str(args.container_image).strip()
if ci.startswith("/") or ci.startswith("."):
Expand Down
119 changes: 119 additions & 0 deletions tests/test_cache_docker_images_locally_false.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test case: cache_docker_images_locally = false with DockerURL type.

When cache_docker_images_locally=false in system TOML:
1. cloudai install doesn't set DockerImage._installed_path
2. DockerImage.installed_path returns DockerURL(self.url)
3. slurm_command_gen_strategy.py checks isinstance(installed, Path) vs DockerURL
4. DockerURL is passed directly to pyxis (it pulls from registry)

The fix uses a custom DockerURL type to distinguish URLs from Paths.
"""

from pathlib import Path

from cloudai._core.installables import DockerImage
from cloudai._core.types import DockerURL


class TestCacheDockerImagesLocallyFalse:
"""Test cases for cache_docker_images_locally=false with DockerURL type."""

def test_installed_path_returns_docker_url_when_not_cached(self):
    """Without a cached image, installed_path falls back to a DockerURL built from the image URL."""
    docker_url = "nvcr.io/nvidian/nemo:26.04.rc2"
    img = DockerImage(url=docker_url)

    # Precondition: nothing has been installed/cached for this image yet.
    assert img._installed_path is None

    result = img.installed_path

    # The marker type matters, not just the text: callers branch on isinstance().
    # No custom messages here; pytest's assertion introspection already reports
    # the actual type/value on failure.
    assert isinstance(result, DockerURL)
    assert result == docker_url
Comment on lines +43 to +49
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Drop explicit assertion messages in these pytest asserts.

Pytest already renders clear actual/expected introspection, so these custom messages add noise.

🧹 Proposed cleanup
-        assert img._installed_path is None, "Precondition: _installed_path should be None"
+        assert img._installed_path is None
@@
-        assert isinstance(result, DockerURL), f"Expected DockerURL, got {type(result)}"
+        assert isinstance(result, DockerURL)
@@
-        assert isinstance(result, Path), f"Expected Path, got {type(result)}"
+        assert isinstance(result, Path)
Based on learnings: In pytest, rely on assertion introspection for failure details and avoid including explicit values in the assertion message.

Also applies to: 62-63

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/test_cache_docker_images_locally_false.py` around lines 43 - 49, Remove
the explicit assertion message strings from the pytest asserts checking
img._installed_path, isinstance(result, DockerURL) and equality of result ==
docker_url (and the similar asserts around lines 62-63); keep the assertions
themselves (e.g., assert img._installed_path is None, assert isinstance(result,
DockerURL), assert result == docker_url) so pytest can use its built-in
introspection rather than custom message text.


def test_installed_path_returns_path_when_cached(self):
    """Once a local path has been assigned, installed_path returns it as an absolute Path."""
    docker_url = "nvcr.io/nvidian/nemo:26.04.rc2"
    sqsh_path = Path("/install/nvcr.io_nvidian__nemo__26.04.rc2.sqsh")

    img = DockerImage(url=docker_url)
    img.installed_path = sqsh_path

    result = img.installed_path

    # A cached image must come back as a Path (not a DockerURL). No custom
    # message: pytest's assertion introspection reports the actual type.
    assert isinstance(result, Path)
    assert result == sqsh_path.absolute()

def test_docker_url_is_subclass_of_str(self):
    """A DockerURL instance is still a str and supports ordinary string operations."""
    registry_ref = DockerURL("nvcr.io/nvidian/nemo:26.04.rc2")

    # Both the marker type and the str base must match.
    assert isinstance(registry_ref, DockerURL)
    assert isinstance(registry_ref, str)

    # Inherited str behaviour is available on the subclass.
    assert registry_ref.startswith("nvcr.io")
    assert "nvcr.io" in registry_ref

def test_type_check_distinguishes_url_from_path(self):
    """The uncached and cached results of installed_path are separable with isinstance()."""
    img = DockerImage(url="nvcr.io/nvidian/nemo:26.04.rc2")

    # Before any local path is assigned: DockerURL, and not a Path.
    before_caching = img.installed_path
    assert not isinstance(before_caching, Path)
    assert isinstance(before_caching, DockerURL)

    # After assigning a local path: Path, and not a DockerURL.
    img.installed_path = Path("/install/image.sqsh")
    after_caching = img.installed_path
    assert not isinstance(after_caching, DockerURL)
    assert isinstance(after_caching, Path)

def test_container_path_resolution_logic(self):
    """
    Check the expected container path resolution rules.

    The rules slurm_command_gen_strategy.py is meant to follow:
    - Path: make it absolute, then convert to str
    - DockerURL: convert to str unchanged

    NOTE: the helper below is a local copy of those rules; this test does not
    invoke the production command-generation code.
    """
    def resolve_container_path(installed) -> str:
        # Only real filesystem paths are made absolute; anything else
        # (i.e. a DockerURL) goes to pyxis untouched.
        return str(installed.absolute()) if isinstance(installed, Path) else str(installed)

    # Path case
    resolved_path = resolve_container_path(Path("/install/image.sqsh"))
    assert resolved_path == "/install/image.sqsh"

    # DockerURL case
    resolved_url = resolve_container_path(DockerURL("nvcr.io/nvidian/nemo:26.04.rc2"))
    assert resolved_url == "nvcr.io/nvidian/nemo:26.04.rc2"
    assert not resolved_url.startswith("/")  # Not mangled into a local path

Comment on lines +90 to +112
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

This test validates a local reimplementation, not the production path resolver.

resolve_container_path() at Line 98 duplicates intended behavior, so this test can stay green while src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py regresses.

Please exercise the real command-generation path (or extract the resolver into a shared function and test that directly) so this test guards the actual integration behavior.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/test_cache_docker_images_locally_false.py` around lines 90 - 112, The
test currently defines a local helper resolve_container_path which duplicates
production logic and fails to protect
src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py from
regressions; update the test to exercise the real resolver instead of the local
stub by either (A) importing and calling the production resolver used in
slurm_command_gen_strategy.py (remove the local resolve_container_path and call
that resolver directly) or (B) refactoring the resolver into a shared function
(e.g., a new util exported from slurm_command_gen_strategy or a new module) and
have the test import and assert on that shared function; ensure the test still
covers both DockerURL and Path cases (use DockerURL(...) and Path(...)) so it
fails if the production resolver regresses.

def test_cache_filename_generation(self):
    """cache_filename for a known docker URL matches the expected .sqsh file name."""
    img = DockerImage(url="nvcr.io/nvidian/nemo:26.04.rc2")

    assert img.cache_filename == "nvcr.io_nvidian__nemo__26.04.rc2.sqsh"