From 2418d3dcd1b444d1a1b8fd7c384206b788cff076 Mon Sep 17 00:00:00 2001 From: Rutayan Patro Date: Wed, 25 Mar 2026 22:00:13 -0400 Subject: [PATCH 1/2] feat: add DockerURL type to fix cache_docker_images_locally=false When cache_docker_images_locally=false, DockerImage.installed_path was returning the raw URL string which downstream code passed through Path().absolute(), creating bogus paths like "/cwd/nvcr.io/...". This fix: - Adds DockerURL(str) type in _core/types.py to distinguish URLs from paths - Updates installed_path property to return DockerURL when not cached - Updates slurm_command_gen_strategy.py to check isinstance(Path) vs DockerURL and only call .absolute() on actual Path objects Now pyxis correctly receives the URL and can pull from registry. Fixes: cache_docker_images_locally=false causing container not found errors Made-with: Cursor --- src/cloudai/_core/installables.py | 17 ++++++++-- src/cloudai/_core/types.py | 34 +++++++++++++++++++ .../slurm_command_gen_strategy.py | 5 ++- 3 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 src/cloudai/_core/types.py diff --git a/src/cloudai/_core/installables.py b/src/cloudai/_core/installables.py index d6db18589..158659cb2 100644 --- a/src/cloudai/_core/installables.py +++ b/src/cloudai/_core/installables.py @@ -21,6 +21,8 @@ from pydantic import BaseModel, ConfigDict +from cloudai._core.types import DockerURL + class Installable(ABC): """Installable object.""" @@ -73,11 +75,20 @@ def cache_filename(self) -> str: return f"{img_name}__{tag}.sqsh" @property - def installed_path(self) -> Union[str, Path]: - """Return the cached path or URL of the docker image.""" + def installed_path(self) -> Union[DockerURL, Path]: + """ + Return the cached path or URL of the docker image. + + Returns: + Path: Local .sqsh file path (when cached locally) + DockerURL: Registry URL (when not cached, pyxis pulls from registry) + + Downstream code should check isinstance(result, Path) vs isinstance(result, DockerURL) + to determine how to handle the value. Do NOT call Path().absolute() on DockerURL. + """ if self._installed_path: return self._installed_path.absolute() if isinstance(self._installed_path, Path) else self._installed_path - return self.url + return DockerURL(self.url) @installed_path.setter def installed_path(self, value: Union[str, Path]) -> None: diff --git a/src/cloudai/_core/types.py b/src/cloudai/_core/types.py new file mode 100644 index 000000000..d9e8bd2a0 --- /dev/null +++ b/src/cloudai/_core/types.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Custom types for CloudAI.""" + + +class DockerURL(str): + """ + Docker registry URL - not a filesystem path. + + This type distinguishes docker registry URLs from local filesystem paths. + Downstream code should check isinstance(value, DockerURL) to determine + if the value is a URL (pass directly to pyxis) vs a Path (local .sqsh file). + + Examples: + - "nvcr.io/nvidian/nemo:26.04.rc2" + - "docker://nvcr.io/nvidian/nemo:26.04.rc2" + - "docker.io/library/ubuntu:22.04" + """ + + pass diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index abb0a1330..fa12ae353 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -269,7 +269,10 @@ def _installed_container_path() -> str: "(docker_image.installed_path is empty). Please run `cloudai install` first, or provide " "a valid local .sqsh path in cmd_args.container_image." ) - return str(Path(installed).absolute()) + if isinstance(installed, Path): + return str(installed.absolute()) + # DockerURL - pass directly to pyxis (it will pull from registry) + return str(installed) ci = str(args.container_image).strip() if ci.startswith("/") or ci.startswith("."): From cecc3ae58fdfc9ed713d34debde81a945b4aa857 Mon Sep 17 00:00:00 2001 From: Rutayan Patro Date: Wed, 25 Mar 2026 22:00:50 -0400 Subject: [PATCH 2/2] test: add tests for DockerURL type and cache_docker_images_locally=false Tests verify: - installed_path returns DockerURL when _installed_path is not set - installed_path returns Path when _installed_path is set - DockerURL is a str subclass for compatibility - isinstance() can distinguish DockerURL from Path - Container path resolution logic handles both types correctly - cache_filename generation works correctly Made-with: Cursor --- .../test_cache_docker_images_locally_false.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tests/test_cache_docker_images_locally_false.py diff --git a/tests/test_cache_docker_images_locally_false.py b/tests/test_cache_docker_images_locally_false.py new file mode 100644 index 000000000..02f4f66fe --- /dev/null +++ b/tests/test_cache_docker_images_locally_false.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test case: cache_docker_images_locally = false with DockerURL type. + +When cache_docker_images_locally=false in system TOML: +1. cloudai install doesn't set DockerImage._installed_path +2. DockerImage.installed_path returns DockerURL(self.url) +3. slurm_command_gen_strategy.py checks isinstance(installed, Path) vs DockerURL +4. DockerURL is passed directly to pyxis (it pulls from registry) + +The fix uses a custom DockerURL type to distinguish URLs from Paths. +""" + +from pathlib import Path + +from cloudai._core.installables import DockerImage +from cloudai._core.types import DockerURL + + +class TestCacheDockerImagesLocallyFalse: + """Test cases for cache_docker_images_locally=false with DockerURL type.""" + + def test_installed_path_returns_docker_url_when_not_cached(self): + """When _installed_path is None, installed_path returns a DockerURL.""" + docker_url = "nvcr.io/nvidian/nemo:26.04.rc2" + img = DockerImage(url=docker_url) + + assert img._installed_path is None, "Precondition: _installed_path should be None" + + result = img.installed_path + + # Returns DockerURL type (not plain str) + assert isinstance(result, DockerURL), f"Expected DockerURL, got {type(result)}" + assert result == docker_url + + def test_installed_path_returns_path_when_cached(self): + """When _installed_path is set, installed_path returns a Path.""" + docker_url = "nvcr.io/nvidian/nemo:26.04.rc2" + sqsh_path = Path("/install/nvcr.io_nvidian__nemo__26.04.rc2.sqsh") + + img = DockerImage(url=docker_url) + img.installed_path = sqsh_path + + result = img.installed_path + + # Returns Path type (not DockerURL) + assert isinstance(result, Path), f"Expected Path, got {type(result)}" + assert result == sqsh_path.absolute() + + def test_docker_url_is_subclass_of_str(self): + """DockerURL is a str subclass, so it works with string operations.""" + url = DockerURL("nvcr.io/nvidian/nemo:26.04.rc2") + + assert isinstance(url, str) + assert isinstance(url, DockerURL) + assert "nvcr.io" in url + assert url.startswith("nvcr.io") + + def test_type_check_distinguishes_url_from_path(self): + """isinstance() can distinguish DockerURL from Path.""" + docker_url = "nvcr.io/nvidian/nemo:26.04.rc2" + img = DockerImage(url=docker_url) + + # Not cached - returns DockerURL + result_uncached = img.installed_path + assert isinstance(result_uncached, DockerURL) + assert not isinstance(result_uncached, Path) + + # Cached - returns Path + img.installed_path = Path("/install/image.sqsh") + result_cached = img.installed_path + assert isinstance(result_cached, Path) + assert not isinstance(result_cached, DockerURL) + + def test_container_path_resolution_logic(self): + """ + Test the correct container path resolution logic. + + This is what slurm_command_gen_strategy.py should do: + - Path: call .absolute() and convert to str + - DockerURL: pass directly as str + """ + def resolve_container_path(installed) -> str: + if isinstance(installed, Path): + return str(installed.absolute()) + # DockerURL - pass directly to pyxis + return str(installed) + + # DockerURL case + url_result = resolve_container_path(DockerURL("nvcr.io/nvidian/nemo:26.04.rc2")) + assert url_result == "nvcr.io/nvidian/nemo:26.04.rc2" + assert not url_result.startswith("/") # Not mangled into a local path + + # Path case + path_result = resolve_container_path(Path("/install/image.sqsh")) + assert path_result == "/install/image.sqsh" + + def test_cache_filename_generation(self): + """Verify cache filename is correctly generated from docker URL.""" + docker_url = "nvcr.io/nvidian/nemo:26.04.rc2" + img = DockerImage(url=docker_url) + + expected = "nvcr.io_nvidian__nemo__26.04.rc2.sqsh" + assert img.cache_filename == expected