Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

ARG TARGET=base
ARG BASE_IMAGE=ubuntu:22.04
ARG BASE_IMAGE_COLOCATED=us-docker.pkg.dev/cloud-tpu-v2-images/pathways-colocated-python/sidecar:2025_10_29-python_3.10-jax_0.6.2

FROM ${BASE_IMAGE} AS base

Expand Down Expand Up @@ -102,11 +103,39 @@ COPY pyproject.toml README.md /root/
RUN uv pip install -qq --prerelease=allow .[core,tpu] && uv cache clean
RUN if [ -n "$EXTRAS" ]; then uv pip install -qq .[$EXTRAS] && uv cache clean; fi
RUN if [ "$INSTALL_PATHWAYS_JAXLIB" = "true" ]; then \
uv pip install --prerelease=allow "jaxlib==0.5.3.dev20250918" \
uv pip install --prerelease=allow "jaxlib==0.6.2.dev20251021" \
--find-links https://storage.googleapis.com/axlearn-wheels/wheels.html; \
fi
COPY . .

################################################################################
# Colocated Python container spec. #
################################################################################

FROM ${BASE_IMAGE_COLOCATED} AS colocated-python

WORKDIR /app
COPY . .

# Install the additional user-provided dependencies, enforcing the base image's
# constraints file except for numpy and scipy, which are pinned separately below.
RUN \
# 1. Install user-provided dependencies with modified constraints
grep -v "^numpy" /opt/venv/server_constraints.txt | grep -v "^scipy" > /tmp/modified_constraints.txt && \
echo "--> Installing user-provided dependencies..." && \
uv pip install ".[core,gcp]" -c /tmp/modified_constraints.txt && \
\
# 2. Override numpy and scipy with specific versions
uv pip install numpy==2.1.1 scipy==1.15.3 && \
\
# 3. Verify that the colocated_python_cpu_client is present.
echo "--> Verifying JAX patch integrity..." && \
python -c "from jax._src.lib import _jax; _jax.colocated_python_cpu_client" && \
echo "--> JAX patch verification successful." && \
\
# 4. Clean the cache to keep the image slim.
uv cache clean

################################################################################
# GPU container spec. #
################################################################################
Expand Down
89 changes: 80 additions & 9 deletions axlearn/cloud/gcp/bundler.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
from axlearn.cloud.common.utils import canonicalize_to_list, to_bool
from axlearn.cloud.gcp.cloud_build import wait_for_cloud_build
from axlearn.cloud.gcp.config import gcp_settings
from axlearn.cloud.gcp.pathways_utils import _COLOCATED_PYTHON_SIDECAR_NAME
from axlearn.cloud.gcp.utils import common_flags
from axlearn.common.config import REQUIRED, Required, config_class, maybe_set_config

Expand Down Expand Up @@ -98,20 +99,46 @@ class ArtifactRegistryBundler(DockerBundler):

TYPE = "artifactregistry"

@config_class
class Config(DockerBundler.Config):
"""Configures ArtifactRegistryBundler.

Attributes:
enable_colocated_python: Applicable only to Pathways jobs. Whether to build a Colocated
Python sidecar image alongside the main image. The sidecar image name will be
"{main_image_name}-colocated-sidecar".
"""

enable_colocated_python: bool = False

@classmethod
def from_spec(cls, spec: list[str], *, fv: Optional[flags.FlagValues]) -> DockerBundler.Config:
cfg = super().from_spec(spec, fv=fv)
cfg: ArtifactRegistryBundler.Config = super().from_spec(spec, fv=fv)
cfg.repo = cfg.repo or gcp_settings("docker_repo", required=False, fv=fv)
cfg.dockerfile = cfg.dockerfile or gcp_settings("default_dockerfile", required=False, fv=fv)
cfg.enable_colocated_python = cfg.enable_colocated_python or gcp_settings(
"enable_colocated_python", required=False, fv=fv
)
return cfg

def _build_and_push(self, *args, **kwargs):
def _build_and_push(self, *args, image: str, **kwargs):
cfg = self.config
subprocess.run(
["gcloud", "auth", "configure-docker", registry_from_repo(cfg.repo)],
check=True,
)
return super()._build_and_push(*args, **kwargs)

if cfg.enable_colocated_python:
# Build Colocated Python sidecar image
_, tag = image.rsplit(":", maxsplit=1)
colocated_bundler = cfg.set(
image=_COLOCATED_PYTHON_SIDECAR_NAME,
target="colocated-python",
enable_colocated_python=False,
).instantiate()
colocated_bundler.bundle(tag=tag)

return super()._build_and_push(*args, image=image, **kwargs)


@register_bundler
Expand All @@ -129,6 +156,9 @@ class Config(BaseDockerBundler.Config):
from flags.
is_async: Whether to build asynchronously. If True, callers should invoke
`wait_until_finished()` to wait for bundling to complete.
enable_colocated_python: Applicable only to Pathways jobs. Whether to build a Colocated
Python sidecar image alongside the main image. The sidecar image name will be
"{main_image_name}-colocated-sidecar".
"""

# GCP project.
Expand All @@ -138,6 +168,7 @@ class Config(BaseDockerBundler.Config):
# If provided, should be the identifier of a private worker pool.
# See: https://cloud.google.com/build/docs/private-pools/private-pools-overview
private_worker_pool: Optional[str] = None
enable_colocated_python: bool = False

@classmethod
def from_spec(
Expand All @@ -148,6 +179,9 @@ def from_spec(
cfg.repo = cfg.repo or gcp_settings("docker_repo", required=False, fv=fv)
cfg.dockerfile = cfg.dockerfile or gcp_settings("default_dockerfile", required=False, fv=fv)
cfg.is_async = to_bool(cfg.is_async)
cfg.enable_colocated_python = cfg.enable_colocated_python or gcp_settings(
"enable_colocated_python", required=False, fv=fv
)
return cfg

# pylint: disable-next=no-self-use,unused-argument
Expand Down Expand Up @@ -175,9 +209,14 @@ def _build_and_push(
)
image_path, image_tag = image.rsplit(":", maxsplit=1)
latest_tag = f"{image_path}:latest"
cloudbuild_yaml = f"""
steps:
- name: "gcr.io/cloud-builders/docker"

# Build steps - start with main image
build_steps = []
images_list = [f'"{image}"', f'"{latest_tag}"']

# Main image build step
build_steps.append(
f"""- name: "gcr.io/cloud-builders/docker"
args: [
"build",
"-f", "{os.path.relpath(dockerfile, context)}",
Expand All @@ -193,11 +232,43 @@ def _build_and_push(
"."
]
env:
- "DOCKER_BUILDKIT=1"
- "DOCKER_BUILDKIT=1\""""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need to branch the logic in the bundler.py file?

I think we are just adding some extra dependencies for colocated Python. We can use an env var in the Dockerfile to control that, like this: https://github.com/apple/axlearn/blob/main/Dockerfile#L104

It can be passed in via something like --bundler_spec=INSTALL_PATHWAYS_JAXLIB=true \

Copy link
Author

@rohitc33 rohitc33 Nov 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To enable colocated python, an extra image for the colocated python sidecar running on each worker pod needs to be built (so 2 images get built). This implementation works the same for the user - to enable it you just need to pass in --bundler_spec=enable_colocated_python=True.

Copy link
Contributor

@muyangyuapple muyangyuapple Nov 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the head pod and the sidecar on the worker share the same image?

If there are two images to build, we should define another bundler called "sidecar_bundler" here. Adding logic to the bundler that is used only by a subset of jobs will make the bundler hard to maintain in the long run.

)

# Add colocated image build step if required
if cfg.enable_colocated_python:
colocated_image_path = f"{cfg.repo}/{_COLOCATED_PYTHON_SIDECAR_NAME}"
colocated_image = f"{colocated_image_path}:{image_tag}"
colocated_latest_image = f"{colocated_image_path}:latest"

build_steps.append(
f"""- name: "gcr.io/cloud-builders/docker"
args: [
"build",
"-f", "{os.path.relpath(dockerfile, context)}",
"-t", "{colocated_image}",
"-t", "{colocated_latest_image}",
"--target", "colocated-python",
"--cache-from", "{colocated_image}",
"--cache-from", "{colocated_latest_image}",
{cache_from}
{build_platform}
{build_args}
{labels}
"."
]
env:
- "DOCKER_BUILDKIT=1\""""
)

images_list.extend([f'"{colocated_image}"', f'"{colocated_latest_image}"'])

cloudbuild_yaml = f"""
steps:
{chr(10).join(build_steps)}
timeout: 3600s
images:
- "{image}"
- "{latest_tag}"
{chr(10).join([f"- {img}" for img in images_list])}
tags: [{image_tag}]
options:
logging: CLOUD_LOGGING_ONLY
Expand Down
Loading