From 8d2d0e4b1955c3e21bf35277a16339c2ea38f4b4 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Sat, 25 Oct 2025 19:02:59 -0700 Subject: [PATCH 1/8] Fix deployment on arm64 architecture Fix service build and deployment on arm64 architecture. --- contrib/kubespray/script/environment.sh | 4 +- .../k8s-host-device-plugin.k8s.dockerfile | 6 +++ .../k8s-nvidia-device-plugin.k8s.dockerfile | 4 ++ .../k8s-rdma-shared-dev-plugin.k8s.dockerfile | 4 ++ .../k8s-rocm-device-plugin.k8s.dockerfile | 6 +++ ...lugin.yaml => device-plugin.yaml.template} | 7 +-- src/device-plugin/deploy/service.yaml | 1 + src/device-plugin/deploy/start.sh.template | 32 +++++++++---- .../deploy/frameworkcontroller.yaml.template | 2 + .../build/hivedscheduler.common.dockerfile | 47 ------------------- .../build/hivedscheduler.k8s.dockerfile | 28 +++++++++++ .../build/kube-scheduler.k8s.dockerfile | 4 ++ .../deploy/hivedscheduler.yaml.template | 4 ++ 13 files changed, 87 insertions(+), 62 deletions(-) create mode 100644 src/device-plugin/build/k8s-host-device-plugin.k8s.dockerfile create mode 100644 src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile create mode 100644 src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile create mode 100644 src/device-plugin/build/k8s-rocm-device-plugin.k8s.dockerfile rename src/device-plugin/deploy/{device-plugin.yaml => device-plugin.yaml.template} (82%) delete mode 100644 src/hivedscheduler/build/hivedscheduler.common.dockerfile create mode 100644 src/hivedscheduler/build/hivedscheduler.k8s.dockerfile create mode 100644 src/hivedscheduler/build/kube-scheduler.k8s.dockerfile diff --git a/contrib/kubespray/script/environment.sh b/contrib/kubespray/script/environment.sh index fb7611fa..75d8ca38 100644 --- a/contrib/kubespray/script/environment.sh +++ b/contrib/kubespray/script/environment.sh @@ -51,5 +51,5 @@ sudo python3 -m pip install -r ${HOME}/pai-deploy/kubespray/requirements.txt # workaround python3-apt issue SOABI=$(python3 -c 'import 
sysconfig; print(sysconfig.get_config_var("SOABI"))') -sudo ln -s /usr/lib/python3/dist-packages/apt_inst.${SOABI}.so /usr/lib/python3/dist-packages/apt_inst.so -sudo ln -s /usr/lib/python3/dist-packages/apt_pkg.${SOABI}.so /usr/lib/python3/dist-packages/apt_pkg.so +sudo ln -sf /usr/lib/python3/dist-packages/apt_inst.${SOABI}.so /usr/lib/python3/dist-packages/apt_inst.so +sudo ln -sf /usr/lib/python3/dist-packages/apt_pkg.${SOABI}.so /usr/lib/python3/dist-packages/apt_pkg.so diff --git a/src/device-plugin/build/k8s-host-device-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-host-device-plugin.k8s.dockerfile new file mode 100644 index 00000000..73cd0024 --- /dev/null +++ b/src/device-plugin/build/k8s-host-device-plugin.k8s.dockerfile @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +FROM everpeace/k8s-host-device-plugin + +#TODO: add arm64 image diff --git a/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile new file mode 100644 index 00000000..2b4d9344 --- /dev/null +++ b/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +FROM nvcr.io/nvidia/k8s-device-plugin:v0.15.0 diff --git a/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile new file mode 100644 index 00000000..723e361c --- /dev/null +++ b/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +FROM ghcr.io/mellanox/k8s-rdma-shared-dev-plugin:v1.5.3 diff --git a/src/device-plugin/build/k8s-rocm-device-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-rocm-device-plugin.k8s.dockerfile new file mode 100644 index 00000000..367e11c1 --- /dev/null +++ b/src/device-plugin/build/k8s-rocm-device-plugin.k8s.dockerfile @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +FROM rocm/k8s-device-plugin + +# no arm64 support diff --git a/src/device-plugin/deploy/device-plugin.yaml b/src/device-plugin/deploy/device-plugin.yaml.template similarity index 82% rename from src/device-plugin/deploy/device-plugin.yaml rename to src/device-plugin/deploy/device-plugin.yaml.template index edad5179..05f1d877 100644 --- a/src/device-plugin/deploy/device-plugin.yaml +++ b/src/device-plugin/deploy/device-plugin.yaml.template @@ -29,16 +29,15 @@ spec: name: k8s-host-device-plugin-ds template: metadata: - annotations: - scheduler.alpha.kubernetes.io/critical-pod: "" labels: name: k8s-host-device-plugin-ds spec: + priorityClassName: pai-daemon-priority tolerations: - key: CriticalAddonsOnly operator: Exists containers: - - image: luciaopenai.azurecr.io/luciaopenai/k8s-host-device-plugin:latest + - image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}k8s-host-device-plugin:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }} name: k8s-host-device-plugin-ctr securityContext: privileged: true @@ -62,3 +61,5 @@ spec: items: - key: config.json path: config.json + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} diff --git a/src/device-plugin/deploy/service.yaml b/src/device-plugin/deploy/service.yaml index 8babf14d..41c2c576 100644 --- a/src/device-plugin/deploy/service.yaml +++ b/src/device-plugin/deploy/service.yaml @@ -25,6 +25,7 @@ prerequisite: template-list: - start.sh - delete.sh + - device-plugin.yaml start-script: start.sh stop-script: stop.sh diff --git 
a/src/device-plugin/deploy/start.sh.template b/src/device-plugin/deploy/start.sh.template index d8351ad4..91b8b598 100644 --- a/src/device-plugin/deploy/start.sh.template +++ b/src/device-plugin/deploy/start.sh.template @@ -29,15 +29,19 @@ pushd $(dirname "$0") > /dev/null # Begin: NVIDIA GPU device plugin {% if 'nvidia.com/gpu' in cluster_cfg['device-plugin']['devices'] %} -curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml \ -| sed 's|nvcr.io/nvidia/k8s-device-plugin|luciaopenai.azurecr.io/luciaopenai/nvidia/k8s-device-plugin|' \ +{ curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml \ +| sed 's|nvcr.io/nvidia/k8s-device-plugin:v0.15.0|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-nvidia-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \ | sed -E '/^[[:space:]]*allowPrivilegeEscalation: false/ { h s/^([[:space:]]*)allowPrivilegeEscalation: false.*$/\1privileged: false/ G s/(^[[:space:]]*allowPrivilegeEscalation: false.*)\n([[:space:]]*privileged: false)/\1\n\2/ -}' \ - | kubectl apply --overwrite=true -f - || exit $? +}'; + cat <<'YAML' + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} +YAML +} | kubectl apply --overwrite=true -f - || exit $? {% endif %} # End: NVIDIA GPU device plugin @@ -45,9 +49,13 @@ curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deplo # Begin: AMD GPU device plugin {% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %} -curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \ -| sed 's|rocm/k8s-device-plugin|luciaopenai.azurecr.io/luciaopenai/rocm/k8s-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \ - | kubectl apply --overwrite=true -f - || exit $? 
+{ curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \ +| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|'; + cat <<'YAML' + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} +YAML +} | kubectl apply --overwrite=true -f - || exit $? {% endif %} # End: AMD GPU device plugin @@ -75,9 +83,13 @@ kubectl apply --overwrite=true -f device-plugin.yaml || exit $? {% if 'rdma/hca' in cluster_cfg['device-plugin']['devices'] %} kubectl apply --overwrite=true -f rdma-devices.yaml || exit $? -curl -s https://raw.githubusercontent.com/Mellanox/k8s-rdma-shared-dev-plugin/v1.4.0/deployment/k8s/base/daemonset.yaml \ - | sed 's|ghcr.io/mellanox/k8s-rdma-shared-dev-plugin|luciaopenai.azurecr.io/luciaopenai/k8s-rdma-shared-dev-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \ - | kubectl apply --overwrite=true -f - || exit $? +{ curl -s https://raw.githubusercontent.com/Mellanox/k8s-rdma-shared-dev-plugin/v1.4.0/deployment/k8s/base/daemonset.yaml \ +| sed 's|ghcr.io/mellanox/k8s-rdma-shared-dev-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rdma-shared-dev-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|'; + cat <<'YAML' + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} +YAML +} | kubectl apply --overwrite=true -f - || exit $? 
{% endif %} diff --git a/src/frameworkcontroller/deploy/frameworkcontroller.yaml.template b/src/frameworkcontroller/deploy/frameworkcontroller.yaml.template index caff4a62..b239af29 100644 --- a/src/frameworkcontroller/deploy/frameworkcontroller.yaml.template +++ b/src/frameworkcontroller/deploy/frameworkcontroller.yaml.template @@ -50,3 +50,5 @@ spec: - name: frameworkcontroller-config configMap: name: frameworkcontroller-config + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} diff --git a/src/hivedscheduler/build/hivedscheduler.common.dockerfile b/src/hivedscheduler/build/hivedscheduler.common.dockerfile deleted file mode 100644 index f109693f..00000000 --- a/src/hivedscheduler/build/hivedscheduler.common.dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# MIT License -# -# Copyright (c) Microsoft Corporation. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE - -FROM golang:1.24.3-alpine3.21 as builder - -ARG TEST=false -ENV GOPATH=/go -ENV PROJECT_DIR=/src -ENV INSTALL_DIR=/opt/hivedscheduler/hivedscheduler - -RUN apk update && apk add --no-cache bash -RUN mkdir -p ${PROJECT_DIR} ${INSTALL_DIR} -COPY src ${PROJECT_DIR} -RUN if [ ${TEST} == "true" ]; \ - then ${PROJECT_DIR}/build/hivedscheduler/go-build.sh test; \ - else ${PROJECT_DIR}/build/hivedscheduler/go-build.sh; fi && \ - mv ${PROJECT_DIR}/dist/hivedscheduler/* ${INSTALL_DIR} - - -FROM alpine:3.21 - -ENV INSTALL_DIR=/opt/hivedscheduler/hivedscheduler - -RUN apk update && apk add --no-cache bash -COPY --from=builder ${INSTALL_DIR} ${INSTALL_DIR} -WORKDIR ${INSTALL_DIR} - -ENTRYPOINT ["./start.sh"] diff --git a/src/hivedscheduler/build/hivedscheduler.k8s.dockerfile b/src/hivedscheduler/build/hivedscheduler.k8s.dockerfile new file mode 100644 index 00000000..c903fbd7 --- /dev/null +++ b/src/hivedscheduler/build/hivedscheduler.k8s.dockerfile @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +FROM golang:1.24.3-alpine3.21 AS builder + +ARG TEST=false +ENV GOPATH=/go +ENV PROJECT_DIR=/src +ENV INSTALL_DIR=/opt/hivedscheduler/hivedscheduler + +RUN apk update && apk add --no-cache bash +RUN mkdir -p ${PROJECT_DIR} ${INSTALL_DIR} +COPY src ${PROJECT_DIR} +RUN if [ ${TEST} == "true" ]; \ + then ${PROJECT_DIR}/build/hivedscheduler/go-build.sh test; \ + else ${PROJECT_DIR}/build/hivedscheduler/go-build.sh; fi && \ + mv ${PROJECT_DIR}/dist/hivedscheduler/* ${INSTALL_DIR} + + +FROM alpine:3.21 + +ENV INSTALL_DIR=/opt/hivedscheduler/hivedscheduler + +RUN apk update && apk add --no-cache bash +COPY --from=builder ${INSTALL_DIR} ${INSTALL_DIR} +WORKDIR ${INSTALL_DIR} + +ENTRYPOINT ["./start.sh"] diff --git a/src/hivedscheduler/build/kube-scheduler.k8s.dockerfile b/src/hivedscheduler/build/kube-scheduler.k8s.dockerfile new file mode 100644 index 00000000..d0ac5901 --- /dev/null +++ b/src/hivedscheduler/build/kube-scheduler.k8s.dockerfile @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +FROM registry.k8s.io/kube-scheduler:v1.28.9 diff --git a/src/hivedscheduler/deploy/hivedscheduler.yaml.template b/src/hivedscheduler/deploy/hivedscheduler.yaml.template index ab4d992a..3f4fae95 100644 --- a/src/hivedscheduler/deploy/hivedscheduler.yaml.template +++ b/src/hivedscheduler/deploy/hivedscheduler.yaml.template @@ -53,6 +53,8 @@ spec: - name: hivedscheduler-config configMap: name: hivedscheduler-config + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} {%- for vc in cluster_cfg['hivedscheduler']['structured-config']['virtualClusters'] %} --- @@ -96,4 +98,6 @@ spec: - name: hivedscheduler-config configMap: name: hivedscheduler-config + imagePullSecrets: + - name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }} {%- endfor %} From 52b0d1f6cb0d7a722bd2c1c2734e3e3eb4b04a04 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 27 Oct 2025 15:21:35 -0700 Subject: [PATCH 2/8] Update Update. --- src/grafana/build/grafana.common.dockerfile | 3 ++- .../build/job-exporter.common.dockerfile | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/grafana/build/grafana.common.dockerfile b/src/grafana/build/grafana.common.dockerfile index 930e9a5b..812dedec 100644 --- a/src/grafana/build/grafana.common.dockerfile +++ b/src/grafana/build/grafana.common.dockerfile @@ -18,6 +18,7 @@ FROM ubuntu:22.04 +ARG TARGETARCH ENV \ GRAFANA_VERSION=10.4.18+security~01 \ GF_PLUGIN_DIR=/grafana-plugins \ @@ -29,7 +30,7 @@ ENV \ RUN \ apt-get update && \ apt-get -y --force-yes --no-install-recommends install libfontconfig wget ca-certificates adduser libfontconfig1 musl curl jq && \ - wget -O /tmp/grafana.deb https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_amd64.deb && \ + wget -O /tmp/grafana.deb https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${TARGETARCH}.deb && \ dpkg -i /tmp/grafana.deb && \ rm -f /tmp/grafana.deb && \ ### branding && \ diff --git 
a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index db4983ec..267a0c90 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -17,6 +17,8 @@ FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 + +ARG TARGETARCH # Register the ROCM package repository, and install rocm-dev package ARG ROCM_VERSION=6.2.2 ARG AMDGPU_VERSION=6.2.2 @@ -25,8 +27,8 @@ RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl libnuma-dev gnupg \ && curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \ - && printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list \ - && printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list \ + && printf "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list \ + && printf "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list \ && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ sudo \ libelf1 \ @@ -74,13 +76,12 @@ COPY build/update-dcgm.py . 
# For the job exporter ENV NERDCTL_VERSION=2.1.3 RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates -RUN wget https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-amd64.tar.gz && \ +RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \ mkdir -p /tmp/nerdctl && \ - tar -xzvf nerdctl-${NERDCTL_VERSION}-linux-amd64.tar.gz -C /tmp/nerdctl && \ + tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \ mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl && \ mkdir -p /job_exporter && \ - rm -rf /tmp/nerdctl && \ - rm -rf nerdctl-${NERDCTL_VERSION}-linux-amd64.tar.gz + rm -rf /tmp/nerdctl* COPY requirements.txt /job_exporter/ RUN pip3 install -r /job_exporter/requirements.txt From 3561b5155af7db56863268e65423fade1060d65c Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 27 Oct 2025 17:35:29 -0700 Subject: [PATCH 3/8] Use buildkit for Docker build Use buildkit for Docker build. --- .github/workflows/build-deploy-changes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index ef0ac13a..a9b01ece 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -23,6 +23,8 @@ jobs: image: ubuntu:latest volumes: - /var/run/docker.sock:/var/run/docker.sock + env: + DOCKER_BUILDKIT: "1" steps: - name: Install git run: | From f6d4f5e01602081ce480d431d2a94c3d260b928e Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 27 Oct 2025 21:10:02 -0700 Subject: [PATCH 4/8] Upgrade NVIDIA k8s device plugin Upgrade NVIDIA k8s device plugin version. 
--- src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile index 2b4d9344..29089a81 100644 --- a/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile +++ b/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -FROM nvcr.io/nvidia/k8s-device-plugin:v0.15.0 +FROM nvcr.io/nvidia/k8s-device-plugin:v0.17.1 From 04f5c4d1b69678994e28aeddae9f950327059390 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 5 Nov 2025 16:09:32 -0800 Subject: [PATCH 5/8] Update according to comments Update according to comments. --- src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile | 2 +- .../build/k8s-rdma-shared-dev-plugin.k8s.dockerfile | 2 +- src/openpai-runtime/build/openpai-runtime.common.dockerfile | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile index 29089a81..f5a1bf40 100644 --- a/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile +++ b/src/device-plugin/build/k8s-nvidia-device-plugin.k8s.dockerfile @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -FROM nvcr.io/nvidia/k8s-device-plugin:v0.17.1 +FROM nvcr.io/nvidia/k8s-device-plugin:v0.18.0 diff --git a/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile b/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile index 723e361c..9de01b06 100644 --- a/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile +++ b/src/device-plugin/build/k8s-rdma-shared-dev-plugin.k8s.dockerfile @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-FROM ghcr.io/mellanox/k8s-rdma-shared-dev-plugin:v1.5.3 +FROM ghcr.io/mellanox/k8s-rdma-shared-dev-plugin:1.4.0 diff --git a/src/openpai-runtime/build/openpai-runtime.common.dockerfile b/src/openpai-runtime/build/openpai-runtime.common.dockerfile index 258b4980..d2cad57b 100644 --- a/src/openpai-runtime/build/openpai-runtime.common.dockerfile +++ b/src/openpai-runtime/build/openpai-runtime.common.dockerfile @@ -64,6 +64,7 @@ WORKDIR /kube-runtime/src COPY src/src ./ COPY src/requirements.txt ./ +#TODO: update the hardcode image for arm64 COPY --from=frameworkcontroller/frameworkbarrier:v1.0.0 $BARRIER_DIR/frameworkbarrier ./init.d COPY --from=builder ${INSTALL_DIR}/* ./runtime.d/ From ac227c2bd6cffd3ed1e1d7167a40ff1bcf293f37 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Thu, 6 Nov 2025 14:24:42 -0800 Subject: [PATCH 6/8] Update docker install Update docker install. --- .github/workflows/build-deploy-changes.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index a9b01ece..f9c2f723 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -79,8 +79,9 @@ jobs: - name: Install Package if: steps.check.outputs.has_changed == 'true' run: | - DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip docker-cli ca-certificates curl apt-transport-https lsb-release gnupg parallel + DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip ca-certificates curl apt-transport-https lsb-release gnupg parallel curl -sL https://aka.ms/InstallAzureCLIDeb | bash + curl -fsSL https://get.docker.com | sh - name: Install python libs if: steps.check.outputs.has_changed == 'true' From 5dd75b8888f0a60542ad4f514081c60ae71c3f30 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 7 Nov 2025 14:54:44 -0800 Subject: [PATCH 7/8] Add ghcr login Add ghcr login. 
--- .github/workflows/build-deploy-changes.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 2e80537c..ebe49a6c 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -101,6 +101,12 @@ jobs: mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/ ls -l /tmp/auth-configuration + - name: Login to GHCR + if: steps.check.outputs.has_changed == 'true' + run: | + # Pipe the token to docker login on stdin so it is not passed as a command-line argument. + echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin + - name: Build Images of Changed Services if: steps.check.outputs.has_changed == 'true' run: | From a93459b26830f4122cceaa486940a864b0af2e6e Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 7 Nov 2025 15:47:38 -0800 Subject: [PATCH 8/8] Update Update.