
chore(deps): bump k8s.io/client-go from 0.35.0-alpha.0 to 0.35.0-alpha.2 #560

Workflow file for this run

name: E2E Tests for PRs
on:
workflow_dispatch: {}
pull_request:
types: [opened, synchronize, reopened]
issue_comment:
types: [created]
permissions:
contents: read
packages: write
issues: read
pull-requests: read
concurrency:
group: e2e-${{ github.event.pull_request.number || github.event.issue.number || github.run_id }}
cancel-in-progress: false
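# One E2E run per PR (or per manual dispatch) at a time; in-flight runs are not cancelled when new commits arrive.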
env:
E2E_APPROVAL_COMMENT: "/ok-to-e2e"
GO_VERSION: "1.24"
KO_VERSION: "0.15.4"
KUBECTL_VERSION: "1.28.0"
jobs:
gate:
name: Gate (manual or approver comment)
runs-on: ubuntu-latest
outputs:
approved: ${{ steps.decide.outputs.approved }}
pr_number: ${{ steps.decide.outputs.pr_number }}
head_sha: ${{ steps.decide.outputs.head_sha }}
ref: ${{ steps.decide.outputs.ref }}
image_tag: ${{ steps.decide.outputs.image_tag }}
image_ref: ${{ steps.decide.outputs.image_ref }}
steps:
- id: decide
uses: actions/github-script@v7
with:
script: |
async function checkUserPermission(username) {
try {
const { data: collaborator } = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: username
});
const hasWriteAccess = ['admin', 'write', 'maintain'].includes(collaborator.permission);
return {
hasAccess: hasWriteAccess,
permission: collaborator.permission
};
} catch (error) {
console.log(`Could not check permissions for ${username}: ${error.message}`);
return {
hasAccess: false,
permission: 'unknown'
};
}
}
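// Gate decision:
//  - pull_request: auto-approve when the PR author has write/maintain/admin access
//  - issue_comment: approve when a user with write access comments the approval phrase on a PR
//  - workflow_dispatch: always approved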
let approved = false;
let prNumber = '';
let headSHA = context.sha;
let ref = (context.ref || '').replace('refs/heads/', '');
if (context.eventName === 'pull_request') {
prNumber = String(context.payload.pull_request.number);
headSHA = context.payload.pull_request.head.sha;
ref = context.payload.pull_request.head.ref;
const author = context.payload.pull_request.user.login;
const permCheck = await checkUserPermission(author);
if (permCheck.hasAccess) {
approved = true;
console.log(`Auto-approved E2E for ${author} (${permCheck.permission} access)`);
} else {
console.log(`E2E requires manual approval for ${author} (${permCheck.permission} access)`);
}
} else if (context.eventName === 'issue_comment') {
const comment = context.payload.comment.body || '';
const commenter = context.payload.comment.user.login || '';
if (context.payload.issue.pull_request && comment.includes(process.env.E2E_APPROVAL_COMMENT)) {
const permCheck = await checkUserPermission(commenter);
if (permCheck.hasAccess) {
approved = true;
console.log(`Manual E2E approval by ${commenter} (${permCheck.permission} access)`);
const { data: pr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.payload.issue.number
});
headSHA = pr.head.sha;
ref = pr.head.ref;
prNumber = String(context.payload.issue.number);
} else {
console.log(`E2E approval denied for ${commenter} (${permCheck.permission} access)`);
}
}
} else if (context.eventName === 'workflow_dispatch') {
approved = true;
}
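// Tag the PR image as "pr-<PR number>-<first 12 chars of the head SHA>"; runs without a PR number fall back to the bare SHA prefix.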
const tag = (prNumber ? `pr-${prNumber}-` : '') + headSHA.substring(0, 12);
const imageRef = `ghcr.io/${context.repo.owner}/${context.repo.repo}:${tag}`;
console.log('Event:', context.eventName);
console.log('Approved:', approved);
console.log('PR:', prNumber);
console.log('Image:', imageRef);
core.setOutput('approved', approved ? 'true' : 'false');
core.setOutput('pr_number', prNumber);
core.setOutput('head_sha', headSHA);
core.setOutput('ref', ref);
core.setOutput('image_tag', tag);
core.setOutput('image_ref', imageRef);
build:
name: Build PR image with ko
needs: [gate]
if: ${{ needs.gate.outputs.approved == 'true' }}
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.gate.outputs.ref }}
fetch-depth: 0
- uses: actions/setup-go@v4
with:
go-version: ${{ env.GO_VERSION }}
cache: true
- uses: ko-build/setup-ko@v0.6
with:
version: v${{ env.KO_VERSION }}
- name: Build & push image
env:
KO_DOCKER_REPO: ghcr.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
run: |
echo "${{ secrets.GITHUB_TOKEN }}" | ko login ghcr.io --username ${{ github.actor }} --password-stdin
ko build ./cmd/controller --platform=linux/amd64 --bare --tags="${{ needs.gate.outputs.image_tag }}"
- name: Cleanup old PR images
if: ${{ needs.gate.outputs.pr_number != '' }}
continue-on-error: true
uses: actions/github-script@v7
with:
script: |
const packageName = context.repo.repo;
const prNumber = '${{ needs.gate.outputs.pr_number }}';
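// Retention policy: keep the 3 most recently created image versions tagged "pr-<PR number>-" and delete the rest; failures are logged but non-fatal.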
try {
const { data: versions } = await github.rest.packages.getAllPackageVersionsForPackageOwnedByOrg({
package_type: 'container',
package_name: packageName,
org: context.repo.owner,
per_page: 100
});
const prVersions = versions
.filter(version =>
version.metadata?.container?.tags?.some(tag => tag.startsWith(`pr-${prNumber}-`))
)
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
const toDelete = prVersions.slice(3);
console.log(`Found ${prVersions.length} images for PR #${prNumber}`);
console.log(`Keeping latest 3, deleting ${toDelete.length} old images`);
for (const version of toDelete) {
try {
await github.rest.packages.deletePackageVersionForOrg({
package_type: 'container',
package_name: packageName,
org: context.repo.owner,
package_version_id: version.id
});
console.log(`Deleted image version ${version.id}`);
} catch (error) {
console.log(`Failed to delete version ${version.id}: ${error.message}`);
}
}
} catch (error) {
console.log(`Failed to cleanup images: ${error.message}`);
}
e2e:
name: Run E2E against PR image
needs: [gate, build]
if: ${{ needs.gate.outputs.approved == 'true' }}
runs-on: [self-hosted, ibm-e2e]
timeout-minutes: 210
container:
image: golang:1.24.6
options: --user 0
env:
KUBECONFIG: /tmp/kubeconfig
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.gate.outputs.ref }}
- name: Install dependencies
run: |
apt-get update && apt-get install -y curl jq
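# Pin kubectl to the version above and install the IBM Cloud CLI with the VPC plugin (needed later to clean up VPC instances, VNIs, and volumes)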
curl -LO "https://dl.k8s.io/release/v${{ env.KUBECTL_VERSION }}/bin/linux/amd64/kubectl"
chmod +x kubectl && mv kubectl /usr/local/bin/
curl -fsSL https://clis.cloud.ibm.com/install/linux | sh
ibmcloud plugin install vpc-infrastructure
- name: Setup kubeconfig
env:
KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }}
run: |
printf '%s' "$KUBECONFIG_CONTENT" | base64 -d > /tmp/kubeconfig
chmod 600 /tmp/kubeconfig
kubectl version --client
- name: Verify cluster access
run: |
kubectl cluster-info
kubectl auth can-i create nodeclaims --all-namespaces
kubectl auth can-i create nodepools --all-namespaces
kubectl auth can-i create ibmnodeclasses --all-namespaces
- name: Configure IBM Cloud CLI
env:
IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
run: |
ibmcloud login --apikey "$IBMCLOUD_API_KEY" -r "${{ secrets.IBMCLOUD_REGION }}"
- name: Deploy PR version
run: |
kubectl apply -f charts/crds/
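# Swap the controller image on the already-installed deployment to the PR build instead of reinstalling the chart, then wait for the rollout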
echo "🔍 Checking deployment structure..."
kubectl get deployment karpenter-karpenter-ibm -n karpenter -o yaml | grep -A 10 "containers:"
CONTAINER_NAME=$(kubectl get deployment karpenter-karpenter-ibm -n karpenter -o jsonpath='{.spec.template.spec.containers[0].name}')
echo "📋 Found container name: $CONTAINER_NAME"
kubectl set image deployment/karpenter-karpenter-ibm \
$CONTAINER_NAME=${{ needs.gate.outputs.image_ref }} \
-n karpenter
kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=300s
CURRENT_IMAGE=$(kubectl get deployment karpenter-karpenter-ibm -n karpenter -o jsonpath='{.spec.template.spec.containers[0].image}')
echo "✅ Deployment updated to: $CURRENT_IMAGE"
- name: Pre-test cleanup
run: |
echo "🧹 Cleaning up any existing e2e test resources..."
kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
kubectl delete nodeclaims -l test=e2e --timeout=300s || true
kubectl delete nodepools -l test=e2e --timeout=300s || true
kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
echo "⏳ Waiting for cluster stabilization..."
for i in {1..30}; do
pending_pods=$(kubectl get pods -l test=e2e --all-namespaces --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l)
if [ "$pending_pods" -eq 0 ]; then
echo "✅ No pending e2e pods found"
break
fi
echo "⏳ Still have $pending_pods pending e2e pods, waiting..."
sleep 10
done
for i in {1..30}; do
disrupted_nodes=$(kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,TAINTS:.spec.taints[*].key" | grep -c "karpenter.sh/disrupted" 2>/dev/null || echo "0")
disrupted_nodes=$(echo "$disrupted_nodes" | tr -d '\n' | grep -o '[0-9]*' || echo "0")
if [ "$disrupted_nodes" -eq 0 ]; then
echo "✅ No disrupted nodes found"
break
fi
echo "⏳ Still have $disrupted_nodes disrupted nodes, waiting..."
sleep 10
done
sleep 30
echo "✅ Pre-test cleanup completed"
- name: Run E2E tests
env:
RUN_E2E_TESTS: "true"
IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
VPC_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
TEST_VPC_ID: ${{ secrets.E2E_TEST_VPC_ID }}
TEST_SUBNET_ID: ${{ secrets.E2E_TEST_SUBNET_ID }}
TEST_IMAGE_ID: ${{ secrets.E2E_TEST_IMAGE_ID }}
TEST_ZONE: ${{ secrets.E2E_TEST_ZONE }}
TEST_SECURITY_GROUP_ID: ${{ secrets.E2E_TEST_SECURITY_GROUP_ID }}
VPC_URL: ${{ secrets.VPC_URL }}
KUBERNETES_API_SERVER_ENDPOINT: ${{ secrets.KUBERNETES_API_SERVER_ENDPOINT }}
IBM_RESOURCE_GROUP_ID: ${{ secrets.IBM_RESOURCE_GROUP_ID }}
IBM_SSH_KEY_ID: ${{ secrets.IBM_SSH_KEY_ID }}
E2E_SEQUENTIAL: "true"
E2E_CLEANUP_TIMEOUT: "300s"
E2E_STABILIZATION_WAIT: "60s"
run: |
echo "🚀 Starting E2E test suite..."
# Define test groups
# Core functionality tests from basic_workflow_test.go
core_tests="TestE2EFullWorkflow TestE2ENodePoolInstanceTypeSelection TestE2EInstanceTypeSelection TestE2EDriftStability"
# NodeClass validation tests from validation_test.go
validation_tests="TestE2ENodeClassValidation TestE2EValidNodeClassCreation TestE2ENodeClassWithMissingFields"
# Block device mapping tests from block_device_test.go
block_device_tests="TestE2EBlockDeviceMapping TestE2EBlockDeviceMappingValidation"
# Scheduling constraint tests from scheduling_test.go and e2e_taints_test.go
scheduling_tests="TestE2EPodDisruptionBudget TestE2EConsolidationWithPDB TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2EStartupTaints TestE2EStartupTaintsRemoval TestE2ETaintsBasicScheduling TestE2ETaintValues TestE2ETaintSync TestE2EUnregisteredTaintHandling"
# UserData feature tests from userdata_test.go
userdata_tests="TestE2EUserDataAppend TestE2EStandardBootstrap"
# Image selector tests from image_selector_test.go
image_selector_tests="TestE2EImageSelector"
# Multi-zone tests from multizone_test.go
multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"
# Cleanup tests from cleanup_test.go
cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"
# Combine all tests
all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"
test_failed="false"
passed_tests=0
failed_tests=0
total_tests=$(echo $all_tests | wc -w)
echo "📋 Test Suite Summary:"
echo " Core Tests: $(echo $core_tests | wc -w)"
echo " Validation Tests: $(echo $validation_tests | wc -w)"
echo " Block Device Tests: $(echo $block_device_tests | wc -w)"
echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)"
echo " UserData Tests: $(echo $userdata_tests | wc -w)"
echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)"
echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)"
echo " Total Tests: $total_tests"
echo ""
for test in $all_tests; do
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "🧪 Running test: $test"
echo "Progress: $((passed_tests + failed_tests + 1))/$total_tests"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# Set appropriate timeout based on test type
timeout="20m"
case "$test" in
"TestE2EDriftStability")
timeout="30m" # Drift test needs more time for monitoring
;;
"TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
timeout="25m" # Multi-zone tests need extra time for cross-zone provisioning
;;
"TestE2ECleanup"*)
timeout="15m" # Cleanup tests are typically faster
;;
"TestE2EValidation"*|"TestE2ENodeClass"*)
timeout="10m" # Validation tests are quick
;;
*)
timeout="20m" # Default timeout for other tests
;;
esac
# Create test-specific log file to capture all output
test_log="test-artifacts/${test}-$(date +%s).log"
mkdir -p test-artifacts
# Run test with enhanced logging and crash recovery
# E2E tests MUST run serially (-p 1 -parallel 1) to avoid resource conflicts
set +e # Don't exit on failure
timeout $timeout go test -tags=e2e -v -timeout $timeout ./test/e2e -run "^$test$" -count=1 -p 1 -parallel 1 2>&1 | tee "$test_log"
test_exit_code=$?
set -e # Re-enable exit on failure
if [ $test_exit_code -eq 0 ]; then
echo "✅ Test $test passed"
passed_tests=$((passed_tests + 1))
else
echo "❌ Test $test failed (exit code: $test_exit_code)"
failed_tests=$((failed_tests + 1))
# Enhanced debug information on failure
echo "📊 Debug information for failed test $test:"
echo " Exit code: $test_exit_code"
echo " Log file: $test_log"
# Collect system state
kubectl get nodes --no-headers | wc -l | xargs echo " Total nodes:"
kubectl get nodeclaims --no-headers 2>/dev/null | wc -l | xargs echo " Total nodeclaims:" || echo " Total nodeclaims: 0"
kubectl get pods -l test=e2e --all-namespaces --no-headers 2>/dev/null | wc -l | xargs echo " Total e2e pods:" || echo " Total e2e pods: 0"
# Collect Karpenter pod status
echo " Karpenter pod status:"
kubectl get pods -n karpenter -l app.kubernetes.io/name=karpenter --no-headers 2>/dev/null || echo " No Karpenter pods found"
# Collect recent events (errors and warnings)
echo " Recent warning events:"
kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo " No warning events"
# Check for panic or crash indicators in test log
if grep -i "panic\|fatal\|segmentation\|killed" "$test_log" >/dev/null 2>&1; then
echo " ⚠️ Test appears to have crashed (panic/fatal error detected)"
fi
# Collect Karpenter logs immediately after failure
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=100 > "test-artifacts/karpenter-logs-${test}-$(date +%s).txt" 2>/dev/null || echo " Failed to collect Karpenter logs"
test_failed="true"
fi
echo "🧹 Cleaning up after test: $test"
kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
kubectl delete nodeclaims -l test=e2e --timeout=300s || true
kubectl delete nodepools -l test=e2e --timeout=300s || true
kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
echo "⏳ Waiting for cleanup to complete..."
sleep 30
kubectl get nodes --no-headers | grep -cw Ready | xargs echo "Ready nodes:"
kubectl get nodeclaims --no-headers | grep -c True | xargs echo "Ready nodeclaims:" || echo "Ready nodeclaims: 0"
echo "✅ Completed test: $test"
echo ""
done
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "📊 Test Suite Results:"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " Total Tests: $total_tests"
echo " ✅ Passed: $passed_tests"
echo " ❌ Failed: $failed_tests"
echo " Success Rate: $((passed_tests * 100 / total_tests))%"
echo ""
if [ "$test_failed" = "true" ]; then
echo "❌ Test suite failed with $failed_tests failures"
exit 1
fi
echo "✅ All E2E tests completed successfully!"
- name: Collect test artifacts
if: always()
run: |
echo "📦 Collecting comprehensive test artifacts..."
mkdir -p test-artifacts
# Collect Karpenter logs with different tail sizes for completeness
echo " Collecting Karpenter logs..."
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=2000 > test-artifacts/karpenter-logs.txt 2>/dev/null || echo "Failed to collect current Karpenter logs" > test-artifacts/karpenter-logs.txt
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --previous --tail=1000 > test-artifacts/karpenter-logs-previous.txt 2>/dev/null || echo "No previous Karpenter logs available" > test-artifacts/karpenter-logs-previous.txt
# Collect events with different filters
echo " Collecting events..."
kubectl get events -A --sort-by='.lastTimestamp' > test-artifacts/events.txt 2>/dev/null || echo "Failed to collect events" > test-artifacts/events.txt
kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' > test-artifacts/events-warnings.txt 2>/dev/null || echo "No warning events" > test-artifacts/events-warnings.txt
kubectl get events -A --field-selector type=Normal --sort-by='.lastTimestamp' | tail -50 > test-artifacts/events-normal-recent.txt 2>/dev/null || echo "No normal events" > test-artifacts/events-normal-recent.txt
# Collect resource states
echo " Collecting resource states..."
kubectl get nodes -o wide > test-artifacts/nodes.txt 2>/dev/null || echo "Failed to collect nodes" > test-artifacts/nodes.txt
kubectl get nodeclaims -o yaml > test-artifacts/nodeclaims.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodeclaims.yaml
kubectl get nodepools -o yaml > test-artifacts/nodepools.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodepools.yaml
kubectl get ibmnodeclasses -o yaml > test-artifacts/ibmnodeclasses.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/ibmnodeclasses.yaml
# Collect Karpenter deployment status
echo " Collecting Karpenter deployment status..."
kubectl describe deployment -n karpenter karpenter-karpenter-ibm > test-artifacts/karpenter-deployment.txt 2>/dev/null || echo "Failed to describe Karpenter deployment" > test-artifacts/karpenter-deployment.txt
kubectl get pods -n karpenter -o wide > test-artifacts/karpenter-pods.txt 2>/dev/null || echo "Failed to get Karpenter pods" > test-artifacts/karpenter-pods.txt
# Collect any crash dumps or additional logs
echo " Collecting additional diagnostics..."
kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded > test-artifacts/problematic-pods.txt 2>/dev/null || echo "No problematic pods found" > test-artifacts/problematic-pods.txt
# Create summary of artifacts
echo " Creating artifact summary..."
{
echo "E2E Test Artifacts Summary"
echo "========================="
echo "Generated: $(date)"
echo "Test run ID: ${{ github.run_id }}"
echo ""
echo "Files collected:"
ls -la test-artifacts/ 2>/dev/null || echo "No artifacts directory"
} > test-artifacts/README.txt
echo "✅ Test artifact collection completed"
- uses: actions/upload-artifact@v4
if: always()
with:
name: e2e-test-artifacts-${{ github.run_id }}
path: test-artifacts/
retention-days: 7
- name: Cleanup test resources
if: always()
env:
IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
run: |
echo "🧹 Starting comprehensive cleanup..."
kubectl delete pods -l test=e2e --all-namespaces --timeout=10m || true
kubectl delete deployments -l test=e2e --all-namespaces --timeout=10m || true
kubectl delete nodeclaims -l test=e2e --timeout=10m || true
kubectl delete nodepools -l test=e2e --timeout=10m || true
kubectl delete ibmnodeclasses -l test=e2e --timeout=10m || true
kubectl get nodeclaims -l test=e2e -o name 2>/dev/null | xargs -r -I {} kubectl patch {} --type=merge -p '{"metadata":{"finalizers":[]}}' || true
kubectl get nodepools -l test=e2e -o name 2>/dev/null | xargs -r -I {} kubectl patch {} --type=merge -p '{"metadata":{"finalizers":[]}}' || true
kubectl get ibmnodeclasses -l test=e2e -o name 2>/dev/null | xargs -r -I {} kubectl patch {} --type=merge -p '{"metadata":{"finalizers":[]}}' || true
ibmcloud is instances --output json | \
jq -r '.[] | select(.tags | index("karpenter-e2e")) | .id' | \
xargs -I {} ibmcloud is instance-delete {} --force || true
ibmcloud is virtual-network-interfaces --output json | \
jq -r '.[] | select(.name | test("e2e-.*-vni")) | .id' | \
xargs -I {} ibmcloud is virtual-network-interface-delete {} --force || true
ibmcloud is volumes --output json | \
jq -r '.[] | select(.name | test("e2e-.*-boot")) | .id' | \
xargs -I {} ibmcloud is volume-delete {} --force || true
echo "✅ Cleanup completed"
- name: Restore original deployment
if: always()
run: |
echo "🔄 Restoring original karpenter deployment..."
kubectl rollout restart deployment/karpenter-karpenter-ibm -n karpenter
kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=300s