# Source: PR #341 — chore(deps): bump k8s.io/apimachinery from 0.35.0-alpha.0 to 0.35.0-alpha.2
# NOTE(review): the two lines below were GitHub UI banner text captured during
# extraction ("hidden or bidirectional Unicode" warning); they are not part of
# the workflow file itself and are kept here only as comments.
name: E2E Tests (In-Cluster Runner)

# Triggers:
#   - workflow_dispatch: manual run, with an optional benchmarks toggle
#   - schedule: nightly run
#   - pull_request[labeled]: gated below by the 'run-e2e' label check
on:
  workflow_dispatch:
    inputs:
      run_benchmarks:
        description: 'Run performance benchmarks'
        required: false
        type: boolean
        default: false
  schedule:
    # Run nightly at 2 AM UTC
    - cron: '0 2 * * *'
  pull_request:
    types: [labeled]
jobs:
  e2e-tests:
    # Use self-hosted runner with 'ibm-e2e' label
    runs-on: [self-hosted, ibm-e2e]
    # Run on manual dispatch, on the nightly schedule, or when the
    # 'run-e2e' label is applied to a pull request.
    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event_name == 'schedule' ||
      (github.event_name == 'pull_request' && contains(github.event.label.name, 'run-e2e'))
    # Prevent concurrent e2e runs; queued runs wait rather than cancel.
    concurrency:
      group: e2e-tests
      cancel-in-progress: false
    timeout-minutes: 210
    container:
      # Run in a container with necessary tools (Go toolchain for `go test`)
      image: golang:1.24.6
      # Root is required to apt-get install and write to /usr/local/bin
      options: --user 0
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Install dependencies | |
| run: | | |
| apt-get update && apt-get install -y curl jq | |
| # Install kubectl | |
| curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" | |
| chmod +x kubectl | |
| mv kubectl /usr/local/bin/ | |
| # Install IBM Cloud CLI | |
| curl -fsSL https://clis.cloud.ibm.com/install/linux | sh | |
| ibmcloud plugin install vpc-infrastructure | |
| - name: Setup kubeconfig | |
| env: | |
| KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} | |
| run: | | |
| # Create kubeconfig from secret | |
| printf '%s' "$KUBECONFIG_CONTENT" | base64 -d > /tmp/kubeconfig | |
| chmod 600 /tmp/kubeconfig | |
| export KUBECONFIG=/tmp/kubeconfig | |
| # Verify the kubeconfig works | |
| kubectl version --client | |
| - name: Verify cluster access | |
| env: | |
| KUBECONFIG: /tmp/kubeconfig | |
| run: | | |
| # Verify cluster access with provided kubeconfig | |
| kubectl cluster-info | |
| kubectl auth can-i create nodeclaims --all-namespaces | |
| kubectl auth can-i create nodepools --all-namespaces | |
| kubectl auth can-i create ibmnodeclasses --all-namespaces | |
| - name: Configure IBM Cloud CLI | |
| env: | |
| IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }} | |
| run: | | |
| ibmcloud login --apikey "${{ secrets.IBMCLOUD_API_KEY }}" -r "${{ secrets.IBMCLOUD_REGION }}" | |
| - name: Deploy latest version | |
| env: | |
| KUBECONFIG: /tmp/kubeconfig | |
| run: | | |
| # Install or update Karpenter CRDs | |
| kubectl apply -f charts/crds/ | |
| # Restart operator pods to pull latest upstream image tag | |
| kubectl rollout restart deployment/karpenter-karpenter-ibm -n karpenter | |
| - name: Pre-test cleanup | |
| env: | |
| KUBECONFIG: /tmp/kubeconfig | |
| run: | | |
| echo "🧹 Cleaning up any existing e2e test resources..." | |
| # Clean up any leftover resources from previous runs | |
| kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true | |
| kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true | |
| kubectl delete nodeclaims -l test=e2e --timeout=300s || true | |
| kubectl delete nodepools -l test=e2e --timeout=300s || true | |
| kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true | |
| # Wait for cluster stabilization | |
| echo "⏳ Waiting for cluster stabilization..." | |
| # Wait for no pending e2e pods | |
| for i in {1..30}; do | |
| pending_pods=$(kubectl get pods -l test=e2e --all-namespaces --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l) | |
| if [ "$pending_pods" -eq 0 ]; then | |
| echo "✅ No pending e2e pods found" | |
| break | |
| fi | |
| echo "⏳ Still have $pending_pods pending e2e pods, waiting..." | |
| sleep 10 | |
| done | |
| # Wait for no disrupted nodes | |
| for i in {1..30}; do | |
| disrupted_nodes=$(kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,TAINTS:.spec.taints[*].key" | grep -c "karpenter.sh/disrupted" 2>/dev/null || echo "0") | |
| disrupted_nodes=$(echo "$disrupted_nodes" | tr -d '\n' | grep -o '[0-9]*' || echo "0") | |
| if [ "$disrupted_nodes" -eq 0 ]; then | |
| echo "✅ No disrupted nodes found" | |
| break | |
| fi | |
| echo "⏳ Still have $disrupted_nodes disrupted nodes, waiting..." | |
| sleep 10 | |
| done | |
| # Brief pause for final cleanup | |
| sleep 30 | |
| echo "✅ Pre-test cleanup completed" | |
| - name: Run E2E tests (Sequential) | |
| env: | |
| RUN_E2E_TESTS: "true" | |
| IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }} | |
| VPC_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }} | |
| IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }} | |
| TEST_VPC_ID: ${{ secrets.E2E_TEST_VPC_ID }} | |
| TEST_SUBNET_ID: ${{ secrets.E2E_TEST_SUBNET_ID }} | |
| TEST_IMAGE_ID: ${{ secrets.E2E_TEST_IMAGE_ID }} | |
| TEST_ZONE: ${{ secrets.E2E_TEST_ZONE }} | |
| TEST_SECURITY_GROUP_ID: ${{ secrets.E2E_TEST_SECURITY_GROUP_ID }} | |
| VPC_URL: ${{ secrets.VPC_URL }} | |
| KUBERNETES_API_SERVER_ENDPOINT: ${{ secrets.KUBERNETES_API_SERVER_ENDPOINT }} | |
| IBM_RESOURCE_GROUP_ID: ${{ secrets.IBM_RESOURCE_GROUP_ID }} | |
| IBM_SSH_KEY_ID: ${{ secrets.IBM_SSH_KEY_ID }} | |
| RUN_E2E_BENCHMARKS: ${{ inputs.run_benchmarks }} | |
| # Use the kubeconfig we set up | |
| KUBECONFIG: /tmp/kubeconfig | |
| # Configure e2e test behavior | |
| E2E_SEQUENTIAL: "true" | |
| E2E_CLEANUP_TIMEOUT: "300s" | |
| E2E_STABILIZATION_WAIT: "60s" | |
| run: | | |
| echo "🚀 Starting E2E test suite..." | |
| # Define test groups | |
| # Core functionality tests from basic_workflow_test.go | |
| core_tests="TestE2EFullWorkflow TestE2ENodePoolInstanceTypeSelection TestE2EInstanceTypeSelection TestE2EDriftStability" | |
| # NodeClass validation tests from validation_test.go | |
| validation_tests="TestE2ENodeClassValidation TestE2EValidNodeClassCreation TestE2ENodeClassWithMissingFields" | |
| # Block device mapping tests from block_device_test.go | |
| block_device_tests="TestE2EBlockDeviceMapping TestE2EBlockDeviceMappingValidation" | |
| # Scheduling constraint tests from scheduling_test.go and e2e_taints_test.go | |
| scheduling_tests="TestE2EPodDisruptionBudget TestE2EConsolidationWithPDB TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2EStartupTaints TestE2EStartupTaintsRemoval TestE2ETaintsBasicScheduling TestE2ETaintValues TestE2ETaintSync TestE2EUnregisteredTaintHandling" | |
| # UserData feature tests from userdata_test.go | |
| userdata_tests="TestE2EUserDataAppend TestE2EStandardBootstrap" | |
| # Image selector tests from image_selector_test.go | |
| image_selector_tests="TestE2EImageSelector" | |
| # Multi-zone tests from multizone_test.go | |
| multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover" | |
| # Cleanup tests from cleanup_test.go | |
| cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources" | |
| # Combine all tests | |
| all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests" | |
| test_failed="false" | |
| passed_tests=0 | |
| failed_tests=0 | |
| total_tests=$(echo $all_tests | wc -w) | |
| echo "📋 Test Suite Summary:" | |
| echo " Core Tests: $(echo $core_tests | wc -w)" | |
| echo " Validation Tests: $(echo $validation_tests | wc -w)" | |
| echo " Block Device Tests: $(echo $block_device_tests | wc -w)" | |
| echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)" | |
| echo " UserData Tests: $(echo $userdata_tests | wc -w)" | |
| echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)" | |
| echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)" | |
| echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)" | |
| echo " Total Tests: $total_tests" | |
| echo "" | |
| # Run each test individually with cleanup between | |
| for test in $all_tests; do | |
| echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" | |
| echo "🧪 Running test: $test" | |
| echo "Progress: $((passed_tests + failed_tests + 1))/$total_tests" | |
| echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" | |
| # Set appropriate timeout based on test type | |
| timeout="20m" | |
| case "$test" in | |
| "TestE2EDriftStability") | |
| timeout="30m" # Drift test needs more time for monitoring | |
| ;; | |
| "TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*) | |
| timeout="25m" # Multi-zone tests need extra time for cross-zone provisioning | |
| ;; | |
| "TestE2ECleanup"*) | |
| timeout="15m" # Cleanup tests are typically faster | |
| ;; | |
| "TestE2EValidation"*|"TestE2ENodeClass"*) | |
| timeout="10m" # Validation tests are quick | |
| ;; | |
| *) | |
| timeout="20m" # Default timeout for other tests | |
| ;; | |
| esac | |
| # Create test-specific log file to capture all output | |
| test_log="test-artifacts/${test}-$(date +%s).log" | |
| mkdir -p test-artifacts | |
| # Run test with enhanced logging and crash recovery | |
| set +e # Don't exit on failure | |
| timeout $timeout go test -tags=e2e -v -timeout $timeout ./test/e2e -run "^$test$" -count=1 2>&1 | tee "$test_log" | |
| test_exit_code=$? | |
| set -e # Re-enable exit on failure | |
| if [ $test_exit_code -eq 0 ]; then | |
| echo "✅ Test $test passed" | |
| passed_tests=$((passed_tests + 1)) | |
| else | |
| echo "❌ Test $test failed (exit code: $test_exit_code)" | |
| failed_tests=$((failed_tests + 1)) | |
| # Enhanced debug information on failure | |
| echo "📊 Debug information for failed test $test:" | |
| echo " Exit code: $test_exit_code" | |
| echo " Log file: $test_log" | |
| # Collect system state | |
| kubectl get nodes --no-headers | wc -l | xargs echo " Total nodes:" | |
| kubectl get nodeclaims --no-headers 2>/dev/null | wc -l | xargs echo " Total nodeclaims:" || echo " Total nodeclaims: 0" | |
| kubectl get pods -l test=e2e --all-namespaces --no-headers 2>/dev/null | wc -l | xargs echo " Total e2e pods:" || echo " Total e2e pods: 0" | |
| # Collect Karpenter pod status | |
| echo " Karpenter pod status:" | |
| kubectl get pods -n karpenter -l app.kubernetes.io/name=karpenter --no-headers 2>/dev/null || echo " No Karpenter pods found" | |
| # Collect recent events (errors and warnings) | |
| echo " Recent warning events:" | |
| kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo " No warning events" | |
| # Check for panic or crash indicators in test log | |
| if grep -i "panic\|fatal\|segmentation\|killed" "$test_log" >/dev/null 2>&1; then | |
| echo " ⚠️ Test appears to have crashed (panic/fatal error detected)" | |
| fi | |
| # Collect Karpenter logs immediately after failure | |
| kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=100 > "test-artifacts/karpenter-logs-${test}-$(date +%s).txt" 2>/dev/null || echo " Failed to collect Karpenter logs" | |
| test_failed="true" | |
| fi | |
| # Inter-test cleanup and stabilization | |
| echo "🧹 Cleaning up after test: $test" | |
| # Delete test-specific resources | |
| kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true | |
| kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true | |
| kubectl delete nodeclaims -l test=e2e --timeout=300s || true | |
| kubectl delete nodepools -l test=e2e --timeout=300s || true | |
| kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true | |
| # Wait for cleanup to complete | |
| echo "⏳ Waiting for cleanup to complete..." | |
| # Extended cleanup wait for drift stability test due to NodeClaim deletion timeouts | |
| if [ "$test" = "TestE2EDriftStability" ]; then | |
| echo "⏳ Extended cleanup wait for drift stability test..." | |
| sleep 120 # 2 minutes for NodeClaim finalizers to complete | |
| else | |
| sleep 30 # Standard cleanup wait | |
| fi | |
| # Check cluster health before next test | |
| kubectl get nodes --no-headers | grep -c Ready | xargs echo "Ready nodes:" | |
| kubectl get nodeclaims --no-headers | grep -c True | xargs echo "Ready nodeclaims:" || echo "Ready nodeclaims: 0" | |
| echo "✅ Completed test: $test" | |
| echo "" | |
| done | |
| echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" | |
| echo "📊 Test Suite Results:" | |
| echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" | |
| echo " Total Tests: $total_tests" | |
| echo " ✅ Passed: $passed_tests" | |
| echo " ❌ Failed: $failed_tests" | |
| echo " Success Rate: $((passed_tests * 100 / total_tests))%" | |
| echo "" | |
| # Check if any test failed | |
| if [ "$test_failed" = "true" ]; then | |
| echo "❌ Test suite failed with $failed_tests failures" | |
| exit 1 | |
| fi | |
| echo "✅ All E2E tests completed successfully!" | |
| # Run benchmarks if requested | |
| if [ "$RUN_E2E_BENCHMARKS" = "true" ]; then | |
| echo "📊 Running performance benchmarks..." | |
| go test -tags=e2e -v -timeout 30m ./test/e2e/... -run=^$ -bench=. | |
| fi | |
| - name: Collect test artifacts | |
| if: always() | |
| env: | |
| KUBECONFIG: /tmp/kubeconfig | |
| run: | | |
| echo "📦 Collecting comprehensive test artifacts..." | |
| mkdir -p test-artifacts | |
| # Collect Karpenter logs with different tail sizes for completeness | |
| echo " Collecting Karpenter logs..." | |
| kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=2000 > test-artifacts/karpenter-logs.txt 2>/dev/null || echo "Failed to collect current Karpenter logs" > test-artifacts/karpenter-logs.txt | |
| kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --previous --tail=1000 > test-artifacts/karpenter-logs-previous.txt 2>/dev/null || echo "No previous Karpenter logs available" > test-artifacts/karpenter-logs-previous.txt | |
| # Collect events with different filters | |
| echo " Collecting events..." | |
| kubectl get events -A --sort-by='.lastTimestamp' > test-artifacts/events.txt 2>/dev/null || echo "Failed to collect events" > test-artifacts/events.txt | |
| kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' > test-artifacts/events-warnings.txt 2>/dev/null || echo "No warning events" > test-artifacts/events-warnings.txt | |
| kubectl get events -A --field-selector type=Normal --sort-by='.lastTimestamp' | tail -50 > test-artifacts/events-normal-recent.txt 2>/dev/null || echo "No normal events" > test-artifacts/events-normal-recent.txt | |
| # Collect resource states | |
| echo " Collecting resource states..." | |
| kubectl get nodes -o wide > test-artifacts/nodes.txt 2>/dev/null || echo "Failed to collect nodes" > test-artifacts/nodes.txt | |
| kubectl get nodeclaims -o yaml > test-artifacts/nodeclaims.yaml 2>/dev/null || echo "apiVersion: v1\nitems: []\nkind: List" > test-artifacts/nodeclaims.yaml | |
| kubectl get nodepools -o yaml > test-artifacts/nodepools.yaml 2>/dev/null || echo "apiVersion: v1\nitems: []\nkind: List" > test-artifacts/nodepools.yaml | |
| kubectl get ibmnodeclasses -o yaml > test-artifacts/ibmnodeclasses.yaml 2>/dev/null || echo "apiVersion: v1\nitems: []\nkind: List" > test-artifacts/ibmnodeclasses.yaml | |
| # Collect Karpenter deployment status | |
| echo " Collecting Karpenter deployment status..." | |
| kubectl describe deployment -n karpenter karpenter-karpenter-ibm > test-artifacts/karpenter-deployment.txt 2>/dev/null || echo "Failed to describe Karpenter deployment" > test-artifacts/karpenter-deployment.txt | |
| kubectl get pods -n karpenter -o wide > test-artifacts/karpenter-pods.txt 2>/dev/null || echo "Failed to get Karpenter pods" > test-artifacts/karpenter-pods.txt | |
| # Collect any crash dumps or additional logs | |
| echo " Collecting additional diagnostics..." | |
| kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded > test-artifacts/problematic-pods.txt 2>/dev/null || echo "No problematic pods found" > test-artifacts/problematic-pods.txt | |
| # Create summary of artifacts | |
| echo " Creating artifact summary..." | |
| { | |
| echo "E2E Test Artifacts Summary" | |
| echo "=========================" | |
| echo "Generated: $(date)" | |
| echo "Test run ID: ${{ github.run_id }}" | |
| echo "" | |
| echo "Files collected:" | |
| ls -la test-artifacts/ 2>/dev/null || echo "No artifacts directory" | |
| } > test-artifacts/README.txt | |
| echo "✅ Test artifact collection completed" | |
| - name: Upload test artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: e2e-test-artifacts-${{ github.run_id }} | |
| path: test-artifacts/ | |
| retention-days: 7 | |
| - name: Cleanup test resources | |
| if: always() | |
| env: | |
| IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }} | |
| KUBECONFIG: /tmp/kubeconfig | |
| run: | | |
| echo "🧹 Starting comprehensive cleanup..." | |
| # Clean up Kubernetes resources with extended timeouts | |
| echo "Cleaning up Kubernetes resources..." | |
| kubectl delete pods -l test=e2e --all-namespaces --timeout=10m || true | |
| kubectl delete deployments -l test=e2e --all-namespaces --timeout=10m || true | |
| kubectl delete nodeclaims -l test=e2e --timeout=10m || true | |
| kubectl delete nodepools -l test=e2e --timeout=10m || true | |
| kubectl delete ibmnodeclasses -l test=e2e --timeout=10m || true | |
| # Force cleanup any stuck resources with direct patching | |
| echo "Force cleaning up any stuck resources..." | |
| kubectl patch nodeclaims --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true | |
| kubectl patch nodepools --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true | |
| kubectl patch ibmnodeclasses --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true | |
| # Clean up IBM Cloud instances created by e2e tests | |
| echo "Cleaning up IBM Cloud instances..." | |
| ibmcloud is instances --output json | \ | |
| jq -r '.[] | select(.tags | index("karpenter-e2e")) | .id' | \ | |
| xargs -I {} ibmcloud is instance-delete {} --force || true | |
| # Clean up orphaned VNIs (Virtual Network Interfaces) | |
| echo "Cleaning up orphaned VNIs..." | |
| ibmcloud is virtual-network-interfaces --output json | \ | |
| jq -r '.[] | select(.name | test("e2e-.*-vni")) | .id' | \ | |
| xargs -I {} ibmcloud is virtual-network-interface-delete {} --force || true | |
| # Clean up orphaned volumes | |
| echo "Cleaning up orphaned volumes..." | |
| ibmcloud is volumes --output json | \ | |
| jq -r '.[] | select(.name | test("e2e-.*-boot")) | .id' | \ | |
| xargs -I {} ibmcloud is volume-delete {} --force || true | |
| echo "✅ Cleanup completed" |