From 17e48339a30958302c4e54bb809135af6762f90b Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 15:17:46 -0800 Subject: [PATCH 01/40] add bebugging --- .config/guardian/.gdnbaselines | 2 +- .pipelines/azure_pipeline_mergedbranches.yaml | 69 ++- ...eploy-and-test-ci-image-in-aks-cluster.yml | 263 +++++++++++ .pipelines/e2e-test/verify-pod-images.sh | 388 ++++++++++++++++ ...I-Agent-Auto-Deploy-Implementation-Plan.md | 417 ++++++++++++++++++ test/testkube/helm-testkube-values.yaml | 3 + .../install-and-execute-testkube-tests.sh | 2 +- 7 files changed, 1140 insertions(+), 4 deletions(-) create mode 100644 .pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml create mode 100644 .pipelines/e2e-test/verify-pod-images.sh create mode 100644 Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines index 2b12b418dd..eff01b8012 100644 --- a/.config/guardian/.gdnbaselines +++ b/.config/guardian/.gdnbaselines @@ -154,4 +154,4 @@ "justification": "This error is baselined with an expiration date of 180 days from 2025-05-20 23:41:13Z" } } -} \ No newline at end of file +} diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 291772961a..db5c896550 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -42,7 +42,15 @@ extends: customBuildTags: - ES365AIMigrationTooling stages: - - stage: stage + # This stage will be skipped when LinuxImageOverride and WindowsImageOverride are both set + # This feature allows bypassing the build stage when using pre-built images for testing, which saves time and resources. + - stage: Build_And_Publish_Images + displayName: 'Build and Publish Container Images' + condition: | + or( + eq(variables['LinuxImageOverride'], ''), + eq(variables['WindowsImageOverride'], '') + ) jobs: - job: common pool: @@ -880,4 +888,61 @@ extends: ScanType: CustomScan FileDirPath: '$(Build.ArtifactStagingDirectory)' DisableRemediation: false - AcceptableOutdatedSignatureInHours: 72 \ No newline at end of file + AcceptableOutdatedSignatureInHours: 72 + - stage: Deploy_and_Test_Images_In_Dev_Clusters + displayName: Deploy and Test Images in Dev Clusters + lockBehavior: sequential + dependsOn: + - Build_And_Publish_Images + # Deploy runs when Build succeeds OR when Build is skipped with valid overrides + # TODO: remove eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + # this stage runs when Build_And_Publish_Images succeeds or is skipped with valid overrides. 
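+      # Example (hypothetical tag values) of queue-time overrides that skip the
+      # build stage and deploy pre-built images instead:
+      #   LinuxImageOverride   = 3.1.32-g1a2b3c4d5-20251204120000
+      #   WindowsImageOverride = win-3.1.32-g1a2b3c4d5-20251204120000
+      # If only one override is set, the build stage still runs and the coalesce()
+      # expressions below fall back to the freshly built tag for the other platform.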
+ condition: | + and( + or( + eq(variables['Build.SourceBranch'], 'refs/heads/ci_prod'), + eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + contains(variables['Build.SourceBranch'], 'run-e2e') + ), + or( + eq(dependencies.Build_And_Publish_Images.result, 'Succeeded'), + and( + eq(dependencies.Build_And_Publish_Images.result, 'Skipped'), + ne(variables['LinuxImageOverride'], ''), + ne(variables['WindowsImageOverride'], '') + ) + ) + ) + variables: + # Use images built from previous build stage by default + # To override: Set pipeline variables 'LinuxImageOverride' and 'WindowsImageOverride' when queuing + linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.linuxImagetag'])] + windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.windowsImageTag'])] + jobs: + # TODO: gradually add more clusters from test automation framework when the tests are stable + # TODO: TeamsWebhookUri to be added + # Cluster 1: zane-test Cluster + - template: /.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml@self + parameters: + clusterName: 'zane-test' + resourceGroup: 'zane-test' + azureSubscription: 'ContainerInsights_Build_Subscription_CI' + environmentName: 'CI-Agent-Dev' + linuxImageTag: $(linuxImageTagUnderTest) + windowsImageTag: $(windowsImageTagUnderTest) + azureClientId: $(AksZaneTestClientId) + azureTenantId: $(AzureZaneTestTenantId) + teamsWebhookUri: $(TeamsWebhookUri) + + # Cluster 2: zane-test2 Cluster + - template: /.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml@self + parameters: + clusterName: 'zane-test2' + resourceGroup: 'zane-test' + azureSubscription: 'ContainerInsights_Build_Subscription_CI' + environmentName: 'CI-Agent-Dev2' + linuxImageTag: $(linuxImageTagUnderTest) + windowsImageTag: $(windowsImageTagUnderTest) + azureClientId: $(AksZaneTest2ClientId) + azureTenantId: $(AzureZaneTestTenantId) + teamsWebhookUri: $(TeamsWebhookUri) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml new file mode 100644 index 0000000000..69f557c8b4 --- /dev/null +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -0,0 +1,263 @@ +parameters: +- name: clusterName + type: string +- name: resourceGroup + type: string +- name: azureSubscription + type: string + default: 'ContainerInsights_Build_Subscription_CI' +- name: environmentName + type: string +- name: linuxImageTag + type: string +- name: windowsImageTag + type: string +- name: azureClientId + type: string +- name: azureTenantId + type: string +- name: teamsWebhookUri + type: string + default: '$(TeamsWebhookUri)' +- name: additionalTestParams + type: string + default: '' + +jobs: +- deployment: Deploy_${{ replace(parameters.clusterName, '-', '_') }} + displayName: 'Deploy & Test: ${{ parameters.clusterName }}' + environment: ${{ parameters.environmentName }} + pool: + name: Azure-Pipelines-CI-Test-EO + image: ci-1es-managed-ubuntu-2204 + os: linux + variables: + skipComponentGovernanceDetection: true + strategy: + runOnce: + deploy: + steps: + # Log deployment start + - bash: | + set -euo pipefail + + echo "=========================================" + echo "CLUSTER DEPLOYMENT STARTING" + echo "=========================================" + echo 
"Cluster: ${{ parameters.clusterName }}" + echo "Environment: ${{ parameters.environmentName }}" + echo "Build ID: $(Build.BuildId)" + echo "Pipeline Run: $(Build.BuildNumber)" + echo "" + echo "✓ Sequential deployment locking enabled at stage level" + echo "✓ Multiple pipeline runs will execute sequentially" + echo "=========================================" + displayName: 'Deployment Start' + + - checkout: self + persistCredentials: true + + - script: | + set -euo pipefail + echo "Ensuring kubectl & helm are installed" + if ! command -v kubectl >/dev/null 2>&1; then + echo "Installing kubectl" + sudo az aks install-cli + else + echo "kubectl already installed: $(kubectl version --client --short || true)" + fi + if ! command -v helm >/dev/null 2>&1; then + echo "Installing Helm 3" + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + else + echo "Helm already installed: $(helm version --short || true)" + fi + displayName: 'Install kubectl and Helm' + + - task: AzureCLI@2 + displayName: 'Get credentials for ${{ parameters.clusterName }}' + inputs: + azureSubscription: ${{ parameters.azureSubscription }} + scriptLocation: 'inlineScript' + scriptType: 'bash' + inlineScript: 'az aks get-credentials -g ${{ parameters.resourceGroup }} -n ${{ parameters.clusterName }}' + + # Determine MCR repository paths based on image tags. + - task: Bash@3 + name: DetermineMcrRepo + displayName: 'Determine MCR Repository Paths' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + inputs: + targetType: 'inline' + script: | + # Function to determine registry path based on image tag + # CI dev builds contain git hash pattern (e.g., -gbdc2f3f42-20250701203056) + # Production releases are simple versions (e.g., 3.1.32) + get_mcr_repo() { + local image_tag="$1" + if [[ "$image_tag" =~ -g[a-f0-9]+-[0-9]+ ]]; then + echo "mcr.microsoft.com/azuremonitor/containerinsights/cidev" + else + echo "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" + fi + } + + LINUX_MCR_REPO=$(get_mcr_repo "$LINUX_IMAGE_TAG") + WINDOWS_MCR_REPO=$(get_mcr_repo "$WINDOWS_IMAGE_TAG") + + echo "Repository Path Detection:" + echo " Linux image tag: $LINUX_IMAGE_TAG" + echo " → Linux MCR repo: $LINUX_MCR_REPO" + echo " Windows image tag: $WINDOWS_IMAGE_TAG" + echo " → Windows MCR repo: $WINDOWS_MCR_REPO" + + # Export for subsequent steps + echo "##vso[task.setvariable variable=linuxMcrRepo;isOutput=true]$LINUX_MCR_REPO" + echo "##vso[task.setvariable variable=windowsMcrRepo;isOutput=true]$WINDOWS_MCR_REPO" + + # TODO: consider to use helm chart when it is ready for aks deployment + - task: Bash@3 + displayName: 'Patch ama-logs pods with new images' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + echo "Deploying to cluster: ${{ parameters.clusterName }}" + echo " Linux image: $LINUX_MCR_REPO:$LINUX_IMAGE_TAG" + echo " Windows image: $WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + echo "" + echo "Finding and patching ama-logs pods in kube-system namespace..." 
+ + kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}' | while read pod_name; do + echo "Processing pod: $pod_name" + + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + IMG_URL="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + IMG_URL="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" + container_name="ama-logs" + else + echo " ⚠ Unknown pod pattern: $pod_name - skipping" + continue + fi + + echo " → Patching with image: $IMG_URL (container: $container_name)" + + kubectl patch pod "$pod_name" -n kube-system \ + --patch "{\"spec\": {\"containers\": [{\"name\": \"$container_name\", \"image\": \"$IMG_URL\"}]}}" \ + && echo " ✓ Successfully patched $pod_name" \ + || echo " ✗ Failed to patch $pod_name" + done + + echo "" + echo "Pod patching complete!" + echo "Current ama-logs pods:" + kubectl get pods -n kube-system | grep ama-logs + + # verify ci agent gets the new images + # output container start time for log analytics filtering + - task: Bash@3 + name: VerifyPods + displayName: 'Wait for pods to be ready with new images' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh pre-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Export container start time for use in tests + if [ -f /tmp/container-deployment-time.env ]; then + source /tmp/container-deployment-time.env + echo "Container start time captured: $CONTAINER_START_TIME" + echo "##vso[task.setvariable variable=CONTAINER_START_TIME;isOutput=true]$CONTAINER_START_TIME" + else + echo "ERROR: Container start time not found at /tmp/container-deployment-time.env" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + + - task: Bash@3 + displayName: 'Wait for logs to be ingested into Log Analytics (20 min)' + inputs: + targetType: 'inline' + script: | + echo "========================================" + echo "Waiting for Log Analytics Ingestion" + echo "========================================" + echo "Cluster: ${{ parameters.clusterName }}" + echo "Container start time: $(VerifyPods.CONTAINER_START_TIME)" + echo "" + echo "Waiting 20 minutes to allow logs to be ingested..." + echo "This ensures queries will find logs from the newly deployed containers." + echo "" + + wait_time=1200 + interval=60 + elapsed=0 + + while [ $elapsed -lt $wait_time ]; do + remaining=$((wait_time - elapsed)) + minutes_elapsed=$((elapsed / 60)) + minutes_remaining=$((remaining / 60)) + echo "⏳ Waiting... ($minutes_elapsed/$((wait_time / 60)) minutes elapsed, $minutes_remaining minutes remaining)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "" + echo "✓ Wait complete! Logs should now be available in Log Analytics." + echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(VerifyPods.CONTAINER_START_TIME)')" + echo "========================================" + # TODO (improvement): container start time is captured in previous step, but not used for now. 
Consider passing container start time to test script to use in log queries + - bash: | + # Pass container start time to tests + export CONTAINER_START_TIME="$(VerifyPods.CONTAINER_START_TIME)" + echo "Running tests for cluster: ${{ parameters.clusterName }}" + echo "Container start time: $CONTAINER_START_TIME" + + chmod +x ./install-and-execute-testkube-tests.sh + ./install-and-execute-testkube-tests.sh \ + AzureClientId=${{ parameters.azureClientId }} \ + AzureTenantId=${{ parameters.azureTenantId }} \ + TeamsWebhookUri=${{ parameters.teamsWebhookUri }} \ + ${{ parameters.additionalTestParams }} + workingDirectory: $(Build.SourcesDirectory)/test/testkube/ + displayName: 'Install Testkube and run E2E tests' + + - task: Bash@3 + displayName: 'Verify images remained stable after tests' + condition: always() + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh post-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Log deployment completion + - bash: | + echo "=========================================" + echo "DEPLOYMENT COMPLETE" + echo "=========================================" + echo "Cluster: ${{ parameters.clusterName }}" + echo "Build ID: $(Build.BuildId)" + echo "✓ Deployment finished for: ${{ parameters.clusterName }}" + echo "=========================================" + displayName: 'Deployment Completion' + condition: always() diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh new file mode 100644 index 0000000000..c7b4df2eb4 --- /dev/null +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -0,0 +1,388 @@ +#!/bin/bash +# Script to verify AKS pod images match expected tags +# Can be used for both pre-test and post-test verification + +set -e + +# Parse command line arguments +MODE="${1:-pre-test}" # pre-test or post-test +LINUX_IMAGE_TAG="${2}" +WINDOWS_IMAGE_TAG="${3}" +LINUX_MCR_REPO="${4}" +WINDOWS_MCR_REPO="${5}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +if [ "$MODE" = "pre-test" ]; then + echo "================================" + echo "Pre-Test Image Verification" + echo "================================" + echo "Verifying pods are running with new images and are ready..." +else + echo "================================" + echo "Post-Test Image Verification" + echo "================================" + echo "Verifying pods still have the correct images after test execution..." 
+fi + +echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Unified function to check all pods (with optional retry attempts) +# max_retries of 0 means instant check (no wait), otherwise retries up to max_retries times +check_all_pods() { + local -n configs_ref=$1 # Use different name to avoid circular reference + local max_retries=${2:-0} # Default to 0 (instant check, no retry) + local check_interval=15 # Wait 15 seconds between retries + + if [ $max_retries -gt 0 ]; then + # Wait mode (pre-test): Monitor pods with retries + local attempt=1 + + echo "================================" + echo "Waiting for all pods to be ready" + echo "================================" + echo "Total pods to check: ${#configs_ref[@]}" + echo "Maximum retries: $max_retries" + echo "Check interval: ${check_interval}s" + echo "Maximum wait time: $(((max_retries * check_interval) / 60)) minutes" + echo "" + + # Track ready status for each pod + declare -A pod_ready_status + for config in "${configs_ref[@]}"; do + pod_name=$(echo "$config" | cut -d: -f1) + pod_ready_status["$pod_name"]=false + done + + while [ $attempt -le $max_retries ]; do + local all_ready=true + local ready_count=0 + local total_count=${#configs_ref[@]} + + # Check each pod in this iteration + for config in "${configs_ref[@]}"; do + IFS=':' read -r pod_name expected_image container_name <<< "$config" + + # Skip if already marked as ready + if [ "${pod_ready_status[$pod_name]}" = "true" ]; then + ((ready_count++)) + continue + fi + + # DEBUG: Try alternative methods to get the image + # Method 1: Original jsonpath (what we've been using) + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") + + # Method 2: If method 1 is empty, try getting first container image + if [ -z "$current_image" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "") + echo " [DEBUG] Method 1 (jsonpath filter) returned empty, trying method 2 (first container)" + echo " [DEBUG] Method 2 result: $current_image" + fi + + # Method 3: If still empty, try go-template + if [ -z "$current_image" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o go-template='{{range .spec.containers}}{{if eq .name "'"$container_name"'"}}{{.image}}{{end}}{{end}}' 2>/dev/null || echo "") + echo " [DEBUG] Method 2 also empty, trying method 3 (go-template)" + echo " [DEBUG] Method 3 result: $current_image" + fi + + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + + # Try similar methods for container ready + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "") + if [ -z "$container_ready" ]; then + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") + fi + + # Check if pod is ready + if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then + pod_ready_status["$pod_name"]=true + ((ready_count++)) + echo " ✓ $pod_name - Ready" + else + all_ready=false + + # Show status for 
pods that aren't ready yet + if [ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ]; then # Log every 60 seconds + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + if [[ "$current_image" != "$expected_image" ]]; then + echo " Image mismatch: expected $expected_image, got $current_image" + fi + fi + fi + done + + # Show progress summary + local elapsed_seconds=$(((attempt - 1) * check_interval)) + local minutes_elapsed=$((elapsed_seconds / 60)) + local seconds_elapsed=$((elapsed_seconds % 60)) + local remaining_retries=$((max_retries - attempt)) + local remaining_seconds=$((remaining_retries * check_interval)) + local minutes_remaining=$((remaining_seconds / 60)) + local seconds_remaining=$((remaining_seconds % 60)) + + if [ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ] || [ "$all_ready" = true ]; then + echo "" + echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" + echo "Progress: $ready_count/$total_count pods ready" + echo "" + fi + + # Exit early if all pods are ready + if [ "$all_ready" = true ]; then + echo "================================" + echo "✓ SUCCESS: All pods are ready!" + echo "================================" + echo "Total attempts: $attempt" + echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" + echo "" + return 0 + fi + + # Don't sleep after the last attempt + if [ $attempt -lt $max_retries ]; then + sleep $check_interval + fi + + ((attempt++)) + done + + # Max retries reached - report which pods failed + echo "================================" + echo "✗ MAX RETRIES REACHED: Not all pods became ready after $max_retries attempts" + echo "================================" + echo "" + echo "Failed pods:" + for config in "${configs_ref[@]}"; do + IFS=':' read -r pod_name expected_image container_name <<< "$config" + if [ "${pod_ready_status[$pod_name]}" != "true" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") + + echo " ✗ $pod_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + fi + done + echo "" + + return 1 + else + # Instant check mode (post-test): Single check, no waiting + local mismatches=() + + echo "Performing instant verification of all pods..." + echo "" + + for config in "${configs_ref[@]}"; do + IFS=':' read -r pod_name expected_image container_name <<< "$config" + + # Use first container image as fallback + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + + echo "Pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + + if [[ "$current_image" != "$expected_image" ]]; then + echo " ✗ IMAGE MISMATCH DETECTED!" 
+ mismatches+=("$pod_name: expected '$expected_image' but found '$current_image'") + else + echo " ✓ Image is correct" + fi + echo "" + done + + # Return mismatches via global array (bash limitation workaround) + image_mismatches=("${mismatches[@]}") + + if [ ${#mismatches[@]} -eq 0 ]; then + return 0 + else + return 1 + fi + fi +} + +# Get all ama-logs pods +echo "Getting list of ama-logs pods..." +pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + +# Build configurations for all pods +pod_configs=() +image_mismatches=() + +for pod_name in $pod_list; do + # Determine expected image based on pod type + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + expected_image="$WINDOWS_IMAGE" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + # Matches both ReplicaSet pods (ama-logs-rs-*) and DaemonSet pods (ama-logs-xxxxx) + expected_image="$LINUX_IMAGE" + container_name="ama-logs" + else + echo "⚠ Unknown pod pattern: $pod_name - skipping verification" + continue + fi + + # Add to configurations for parallel checking + pod_configs+=("$pod_name:$expected_image:$container_name") +done + +echo "Found ${#pod_configs[@]} pods to verify" +echo "" + +# Use different check based on mode +if [ "$MODE" = "pre-test" ]; then + # Pre-test: Wait for all pods to be ready (60 retries × 15s = 15 minutes max) + if ! check_all_pods pod_configs 60; then + # Function already reports which pods failed + failed_pods=true + else + failed_pods=false + fi +else + # Post-test: Instant check of all pods (no retry) + check_all_pods pod_configs 0 +fi + +echo "" +echo "================================" +if [ "$MODE" = "pre-test" ]; then + echo "Pre-Test Verification Summary" +else + echo "Post-Test Verification Summary" +fi +echo "================================" + +# Report results based on mode +if [ "$MODE" = "pre-test" ]; then + if [ "$failed_pods" = false ]; then + echo "✓ All pods are running with the correct images and are ready!" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Image verification:" + kubectl get pods -n kube-system -l component=ama-logs-agent -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].image}{"\t"}{.status.phase}{"\t"}{.status.containerStatuses[0].ready}{"\n"}{end}' | column -t 2>/dev/null || true + + echo "" + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
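+    # Assumption: kubectl reports startedAt as an RFC 3339 UTC timestamp
+    # (e.g. "2025-12-04T23:41:13Z"), so the plain string comparison used below
+    # also orders timestamps chronologically; no date parsing is needed.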
+ + # Get all container start times and find the LATEST one + latest_start_time="" + + pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + for pod_name in $pod_list; do + # Get container name based on pod type + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + container_name="ama-logs" + else + continue + fi + + # Get container start time - try first container if filter doesn't work + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -z "$start_time" ]; then + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") + fi + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + + exit 0 + else + echo "✗ Pod verification failed (see details above)" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 1 + fi +else + # Post-test mode + if [ ${#image_mismatches[@]} -eq 0 ]; then + echo "✓ SUCCESS: All pods maintained the correct images throughout the test execution!" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Image summary:" + kubectl get pods -n kube-system -l component=ama-logs-agent -o custom-columns=NAME:.metadata.name,IMAGE:.spec.containers[0].image,STATUS:.status.phase,READY:.status.containerStatuses[0].ready 2>/dev/null || true + exit 0 + else + echo "✗ FAILURE: Some pods changed images during test execution!" + echo "" + echo "Pods with image mismatches:" + printf ' - %s\n' "${image_mismatches[@]}" + echo "" + echo "This indicates the pods may have been restarted or updated during testing." + echo "This could cause test instability or false results." 
+ echo "" + echo "Current pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Detailed pod information:" + for mismatch in "${image_mismatches[@]}"; do + pod=$(echo "$mismatch" | cut -d: -f1) + echo "" + echo "--- Details for $pod ---" + kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30 + done + exit 1 + fi +fi diff --git a/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md b/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md new file mode 100644 index 0000000000..98bbaf4319 --- /dev/null +++ b/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md @@ -0,0 +1,417 @@ +# CI Agent Auto-Deploy Implementation Plan + +## Overview +This document outlines the implementation plan for enabling auto-deployment of CI Agent to a dev cluster on every PR merge to main branch, following the Prom Agent pattern. + +**Goal:** Automatically deploy freshly built CI agent images to a dev cluster after each successful build on main branch. + +**Pattern:** Based on Prom Agent's `azure-pipeline-build.yml` approach - sequential deployments using `helm upgrade --install`. + +--- + +## Key Findings + +### ✅ No Chart Modifications Needed +- **ServiceAccount**: Hardcoded `ama-logs` works fine for sequential deployments +- **Image Tags**: Can be overridden via `--set` flags at deployment time +- **Release Name**: Using same release name (`ama-logs-dev`) for all deployments allows Helm to upgrade in place + +### ✅ Prom Agent Pattern +- Uses `helm upgrade --install` with same release name every time +- Deploys to different clusters (not multiple releases per cluster) +- Each cluster has exactly ONE release +- No ServiceAccount conflicts with sequential deployments + +--- + +## Implementation Changes + +### 1. Pipeline Modification + +**File:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` + +**Add Deployment Stage** after existing build stages: + +```yaml +- stage: Deploy_Dev_Cluster + displayName: Deploy to Dev Cluster + dependsOn: + - BuildLinuxImages + - BuildWindowsImages + # Only deploy on main branch merges (not PRs) + condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + + jobs: + - deployment: Deploy_AKS_Chart + displayName: "Deploy: AKS dev cluster" + environment: CI-Agent-Dev # Create this environment in Azure DevOps + pool: + name: Azure-Pipelines-CI-Test-EO + + variables: + # Get image tags from build stages + linuxImageTag: $[ stageDependencies.BuildLinuxImages.Build.outputs['setImageTag.linuxTag'] ] + windowsImageTag: $[ stageDependencies.BuildWindowsImages.Build.outputs['setImageTag.windowsTag'] ] + + strategy: + runOnce: + deploy: + steps: + - checkout: self + + - task: HelmDeploy@0 + displayName: "Deploy to dev cluster" + inputs: + connectionType: 'Azure Resource Manager' + azureSubscription: 'ContainerInsights_Build_Subscription(9b96ebbd-c57a-42d1-bbe9-b69296e4c7fb)' + azureResourceGroup: 'YOUR-DEV-CLUSTER-RG' + kubernetesCluster: 'YOUR-DEV-CLUSTER-NAME' + useClusterAdmin: true + namespace: 'kube-system' + command: 'upgrade' + chartType: 'FilePath' + chartPath: '$(Build.SourcesDirectory)/charts/azuremonitor-containers/' + releaseName: 'ama-logs-dev' + overrideValues: | + amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev + amalogs.image.tag=$(linuxImageTag) + amalogs.image.tagWindows=$(windowsImageTag) + arguments: '--install --create-namespace' +``` + +--- + +### 2. 
Ensure Build Stages Export Image Tags + +**Verify in BuildLinuxImages stage:** + +```yaml +- stage: BuildLinuxImages + jobs: + - job: Build + steps: + # ... existing build steps ... + + # Add this step to export tag + - script: | + echo "##vso[task.setvariable variable=linuxTag;isOutput=true]$(IMAGE_TAG)" + name: setImageTag + displayName: Export Linux image tag +``` + +**Verify in BuildWindowsImages stage:** + +```yaml +- stage: BuildWindowsImages + jobs: + - job: Build + steps: + # ... existing build steps ... + + # Add this step to export tag + - script: | + echo "##vso[task.setvariable variable=windowsTag;isOutput=true]$(IMAGE_TAG)" + name: setImageTag + displayName: Export Windows image tag +``` + +--- + +### 3. Configuration Updates + +**Replace these placeholders with actual values:** + +| Placeholder | Description | Example Value | +|-------------|-------------|---------------| +| `YOUR-DEV-CLUSTER-RG` | Resource group containing dev cluster | `ci-dev-aks-rg` | +| `YOUR-DEV-CLUSTER-NAME` | Name of dev AKS cluster | `ci-dev-aks-eus` | + +**Optional: Add more overrides for dev-specific configuration:** + +```yaml +overrideValues: | + amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev + amalogs.image.tag=$(linuxImageTag) + amalogs.image.tagWindows=$(windowsImageTag) + amalogs.secret.wsid=YOUR-DEV-WORKSPACE-ID + amalogs.secret.key=YOUR-DEV-WORKSPACE-KEY + amalogs.env.clusterName=ci-dev-cluster + amalogs.ISTEST=true +``` + +--- + +### 4. Azure DevOps Environment Setup + +**Create deployment environment:** +1. Navigate to: Azure DevOps → Pipelines → Environments +2. Click "New environment" +3. Name: `CI-Agent-Dev` +4. Resource: None (environment-only) +5. (Optional) Add approval gates if needed + +--- + +## Chart Details - No Modifications Required + +### ServiceAccount Handling +- **Current:** Hardcoded as `ama-logs` +- **Works because:** Sequential deployments reuse same ServiceAccount +- **Pattern:** `helm upgrade` updates existing resources, doesn't recreate + +### Image Tag Handling +- **Current:** Hardcoded in `values.yaml` +- **Override:** Via `--set` flags at deployment time +- **Files affected:** None (pure runtime override) + +### Files with ServiceAccount References (No changes needed) +1. `templates/ama-logs-rbac.yaml` - Creates ServiceAccount `ama-logs` +2. `templates/ama-logs-daemonset.yaml` - References `serviceAccountName: ama-logs` +3. `templates/ama-logs-daemonset-windows.yaml` - References `serviceAccountName: ama-logs` +4. `templates/ama-logs-deployment.yaml` - References `serviceAccountName: ama-logs` + +--- + +## How It Works + +### Deployment Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ 1. PR Merged to Main Branch │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 2. Build Pipeline Triggered │ +│ - BuildLinuxImages stage → produces linuxImageTag │ +│ - BuildWindowsImages stage → produces windowsImageTag │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 3. Deploy_Dev_Cluster Stage │ +│ - Gets image tags from build stages │ +│ - Runs: helm upgrade ama-logs-dev --install │ +│ - Overrides: image.tag=$(linuxImageTag) │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 4. 
Helm Deployment on Dev Cluster │ +│ - First run: Creates new release "ama-logs-dev" │ +│ - Subsequent runs: Updates existing release │ +│ - ServiceAccount "ama-logs" reused (no conflicts) │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 5. Dev Cluster Running Latest Build │ +│ - DaemonSet updated with new image tags │ +│ - Windows DaemonSet updated with new image tags │ +│ - Deployment (ReplicaSet) updated with new image tags │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Sequential Deployment Example + +```bash +# Build 1 - Creates initial deployment +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=3.1.30-20231101 \ + --set amalogs.image.tagWindows=win-3.1.30-20231101 +# Result: New release created, ServiceAccount "ama-logs" created + +# Build 2 - Updates existing deployment +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=3.1.30-20231102 \ + --set amalogs.image.tagWindows=win-3.1.30-20231102 +# Result: Release updated, ServiceAccount "ama-logs" reused ✅ + +# Build 3 - Updates existing deployment +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=3.1.30-20231103 \ + --set amalogs.image.tagWindows=win-3.1.30-20231103 +# Result: Release updated, ServiceAccount "ama-logs" reused ✅ +``` + +--- + +## Testing Plan + +### Pre-Deployment Testing + +1. **Validate Chart Templates:** +```bash +cd Docker-Provider/charts/azuremonitor-containers +helm template ama-logs-dev . \ + --set amalogs.image.tag=test-tag \ + --set amalogs.image.tagWindows=test-tag-win \ + --debug +``` + +2. **Dry Run Deployment:** +```bash +helm upgrade ama-logs-dev . --install \ + --namespace kube-system \ + --set amalogs.image.tag=test-tag \ + --dry-run --debug +``` + +### Post-Deployment Validation + +1. **Check Pipeline Execution:** + - Verify Deploy_Dev_Cluster stage runs + - Check image tags are passed correctly + - Confirm Helm deployment succeeds + +2. **Verify Cluster Deployment:** +```bash +# Check pods are running +kubectl get pods -n kube-system | grep ama-logs + +# Verify DaemonSet +kubectl describe daemonset ama-logs -n kube-system + +# Verify Windows DaemonSet +kubectl describe daemonset ama-logs-win -n kube-system + +# Verify Deployment (ReplicaSet) +kubectl describe deployment ama-logs-rs -n kube-system + +# Check image tags match build +kubectl get daemonset ama-logs -n kube-system -o jsonpath='{.spec.template.spec.containers[0].image}' +``` + +3. 
**Verify ServiceAccount:** +```bash +# Confirm ServiceAccount exists and is used +kubectl get serviceaccount ama-logs -n kube-system +kubectl get pods -n kube-system -l dsName=ama-logs-ds -o jsonpath='{.items[0].spec.serviceAccountName}' +``` + +--- + +## Rollback Plan + +If deployment fails or causes issues: + +### Option 1: Rollback via Helm +```bash +# List releases +helm list -n kube-system + +# Rollback to previous version +helm rollback ama-logs-dev -n kube-system +``` + +### Option 2: Manual Revert +```bash +# Revert to specific image version +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=PREVIOUS-WORKING-TAG \ + --set amalogs.image.tagWindows=PREVIOUS-WORKING-TAG-win +``` + +### Option 3: Remove Pipeline Stage +- Comment out `Deploy_Dev_Cluster` stage in pipeline +- Commit and push +- Cluster remains at current version + +--- + +## Comparison: CI Agent vs Prom Agent + +| Aspect | Prom Agent | CI Agent (This Plan) | +|--------|-----------|---------------------| +| **Chart Changes** | None | None | +| **ServiceAccount** | Hardcoded `ama-metrics-serviceaccount` | Hardcoded `ama-logs` | +| **Deployment Method** | `helm upgrade --install` | `helm upgrade --install` | +| **Release Name** | `ama-metrics` | `ama-logs-dev` | +| **Image Override** | `--set image.tag=...` | `--set amalogs.image.tag=...` | +| **Multiple Versions** | ❌ Not supported | ❌ Not supported (sequential only) | +| **Cluster Strategy** | One release per cluster | One release per cluster | + +--- + +## Estimated Effort + +| Task | Effort | Notes | +|------|--------|-------| +| Add deployment stage to pipeline | 30 min | Copy from Prom agent pattern | +| Update cluster name/RG variables | 5 min | Simple config update | +| Create Azure DevOps environment | 5 min | One-time setup | +| Verify build tag exports | 15 min | May already exist | +| Test dry-run deployment | 15 min | Validate before merge | +| Deploy and validate | 30 min | First deployment + verification | +| **Total** | **~2 hours** | Including testing and validation | + +--- + +## Future Enhancements (Optional) + +### 1. Add E2E Tests Post-Deployment +Similar to Prom agent's TestKube integration: +```yaml +- job: Run_E2E_Tests + dependsOn: Deploy_AKS_Chart + steps: + - script: kubectl testkube run testsuite ci-agent-e2e-tests +``` + +### 2. Deploy to Multiple Dev Clusters +Add additional deployment jobs for different regions: +```yaml +- deployment: Deploy_EUS_Cluster + cluster: ci-dev-aks-eus + +- deployment: Deploy_WUS_Cluster + cluster: ci-dev-aks-wus +``` + +### 3. Slack/Teams Notifications +Notify team of successful deployments: +```yaml +- task: SlackNotification@1 + inputs: + message: "✅ CI Agent $(linuxImageTag) deployed to dev cluster" +``` + +--- + +## References + +- **Prom Agent Build Pipeline:** `prometheus-collector/.pipelines/azure-pipeline-build.yml` +- **CI Agent Current Pipeline:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` +- **Helm Chart:** `Docker-Provider/charts/azuremonitor-containers/` +- **Prom Agent Chart:** `prometheus-collector/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/` + +--- + +## Questions & Answers + +### Q: Why not use Release.Name for ServiceAccount? +**A:** Not needed for sequential deployments. Same release name = same ServiceAccount = no conflicts. Only needed for parallel deployments (multiple versions simultaneously). + +### Q: Can we deploy multiple versions to same cluster? +**A:** No, with current approach (hardcoded ServiceAccount). 
Would require chart modifications to use `{{ .Release.Name }}` pattern. Not recommended unless specifically needed. + +### Q: What if build fails? +**A:** Deploy stage has `condition: succeeded()` - won't run if build fails. Cluster stays at previous version. + +### Q: How to deploy to production? +**A:** This plan is for dev cluster only. Production deployments should continue using existing release pipeline with proper approvals and phased rollouts. + +--- + +## Status + +- [x] Research Prom agent pattern +- [x] Document findings +- [x] Create implementation plan +- [ ] Update pipeline with deployment stage +- [ ] Test deployment to dev cluster +- [ ] Validate with team +- [ ] Merge to main branch + +--- + +**Last Updated:** 2025-11-07 +**Author:** Implementation plan based on Prom agent analysis +**Status:** Ready for implementation diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index 04a273fe69..29727dcfba 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -1304,4 +1304,7 @@ testkube-operator: # ref: https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#node-affinity-multi-arch-arm # -- Tolerations to schedule a workload to nodes with any architecture type. Required for deployment to GKE cluster. tolerations: [] +<<<<<<< HEAD +======= +>>>>>>> efd34efac (add bebugging) diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index 164be7fe69..1df4bccf47 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -125,4 +125,4 @@ EOF # Explicitly fail the ADO task since at least one test failed exit 1 -fi \ No newline at end of file +fi From 0586eec790d6bd270622e07e204b5eada50999da Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 15:55:52 -0800 Subject: [PATCH 02/40] debug --- .pipelines/e2e-test/verify-pod-images.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index c7b4df2eb4..86ac0f3410 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -76,7 +76,12 @@ check_all_pods() { # Check each pod in this iteration for config in "${configs_ref[@]}"; do + echo " [DEBUG] Raw config string: '$config'" IFS=':' read -r pod_name expected_image container_name <<< "$config" + echo " [DEBUG] Parsed values:" + echo " pod_name='$pod_name'" + echo " expected_image='$expected_image'" + echo " container_name='$container_name'" # Skip if already marked as ready if [ "${pod_ready_status[$pod_name]}" = "true" ]; then @@ -86,7 +91,9 @@ check_all_pods() { # DEBUG: Try alternative methods to get the image # Method 1: Original jsonpath (what we've been using) + echo " [DEBUG] Attempting kubectl jsonpath query..." 
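+            # jsonpath filter [?(@.name=='$container_name')] selects the
+            # containers[] entry whose .name matches, then .image extracts
+            # that container's image reference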
current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "")
+            echo "    [DEBUG] Method 1 result: '$current_image'"

             # Method 2: If method 1 is empty, try getting first container image
             if [ -z "$current_image" ]; then

From e89108833dbeaf29859f38f2b32f7ffb5392e7a8 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 16:13:41 -0800
Subject: [PATCH 03/40] fix IFS splitting

---
 .pipelines/e2e-test/verify-pod-images.sh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh
index 86ac0f3410..97fdb67ca7 100644
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ b/.pipelines/e2e-test/verify-pod-images.sh
@@ -65,7 +65,7 @@ check_all_pods() {
         # Track ready status for each pod
         declare -A pod_ready_status
         for config in "${configs_ref[@]}"; do
-            pod_name=$(echo "$config" | cut -d: -f1)
+            pod_name=$(echo "$config" | cut -d'|' -f1)
             pod_ready_status["$pod_name"]=false
         done
@@ -77,7 +77,7 @@ check_all_pods() {
             # Check each pod in this iteration
             for config in "${configs_ref[@]}"; do
                 echo "    [DEBUG] Raw config string: '$config'"
-                IFS=':' read -r pod_name expected_image container_name <<< "$config"
+                IFS='|' read -r pod_name expected_image container_name <<< "$config"
                 echo "    [DEBUG] Parsed values:"
                 echo "      pod_name='$pod_name'"
                 echo "      expected_image='$expected_image'"
                 echo "      container_name='$container_name'"
@@ -177,7 +177,7 @@ check_all_pods() {
         echo ""
         echo "Failed pods:"
         for config in "${configs_ref[@]}"; do
-            IFS=':' read -r pod_name expected_image container_name <<< "$config"
+            IFS='|' read -r pod_name expected_image container_name <<< "$config"
             if [ "${pod_ready_status[$pod_name]}" != "true" ]; then
                 current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR")
                 pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown")
@@ -201,7 +201,7 @@ check_all_pods() {
         echo ""

         for config in "${configs_ref[@]}"; do
-            IFS=':' read -r pod_name expected_image container_name <<< "$config"
+            IFS='|' read -r pod_name expected_image container_name <<< "$config"

             # Use first container image as fallback
             current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR")
@@ -256,7 +256,8 @@ for pod_name in $pod_list; do
     fi

     # Add to configurations for parallel checking
-    pod_configs+=("$pod_name:$expected_image:$container_name")
+    # Use | as delimiter since colons appear in image tags (e.g., ciprod:3.1.31)
+    pod_configs+=("$pod_name|$expected_image|$container_name")
 done

From a2e61c995156c4e0e68dc273f33d36a9ffc7dd27 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 16:54:50 -0800
Subject: [PATCH 04/40] more parsing fixes

---
 .pipelines/e2e-test/verify-pod-images.sh | 31 +++++------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh
index 97fdb67ca7..db16ff7da9 100644
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ b/.pipelines/e2e-test/verify-pod-images.sh
@@ -76,9 +76,9 @@ check_all_pods() {

             # Check each pod in this iteration
             for config in "${configs_ref[@]}"; do
-                echo "    [DEBUG] Raw config string: '$config'"
+                echo "    Raw config string: '$config'"
                 IFS='|' read -r pod_name expected_image container_name <<< 
"$config" - echo " [DEBUG] Parsed values:" + echo " Parsed values:" echo " pod_name='$pod_name'" echo " expected_image='$expected_image'" echo " container_name='$container_name'" @@ -89,29 +89,10 @@ check_all_pods() { continue fi - # DEBUG: Try alternative methods to get the image - # Method 1: Original jsonpath (what we've been using) - echo " [DEBUG] Attempting kubectl jsonpath query..." current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") - echo " [DEBUG] Method 1 result: '$current_image'" - - # Method 2: If method 1 is empty, try getting first container image - if [ -z "$current_image" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "") - echo " [DEBUG] Method 1 (jsonpath filter) returned empty, trying method 2 (first container)" - echo " [DEBUG] Method 2 result: $current_image" - fi - - # Method 3: If still empty, try go-template - if [ -z "$current_image" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o go-template='{{range .spec.containers}}{{if eq .name "'"$container_name"'"}}{{.image}}{{end}}{{end}}' 2>/dev/null || echo "") - echo " [DEBUG] Method 2 also empty, trying method 3 (go-template)" - echo " [DEBUG] Method 3 result: $current_image" - fi pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - # Try similar methods for container ready container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "") if [ -z "$container_ready" ]; then container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") @@ -179,9 +160,9 @@ check_all_pods() { for config in "${configs_ref[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" if [ "${pod_ready_status[$pod_name]}" != "true" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") echo " ✗ $pod_name" echo " Expected image: $expected_image" @@ -203,8 +184,8 @@ check_all_pods() { for config in "${configs_ref[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - # Use first container image as fallback - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + # Use correct container name from config + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") echo "Pod: $pod_name" From 
c0b44d53e72d114531a4378fb82b87b4d01d6fa2 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 18:02:58 -0800
Subject: [PATCH 05/40] fix image verification container name

---
 .pipelines/e2e-test/verify-pod-images.sh | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh
index db16ff7da9..b3be17596a 100644
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ b/.pipelines/e2e-test/verify-pod-images.sh
@@ -274,10 +274,6 @@ if [ "$MODE" = "pre-test" ]; then
         echo ""
         echo "Final pod status:"
         kubectl get pods -n kube-system | grep ama-logs
-        echo ""
-        echo "Image verification:"
-        kubectl get pods -n kube-system -l component=ama-logs-agent -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].image}{"\t"}{.status.phase}{"\t"}{.status.containerStatuses[0].ready}{"\n"}{end}' | column -t 2>/dev/null || true
-
         echo ""
         echo "================================"
         echo "Container Start Time Capture"
         echo "================================"
@@ -349,9 +345,6 @@ else
         echo ""
         echo "Final pod status:"
         kubectl get pods -n kube-system | grep ama-logs
-        echo ""
-        echo "Image summary:"
-        kubectl get pods -n kube-system -l component=ama-logs-agent -o custom-columns=NAME:.metadata.name,IMAGE:.spec.containers[0].image,STATUS:.status.phase,READY:.status.containerStatuses[0].ready 2>/dev/null || true
         exit 0
     else
         echo "✗ FAILURE: Some pods changed images during test execution!"

From 039301f5807792db164aef3b8fb465eb217f9155 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 18:14:16 -0800
Subject: [PATCH 06/40] use short wait time temporarily

---
 .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml
index 69f557c8b4..82e4718237 100644
--- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml
+++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml
@@ -203,7 +203,7 @@ jobs:
               echo "Waiting 20 minutes to allow logs to be ingested..."
               echo "This ensures queries will find logs from the newly deployed containers."
echo "" - wait_time=1200 + wait_time=60 #TODO: change back to 1200 (20 minutes) after testing interval=60 elapsed=0 From 33f8b32c116c14d836fe2a8886a75ba9c1e6f55b Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 20:05:38 -0800 Subject: [PATCH 07/40] simplify --- .pipelines/e2e-test/verify-pod-images.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index b3be17596a..8485f85eb3 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -90,13 +90,8 @@ check_all_pods() { fi current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") - pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "") - if [ -z "$container_ready" ]; then - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") - fi + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then From 4fb699db1ab69e15a75156b8a54209a27c9fdfa0 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 20:18:49 -0800 Subject: [PATCH 08/40] refactor --- .pipelines/e2e-test/verify-pod-images.sh | 32 ++++++++++-------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index 8485f85eb3..0b6de6995f 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -47,7 +47,7 @@ echo "" check_all_pods() { local -n configs_ref=$1 # Use different name to avoid circular reference local max_retries=${2:-0} # Default to 0 (instant check, no retry) - local check_interval=15 # Wait 15 seconds between retries + local check_interval=60 # Wait 60 seconds between retries if [ $max_retries -gt 0 ]; then # Wait mode (pre-test): Monitor pods with retries @@ -70,7 +70,7 @@ check_all_pods() { done while [ $attempt -le $max_retries ]; do - local all_ready=true + local has_not_ready_pod=false local ready_count=0 local total_count=${#configs_ref[@]} @@ -99,14 +99,10 @@ check_all_pods() { ((ready_count++)) echo " ✓ $pod_name - Ready" else - all_ready=false - - # Show status for pods that aren't ready yet - if [ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ]; then # Log every 60 seconds - echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" - if [[ "$current_image" != "$expected_image" ]]; then - echo " Image mismatch: expected $expected_image, got $current_image" - fi + has_not_ready_pod=true + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + if [[ "$current_image" != "$expected_image" ]]; then + echo " Image mismatch: expected $expected_image, got $current_image" fi fi done @@ -120,15 +116,13 @@ check_all_pods() { local minutes_remaining=$((remaining_seconds / 60)) local seconds_remaining=$((remaining_seconds % 60)) - if 
[ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ] || [ "$all_ready" = true ]; then - echo "" - echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" - echo "Progress: $ready_count/$total_count pods ready" - echo "" - fi + echo "" + echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" + echo "Progress: $ready_count/$total_count pods ready" + echo "" # Exit early if all pods are ready - if [ "$all_ready" = true ]; then + if [ "$has_not_ready_pod" = false ]; then echo "================================" echo "✓ SUCCESS: All pods are ready!" echo "================================" @@ -241,8 +235,8 @@ echo "" # Use different check based on mode if [ "$MODE" = "pre-test" ]; then - # Pre-test: Wait for all pods to be ready (60 retries × 15s = 15 minutes max) - if ! check_all_pods pod_configs 60; then + # Pre-test: Wait for all pods to be ready (15 retries × 60s = 15 minutes max) + if ! check_all_pods pod_configs 15; then # Function already reports which pods failed failed_pods=true else From 354b53c57613fda21d36fa1f32df5277904d7f35 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 23:04:06 -0800 Subject: [PATCH 09/40] post check improve --- .pipelines/e2e-test/verify-pod-images.sh | 30 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index 0b6de6995f..39165ef700 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -173,21 +173,41 @@ check_all_pods() { for config in "${configs_ref[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - # Use correct container name from config + # Get pod details current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") echo "Pod: $pod_name" echo " Container: $container_name" echo " Expected image: $expected_image" echo " Current image: $current_image" echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + + # Check for any issues + local has_issue=false if [[ "$current_image" != "$expected_image" ]]; then - echo " ✗ IMAGE MISMATCH DETECTED!" - mismatches+=("$pod_name: expected '$expected_image' but found '$current_image'") - else - echo " ✓ Image is correct" + echo " ✗ IMAGE MISMATCH!" + mismatches+=("$pod_name: expected image '$expected_image' but found '$current_image'") + has_issue=true + fi + + if [[ "$pod_status" != "Running" ]]; then + echo " ✗ POD NOT RUNNING!" + mismatches+=("$pod_name: pod status is '$pod_status' (expected 'Running')") + has_issue=true + fi + + if [[ "$container_ready" != "true" ]]; then + echo " ✗ CONTAINER NOT READY!" 
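The post-test check now gates on three independent signals per pod: the spec image, the pod phase, and the container's ready flag. Each currently costs its own kubectl round-trip; below is a minimal sketch of the same predicate served from a single `kubectl get -o json` call, assuming jq is available on the agent image (the pod, container, and image names are placeholders):

#!/bin/bash
# Hypothetical consolidation of the three per-pod queries into one API call.
pod_healthy() {
  local pod="$1" container="$2" expected_image="$3" json
  json=$(kubectl get pod "$pod" -n kube-system -o json 2>/dev/null) || return 1
  local image phase ready
  image=$(jq -r --arg c "$container" '.spec.containers[] | select(.name==$c) | .image' <<<"$json")
  phase=$(jq -r '.status.phase' <<<"$json")
  ready=$(jq -r --arg c "$container" '.status.containerStatuses[] | select(.name==$c) | .ready' <<<"$json")
  [[ "$image" == "$expected_image" && "$phase" == "Running" && "$ready" == "true" ]]
}

pod_healthy ama-logs-k4n2x ama-logs "mcr.example.com/ama-logs:testtag" && echo "healthy"

Reading the pod once also means the three fields come from the same object snapshot rather than three racing reads.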
+ mismatches+=("$pod_name: container '$container_name' is not ready") + has_issue=true + fi + + if [[ "$has_issue" = false ]]; then + echo " ✓ All checks passed" fi echo "" done From 744de21bf53e4f899beeccb3928efa59818abf71 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 10:30:33 -0800 Subject: [PATCH 10/40] refactor pod verify script --- ...eploy-and-test-ci-image-in-aks-cluster.yml | 17 +- .pipelines/e2e-test/post-test-verify-pods.sh | 123 ++++++++++ .pipelines/e2e-test/pre-test-verify-pods.sh | 217 ++++++++++++++++++ .pipelines/e2e-test/util.sh | 46 ++++ .pipelines/e2e-test/verify-pod-images.sh | 132 ++++++----- 5 files changed, 470 insertions(+), 65 deletions(-) create mode 100644 .pipelines/e2e-test/post-test-verify-pods.sh create mode 100644 .pipelines/e2e-test/pre-test-verify-pods.sh create mode 100644 .pipelines/e2e-test/util.sh diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index 82e4718237..ce9ae339f4 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -161,11 +161,11 @@ jobs: echo "Current ama-logs pods:" kubectl get pods -n kube-system | grep ama-logs - # verify ci agent gets the new images - # output container start time for log analytics filtering + # Pre-test verification: Wait for pods to be ready with new images + # Outputs container start time for Log Analytics query filtering - task: Bash@3 name: VerifyPods - displayName: 'Wait for pods to be ready with new images' + displayName: 'Pre-Test: Wait for pods to be ready with new images' env: LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} @@ -174,8 +174,8 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh pre-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Export container start time for use in tests if [ -f /tmp/container-deployment-time.env ]; then @@ -236,8 +236,9 @@ jobs: workingDirectory: $(Build.SourcesDirectory)/test/testkube/ displayName: 'Install Testkube and run E2E tests' + # Post-test verification: Check pods are still healthy after test execution - task: Bash@3 - displayName: 'Verify images remained stable after tests' + displayName: 'Post-Test: Verify pods remained stable after tests' condition: always() env: LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} @@ -247,8 +248,8 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh post-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Log deployment completion - bash: | diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh 
b/.pipelines/e2e-test/post-test-verify-pods.sh new file mode 100644 index 0000000000..61170d4321 --- /dev/null +++ b/.pipelines/e2e-test/post-test-verify-pods.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# Post-Test Pod Verification +# Performs a quick health check to ensure pods maintained correct images and are still healthy +# This script is used AFTER running E2E tests to detect any pod restarts or issues during testing + +set -e + +# Parse command line arguments +LINUX_IMAGE_TAG="${1}" +WINDOWS_IMAGE_TAG="${2}" +LINUX_MCR_REPO="${3}" +WINDOWS_MCR_REPO="${4}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +# Source shared functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/pod-verification-common.sh" + +echo "================================" +echo "Post-Test Pod Verification" +echo "================================" +echo "Verifying pods maintained correct images and are still healthy..." +echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +# Perform instant health check on all pods +echo "Performing instant health check on all pods..." +echo "" + +declare -a issues +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get pod details + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + echo "Pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + + # Check for any issues + has_issue=false + + if [[ "$current_image" != "$expected_image" ]]; then + echo " ✗ IMAGE MISMATCH!" + issues+=("$pod_name: expected image '$expected_image' but found '$current_image'") + has_issue=true + fi + + if [[ "$pod_status" != "Running" ]]; then + echo " ✗ POD NOT RUNNING!" + issues+=("$pod_name: pod status is '$pod_status' (expected 'Running')") + has_issue=true + fi + + if [[ "$container_ready" != "true" ]]; then + echo " ✗ CONTAINER NOT READY!" + issues+=("$pod_name: container '$container_name' is not ready") + has_issue=true + fi + + if [[ "$has_issue" = false ]]; then + echo " ✓ All checks passed" + fi + echo "" +done + +# Report results +echo "================================" +echo "Post-Test Verification Summary" +echo "================================" + +if [ ${#issues[@]} -eq 0 ]; then + echo "✓ SUCCESS: All pods maintained the correct images and are healthy!" 
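The failure path below collects one human-readable entry per problem and defers reporting to the summary. The accumulation pattern in isolation, with invented entries:

#!/bin/bash
# Standalone sketch of the issues-array pattern used by this script.
declare -a issues=()
issues+=("ama-logs-k4n2x: expected image 'repo:new' but found 'repo:old'")
issues+=("ama-logs-windows-h7k2p: container 'ama-logs-windows' is not ready")

if [ ${#issues[@]} -eq 0 ]; then
    echo "✓ no issues"
else
    # printf repeats its format string once per remaining argument,
    # so one call prints the whole list.
    printf ' - %s\n' "${issues[@]}"
fi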
+ echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 0 +else + echo "✗ FAILURE: Some pods have issues after test execution!" + echo "" + echo "Issues detected:" + printf ' - %s\n' "${issues[@]}" + echo "" + echo "This indicates the pods may have been restarted or updated during testing." + echo "This could cause test instability or false results." + echo "" + echo "Current pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Detailed pod information:" + for issue in "${issues[@]}"; do + pod=$(echo "$issue" | cut -d: -f1) + echo "" + echo "--- Details for $pod ---" + kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30 + done + exit 1 +fi diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh new file mode 100644 index 0000000000..89a26158c7 --- /dev/null +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# Pre-Test Pod Verification +# Waits for all ama-logs pods to be running with the correct images and ready +# This script is used BEFORE running E2E tests to ensure the new agent version is deployed + +set -e + +# Parse command line arguments +LINUX_IMAGE_TAG="${1}" +WINDOWS_IMAGE_TAG="${2}" +LINUX_MCR_REPO="${3}" +WINDOWS_MCR_REPO="${4}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +# Configuration +MAX_RETRIES=15 +CHECK_INTERVAL=60 # seconds +MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60)) + +# Source shared functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/util.sh" + +echo "================================" +echo "Pre-Test Pod Verification" +echo "================================" +echo "Waiting for pods to be running with new images and ready..." 
+echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +# Wait for all pods to be ready +echo "================================" +echo "Waiting for all pods to be ready" +echo "================================" +echo "Total pods to check: ${#pod_configs[@]}" +echo "Maximum retries: $MAX_RETRIES" +echo "Check interval: ${CHECK_INTERVAL}s" +echo "Maximum wait time: $MAX_WAIT_MINUTES minutes" +echo "" + +# Track ready status for each pod +declare -A pod_ready_status +for config in "${pod_configs[@]}"; do + pod_name=$(echo "$config" | cut -d'|' -f1) + pod_ready_status["$pod_name"]=false +done + +attempt=1 +while [ $attempt -le $MAX_RETRIES ]; do + has_not_ready_pod=false + ready_count=0 + total_count=${#pod_configs[@]} + + # Check each pod + for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Skip if already marked as ready + if [ "${pod_ready_status[$pod_name]}" = "true" ]; then + ((ready_count++)) + continue + fi + + # Get pod details + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + # Check if pod is ready + if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then + pod_ready_status["$pod_name"]=true + ((ready_count++)) + echo " ✓ $pod_name - Ready" + else + has_not_ready_pod=true + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + if [[ "$current_image" != "$expected_image" ]]; then + echo " Image mismatch: expected $expected_image, got $current_image" + fi + fi + done + + # Show progress summary + elapsed_seconds=$(((attempt - 1) * CHECK_INTERVAL)) + minutes_elapsed=$((elapsed_seconds / 60)) + seconds_elapsed=$((elapsed_seconds % 60)) + remaining_retries=$((MAX_RETRIES - attempt)) + remaining_seconds=$((remaining_retries * CHECK_INTERVAL)) + minutes_remaining=$((remaining_seconds / 60)) + seconds_remaining=$((remaining_seconds % 60)) + + echo "" + echo "Attempt $attempt/$MAX_RETRIES (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" + echo "Progress: $ready_count/$total_count pods ready" + echo "" + + # Exit early if all pods are ready + if [ "$has_not_ready_pod" = false ]; then + echo "================================" + echo "✓ SUCCESS: All pods are ready!" 
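The early exit above works because readiness is remembered across attempts: pod_ready_status is an associative array (bash 4+) keyed by pod name, so a pod that passed once is never re-queried. The same mechanic reduced to a runnable toy, where ready_after stands in for the real kubectl checks and all names are invented:

#!/bin/bash
# Toy version of the retry loop's ready-tracking.
declare -a pods=(pod-a pod-b pod-c)
declare -A ready_after=([pod-a]=1 [pod-b]=2 [pod-c]=3)  # attempt when each becomes ready
declare -A is_ready

for attempt in 1 2 3; do
  pending=0
  for p in "${pods[@]}"; do
    [ "${is_ready[$p]:-}" = "true" ] && continue        # skip pods already done
    if [ "$attempt" -ge "${ready_after[$p]}" ]; then
      is_ready[$p]=true
    else
      pending=$((pending + 1))
    fi
  done
  echo "attempt $attempt: $(( ${#pods[@]} - pending ))/${#pods[@]} ready"
  [ "$pending" -eq 0 ] && break
done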
+ echo "================================" + echo "Total attempts: $attempt" + echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" + echo "" + + # Capture container start times using local function + capture_container_start_times + + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 0 + fi + + # Sleep before next retry (except after last attempt) + if [ $attempt -lt $MAX_RETRIES ]; then + sleep $CHECK_INTERVAL + fi + + ((attempt++)) +done + +# Max retries reached - report failed pods +echo "================================" +echo "✗ TIMEOUT: Not all pods became ready after $MAX_RETRIES attempts" +echo "================================" +echo "" +echo "Failed pods:" +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + if [ "${pod_ready_status[$pod_name]}" != "true" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + echo " ✗ $pod_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + fi +done +echo "" +echo "Final pod status:" +kubectl get pods -n kube-system | grep ama-logs +exit 1 + + + + +# Function to capture container start times +capture_container_start_times() { + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
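The "latest" selection that follows leans on a property of the timestamps kubectl returns: RFC 3339 UTC strings have fixed field widths, so lexicographic order equals chronological order and a plain string comparison suffices. A self-contained check (sample values invented):

#!/bin/bash
# String max over ISO 8601 UTC timestamps.
times=("2025-12-05T18:02:11Z" "2025-12-05T17:59:40Z" "2025-12-05T18:14:03Z")
latest=""
for t in "${times[@]}"; do
  if [ -z "$latest" ] || [[ "$t" > "$latest" ]]; then
    latest="$t"
  fi
done
echo "$latest"   # -> 2025-12-05T18:14:03Z

Note this only holds while every timestamp shares the same zone and format; mixing offsets would require real date parsing.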
+ + local latest_start_time="" + + for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get container start time + local start_time + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -z "$start_time" ]; then + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") + fi + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + exit 1 + fi +} diff --git a/.pipelines/e2e-test/util.sh b/.pipelines/e2e-test/util.sh new file mode 100644 index 0000000000..b157451c54 --- /dev/null +++ b/.pipelines/e2e-test/util.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Shared functions for pod verification scripts +# This file should be sourced by pre-test and post-test verification scripts + +# Function to build pod configurations +# Parameters: +# $1 - LINUX_IMAGE (full image path with tag) +# $2 - WINDOWS_IMAGE (full image path with tag) +# Returns: +# pod_configs array populated with "pod_name|expected_image|container_name" +build_pod_configs() { + local LINUX_IMAGE="$1" + local WINDOWS_IMAGE="$2" + + echo "Getting list of ama-logs pods..." 
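Each entry build_pod_configs emits packs three fields into one pipe-delimited string, and every consumer unpacks it with a per-command IFS. The round trip in miniature (values are placeholders):

#!/bin/bash
config="ama-logs-k4n2x|mcr.example.com/ama-logs:3.1.29|ama-logs"

# Setting IFS only on the read keeps the split local to this one command.
IFS='|' read -r pod_name expected_image container_name <<< "$config"
echo "pod=$pod_name container=$container_name image=$expected_image"

The separator just has to be a character that can never occur in the fields; '|' is safe because Kubernetes names and image references cannot contain it.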
+ local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + + # Clear the global pod_configs array + pod_configs=() + + for pod_name in $pod_list; do + local expected_image + local container_name + + # Determine expected image and container name based on pod type + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + expected_image="$WINDOWS_IMAGE" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + expected_image="$LINUX_IMAGE" + container_name="ama-logs" + else + echo "✗ ERROR: Unknown pod pattern: $pod_name" + echo "Expected pod names to match one of:" + echo " - ama-logs-windows-* (Windows pods)" + echo " - ama-logs-rs-* (Linux ReplicaSet pods)" + echo " - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)" + exit 1 + fi + + pod_configs+=("$pod_name|$expected_image|$container_name") + done + + echo "Found ${#pod_configs[@]} pods to verify" + echo "" +} diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index 39165ef700..f82e1a2e3c 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -11,6 +11,14 @@ WINDOWS_IMAGE_TAG="${3}" LINUX_MCR_REPO="${4}" WINDOWS_MCR_REPO="${5}" +# Validate MODE parameter +if [[ "$MODE" != "pre-test" && "$MODE" != "post-test" ]]; then + echo "Error: Invalid mode '$MODE'" + echo "MODE must be either 'pre-test' or 'post-test'" + echo "Usage: $0 " + exit 1 +fi + if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then echo "Error: Missing required parameters" echo "Usage: $0 " @@ -241,8 +249,12 @@ for pod_name in $pod_list; do expected_image="$LINUX_IMAGE" container_name="ama-logs" else - echo "⚠ Unknown pod pattern: $pod_name - skipping verification" - continue + echo "✗ ERROR: Unknown pod pattern: $pod_name" + echo "Expected pod names to match one of:" + echo " - ama-logs-windows-* (Windows pods)" + echo " - ama-logs-rs-* (Linux ReplicaSet pods)" + echo " - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)" + exit 1 fi # Add to configurations for parallel checking @@ -257,7 +269,6 @@ echo "" if [ "$MODE" = "pre-test" ]; then # Pre-test: Wait for all pods to be ready (15 retries × 60s = 15 minutes max) if ! check_all_pods pod_configs 15; then - # Function already reports which pods failed failed_pods=true else failed_pods=false @@ -284,60 +295,7 @@ if [ "$MODE" = "pre-test" ]; then echo "Final pod status:" kubectl get pods -n kube-system | grep ama-logs echo "" - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." 
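The pod-name classification util.sh applies above is worth seeing in isolation: the unanchored `^ama-logs-windows` matches any Windows pod prefix, while Linux pods are either ReplicaSet pods (`ama-logs-rs-*`) or DaemonSet pods, whose generated suffix is exactly five lowercase alphanumerics. A sketch with invented pod names:

#!/bin/bash
classify() {
  if [[ "$1" =~ ^ama-logs-windows ]]; then
    echo windows
  elif [[ "$1" =~ ^ama-logs-rs ]] || [[ "$1" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then
    echo linux
  else
    echo unknown
  fi
}
classify ama-logs-windows-h7k2p        # -> windows
classify ama-logs-rs-6d9f8b7c4-x2v1q   # -> linux (ReplicaSet pod)
classify ama-logs-k4n2x                # -> linux (DaemonSet pod)
classify coredns-76f75df574-abcde      # -> unknown (the real helper treats this as fatal)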
- - # Get all container start times and find the LATEST one - latest_start_time="" - - pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') - for pod_name in $pod_list; do - # Get container name based on pod type - if [[ "$pod_name" =~ ^ama-logs-windows ]]; then - container_name="ama-logs-windows" - elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then - container_name="ama-logs" - else - continue - fi - - # Get container start time - try first container if filter doesn't work - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -z "$start_time" ]; then - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") - fi - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - echo "This is required for Log Analytics query filtering" - exit 1 - fi + capture_container_start_times exit 0 else @@ -377,3 +335,63 @@ else exit 1 fi fi + +# Function to capture container start times for Log Analytics query filtering +capture_container_start_times() { + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
+ + # Get all container start times and find the LATEST one + local latest_start_time="" + + local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + for pod_name in $pod_list; do + # Get container name based on pod type + local container_name + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + container_name="ama-logs" + else + continue + fi + + # Get container start time - try first container if filter doesn't work + local start_time + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -z "$start_time" ]; then + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") + fi + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + echo "This is required for Log Analytics query filtering" + exit 1 + fi +} From c0d8d7293f34758c2374f3c21c4d33d12b200e04 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 10:47:00 -0800 Subject: [PATCH 11/40] move func --- .pipelines/e2e-test/pre-test-verify-pods.sh | 96 ++++++++++----------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 89a26158c7..48789ae056 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -29,6 +29,50 @@ MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60)) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/util.sh" +# Function to capture container start times +capture_container_start_times() { + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
+ + local latest_start_time="" + + for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get container start time for the specific container + local start_time + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + exit 1 + fi +} + echo "================================" echo "Pre-Test Pod Verification" echo "================================" @@ -163,55 +207,3 @@ echo "" echo "Final pod status:" kubectl get pods -n kube-system | grep ama-logs exit 1 - - - - -# Function to capture container start times -capture_container_start_times() { - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." 
- - local latest_start_time="" - - for config in "${pod_configs[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - - # Get container start time - local start_time - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -z "$start_time" ]; then - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") - fi - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - exit 1 - fi -} From ffb397fdca4f8903e6cba32b3fa01962f6eafa11 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 10:47:56 -0800 Subject: [PATCH 12/40] fix --- .pipelines/e2e-test/post-test-verify-pods.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh b/.pipelines/e2e-test/post-test-verify-pods.sh index 61170d4321..1f20cd3c56 100644 --- a/.pipelines/e2e-test/post-test-verify-pods.sh +++ b/.pipelines/e2e-test/post-test-verify-pods.sh @@ -22,7 +22,7 @@ WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" # Source shared functions SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/pod-verification-common.sh" +source "$SCRIPT_DIR/util.sh" echo "================================" echo "Post-Test Pod Verification" From 89950523d576f4e19d4a56b1c46ad13ca9576028 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:12:34 -0800 Subject: [PATCH 13/40] debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 48789ae056..28b7ae5496 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -91,6 +91,13 @@ echo "" declare -a pod_configs build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" +# Validate array was populated +if [ ${#pod_configs[@]} -eq 0 ]; then + echo "✗ ERROR: No pods found to verify!" + echo "This likely means no ama-logs pods exist in the kube-system namespace." 
+ exit 1 +fi + # Wait for all pods to be ready echo "================================" echo "Waiting for all pods to be ready" @@ -105,9 +112,11 @@ echo "" declare -A pod_ready_status for config in "${pod_configs[@]}"; do pod_name=$(echo "$config" | cut -d'|' -f1) + echo "DEBUG: Initializing pod $pod_name to not ready" pod_ready_status["$pod_name"]=false done +echo "DEBUG: All pods initialized, starting retry loop" attempt=1 while [ $attempt -le $MAX_RETRIES ]; do has_not_ready_pod=false From 71e9d642fb8a22283f300362ec481541e95775d0 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:23:05 -0800 Subject: [PATCH 14/40] add debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 28b7ae5496..d243cf9128 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -41,11 +41,15 @@ capture_container_start_times() { for config in "${pod_configs[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" + echo "DEBUG: Querying start time for pod $pod_name, container $container_name" + # Get container start time for the specific container local start_time start_time=$(kubectl get pod "$pod_name" -n kube-system \ -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + echo "DEBUG: Got start_time='$start_time'" + if [ -n "$start_time" ]; then echo " Pod $pod_name container started at: $start_time" @@ -118,7 +122,10 @@ done echo "DEBUG: All pods initialized, starting retry loop" attempt=1 +echo "DEBUG: attempt=$attempt, MAX_RETRIES=$MAX_RETRIES" +echo "DEBUG: Condition check: [ $attempt -le $MAX_RETRIES ] = $([ $attempt -le $MAX_RETRIES ] && echo true || echo false)" while [ $attempt -le $MAX_RETRIES ]; do + echo "DEBUG: Inside while loop, attempt=$attempt" has_not_ready_pod=false ready_count=0 total_count=${#pod_configs[@]} From 8b4c392bafbc7a0b871a7ba3af3246402f196b9c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:33:00 -0800 Subject: [PATCH 15/40] add debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index d243cf9128..a15f79194d 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -129,10 +129,13 @@ while [ $attempt -le $MAX_RETRIES ]; do has_not_ready_pod=false ready_count=0 total_count=${#pod_configs[@]} + echo "DEBUG: Initialized loop variables, checking $total_count pods" # Check each pod for config in "${pod_configs[@]}"; do + echo "DEBUG: Processing config: $config" IFS='|' read -r pod_name expected_image container_name <<< "$config" + echo "DEBUG: Parsed - pod=$pod_name, container=$container_name" # Skip if already marked as ready if [ "${pod_ready_status[$pod_name]}" = "true" ]; then From 231ff81db14932462511c8d8a6b28254126b3570 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:52:08 -0800 Subject: [PATCH 16/40] more debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index a15f79194d..504f36cb4b 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ 
b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -126,8 +126,11 @@ echo "DEBUG: attempt=$attempt, MAX_RETRIES=$MAX_RETRIES" echo "DEBUG: Condition check: [ $attempt -le $MAX_RETRIES ] = $([ $attempt -le $MAX_RETRIES ] && echo true || echo false)" while [ $attempt -le $MAX_RETRIES ]; do echo "DEBUG: Inside while loop, attempt=$attempt" + echo "DEBUG: About to set has_not_ready_pod" has_not_ready_pod=false + echo "DEBUG: About to set ready_count" ready_count=0 + echo "DEBUG: About to set total_count" total_count=${#pod_configs[@]} echo "DEBUG: Initialized loop variables, checking $total_count pods" From 20c9267c1dc09c894f3605ccc8d7542fb6efab62 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 12:01:06 -0800 Subject: [PATCH 17/40] more debugging --- .pipelines/e2e-test/pre-test-verify-pods.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 504f36cb4b..ecdf04843a 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -141,15 +141,20 @@ while [ $attempt -le $MAX_RETRIES ]; do echo "DEBUG: Parsed - pod=$pod_name, container=$container_name" # Skip if already marked as ready + echo "DEBUG: Checking if $pod_name already marked ready" if [ "${pod_ready_status[$pod_name]}" = "true" ]; then ((ready_count++)) continue fi + echo "DEBUG: Getting pod details for $pod_name" # Get pod details current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") + echo "DEBUG: current_image='$current_image'" pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + echo "DEBUG: pod_status='$pod_status'" container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + echo "DEBUG: container_ready='$container_ready'" # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then From 623099fb3f2c6bf87723baf86d54d4989b767616 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 12:48:31 -0800 Subject: [PATCH 18/40] more debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index ecdf04843a..2fcea46e7d 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -143,7 +143,7 @@ while [ $attempt -le $MAX_RETRIES ]; do # Skip if already marked as ready echo "DEBUG: Checking if $pod_name already marked ready" if [ "${pod_ready_status[$pod_name]}" = "true" ]; then - ((ready_count++)) + ready_count=$((ready_count + 1)) continue fi @@ -158,8 +158,9 @@ while [ $attempt -le $MAX_RETRIES ]; do # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then + echo "DEBUG: Marking $pod_name as ready" pod_ready_status["$pod_name"]=true - ((ready_count++)) + ready_count=$((ready_count + 1)) echo " ✓ $pod_name - Ready" else has_not_ready_pod=true From b397fc0c6ef154dcf3e4dd892ed7707ee6201ac5 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 13:23:39 -0800 Subject: [PATCH 19/40] refactor.. 
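The substantive fix buried in the previous patch is the switch from ((ready_count++)) to ready_count=$((ready_count + 1)), a classic set -e trap that fits the symptom the DEBUG lines were chasing: post-increment evaluates to the old value, and an arithmetic command whose value is 0 exits with status 1, so the very first ((ready_count++)) with ready_count=0 silently aborts the script. A minimal reproduction:

#!/bin/bash
set -e
count=0
# ((count++))        # would abort here: expression value 0 => exit status 1
count=$((count + 1)) # safe: an assignment always exits 0
((++count)) || true  # also safe: pre-increment yields 1; '|| true' masks a 0 value
echo "count=$count"  # -> count=2

With the increments rewritten as assignments, the retry loop survives its first not-yet-ready pass.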
--- .pipelines/e2e-test/post-test-verify-pods.sh | 4 ++-- .pipelines/e2e-test/pre-test-verify-pods.sh | 25 ++++---------------- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh b/.pipelines/e2e-test/post-test-verify-pods.sh index 1f20cd3c56..d3e5cf04a4 100644 --- a/.pipelines/e2e-test/post-test-verify-pods.sh +++ b/.pipelines/e2e-test/post-test-verify-pods.sh @@ -55,7 +55,7 @@ for config in "${pod_configs[@]}"; do pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - echo "Pod: $pod_name" + echo "Check pod: $pod_name" echo " Container: $container_name" echo " Expected image: $expected_image" echo " Current image: $current_image" @@ -84,7 +84,7 @@ for config in "${pod_configs[@]}"; do fi if [[ "$has_issue" = false ]]; then - echo " ✓ All checks passed" + echo " ✓ Pod: $pod_name passed checks" fi echo "" done diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 2fcea46e7d..816a50a498 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -41,15 +41,11 @@ capture_container_start_times() { for config in "${pod_configs[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - echo "DEBUG: Querying start time for pod $pod_name, container $container_name" - # Get container start time for the specific container local start_time start_time=$(kubectl get pod "$pod_name" -n kube-system \ -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - echo "DEBUG: Got start_time='$start_time'" - if [ -n "$start_time" ]; then echo " Pod $pod_name container started at: $start_time" @@ -116,49 +112,36 @@ echo "" declare -A pod_ready_status for config in "${pod_configs[@]}"; do pod_name=$(echo "$config" | cut -d'|' -f1) - echo "DEBUG: Initializing pod $pod_name to not ready" pod_ready_status["$pod_name"]=false done -echo "DEBUG: All pods initialized, starting retry loop" attempt=1 -echo "DEBUG: attempt=$attempt, MAX_RETRIES=$MAX_RETRIES" -echo "DEBUG: Condition check: [ $attempt -le $MAX_RETRIES ] = $([ $attempt -le $MAX_RETRIES ] && echo true || echo false)" while [ $attempt -le $MAX_RETRIES ]; do - echo "DEBUG: Inside while loop, attempt=$attempt" - echo "DEBUG: About to set has_not_ready_pod" has_not_ready_pod=false - echo "DEBUG: About to set ready_count" ready_count=0 - echo "DEBUG: About to set total_count" total_count=${#pod_configs[@]} - echo "DEBUG: Initialized loop variables, checking $total_count pods" # Check each pod for config in "${pod_configs[@]}"; do - echo "DEBUG: Processing config: $config" IFS='|' read -r pod_name expected_image container_name <<< "$config" - echo "DEBUG: Parsed - pod=$pod_name, container=$container_name" + echo " Checking pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" # Skip if already marked as ready - echo "DEBUG: Checking if $pod_name already marked ready" if [ "${pod_ready_status[$pod_name]}" = "true" ]; then + echo " Pod: $pod_name has expected image ready. Skipping check." 
ready_count=$((ready_count + 1)) continue fi - echo "DEBUG: Getting pod details for $pod_name" # Get pod details current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") - echo "DEBUG: current_image='$current_image'" pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - echo "DEBUG: pod_status='$pod_status'" container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - echo "DEBUG: container_ready='$container_ready'" # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then - echo "DEBUG: Marking $pod_name as ready" pod_ready_status["$pod_name"]=true ready_count=$((ready_count + 1)) echo " ✓ $pod_name - Ready" From 6f0f06374be9bb20bcaef77958d99d4338aa82e4 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 13:43:02 -0800 Subject: [PATCH 20/40] add comments --- .pipelines/e2e-test/pre-test-verify-pods.sh | 23 +++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 816a50a498..14d8caa7ec 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -124,13 +124,17 @@ while [ $attempt -le $MAX_RETRIES ]; do # Check each pod for config in "${pod_configs[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - echo " Checking pod: $pod_name" + echo "" + echo "" + echo " Start checking pod: $pod_name" echo " Container: $container_name" echo " Expected image: $expected_image" - + # Skip if already marked as ready if [ "${pod_ready_status[$pod_name]}" = "true" ]; then - echo " Pod: $pod_name has expected image ready. Skipping check." + echo " Finished checking pod: $pod_name" + echo " Pod: $pod_name has expected image ready. Skipping check." 
+ echo " ✓ $pod_name - Ready" ready_count=$((ready_count + 1)) continue fi @@ -144,13 +148,20 @@ while [ $attempt -le $MAX_RETRIES ]; do if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then pod_ready_status["$pod_name"]=true ready_count=$((ready_count + 1)) - echo " ✓ $pod_name - Ready" + echo " Finished checking pod: $pod_name" + echo " Image: $current_image" + echo " Expected image: $expected_image" + echo " Status: $pod_status" + echo " Container ready: $container_ready" + echo " ✓ $pod_name - Ready" else has_not_ready_pod=true - echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + echo " Finished checking pod: $pod_name" + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" if [[ "$current_image" != "$expected_image" ]]; then - echo " Image mismatch: expected $expected_image, got $current_image" + echo " Image mismatch: expected $expected_image, got $current_image" fi + echo " x $pod_name - NOT Ready" fi done From f2f8839152d9be53db1011ecca539d89424bd70a Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 14:19:49 -0800 Subject: [PATCH 21/40] move container start time capture --- ...eploy-and-test-ci-image-in-aks-cluster.yml | 31 +- .../e2e-test/capture-container-start-time.sh | 101 +++++ ...pods.sh => verify-ci-images-after-test.sh} | 0 ...ods.sh => verify-ci-images-before-test.sh} | 49 --- .pipelines/e2e-test/verify-pod-images.sh | 397 ------------------ 5 files changed, 123 insertions(+), 455 deletions(-) create mode 100644 .pipelines/e2e-test/capture-container-start-time.sh rename .pipelines/e2e-test/{post-test-verify-pods.sh => verify-ci-images-after-test.sh} (100%) rename .pipelines/e2e-test/{pre-test-verify-pods.sh => verify-ci-images-before-test.sh} (78%) delete mode 100644 .pipelines/e2e-test/verify-pod-images.sh diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index ce9ae339f4..db1c9a2a73 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -162,9 +162,7 @@ jobs: kubectl get pods -n kube-system | grep ama-logs # Pre-test verification: Wait for pods to be ready with new images - # Outputs container start time for Log Analytics query filtering - task: Bash@3 - name: VerifyPods displayName: 'Pre-Test: Wait for pods to be ready with new images' env: LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} @@ -174,8 +172,23 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-before-test.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-before-test.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Capture container start times for Log Analytics query filtering + - task: Bash@3 + name: CaptureStartTime + displayName: 'Capture container start times for Log Analytics filtering' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: 
$(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/capture-container-start-time.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/capture-container-start-time.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Export container start time for use in tests if [ -f /tmp/container-deployment-time.env ]; then @@ -197,7 +210,7 @@ jobs: echo "Waiting for Log Analytics Ingestion" echo "========================================" echo "Cluster: ${{ parameters.clusterName }}" - echo "Container start time: $(VerifyPods.CONTAINER_START_TIME)" + echo "Container start time: $(CaptureStartTime.CONTAINER_START_TIME)" echo "" echo "Waiting 20 minutes to allow logs to be ingested..." echo "This ensures queries will find logs from the newly deployed containers." @@ -218,12 +231,12 @@ jobs: echo "" echo "✓ Wait complete! Logs should now be available in Log Analytics." - echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(VerifyPods.CONTAINER_START_TIME)')" + echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(CaptureStartTime.CONTAINER_START_TIME)')" echo "========================================" # TODO (improvement): container start time is captured in previous step, but not used for now. Consider passing container start time to test script to use in log queries - bash: | # Pass container start time to tests - export CONTAINER_START_TIME="$(VerifyPods.CONTAINER_START_TIME)" + export CONTAINER_START_TIME="$(CaptureStartTime.CONTAINER_START_TIME)" echo "Running tests for cluster: ${{ parameters.clusterName }}" echo "Container start time: $CONTAINER_START_TIME" @@ -248,8 +261,8 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-after-test.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-after-test.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Log deployment completion - bash: | diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh new file mode 100644 index 0000000000..733bb77962 --- /dev/null +++ b/.pipelines/e2e-test/capture-container-start-time.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Capture Container Start Times +# Captures the LATEST container start time across all ama-logs pods +# This is used to filter Log Analytics queries to only show logs from the newly deployed containers + +set -e + +# Parse command line arguments +LINUX_IMAGE_TAG="${1}" +WINDOWS_IMAGE_TAG="${2}" +LINUX_MCR_REPO="${3}" +WINDOWS_MCR_REPO="${4}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +# Source shared functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/util.sh" + +echo "================================" +echo "Container Start Time Capture" +echo "================================" +echo "Capturing LATEST container 
start time for Log Analytics queries..." +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +if [ ${#pod_configs[@]} -eq 0 ]; then + echo "✗ ERROR: No pods found!" + exit 1 +fi + +latest_start_time="" + +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get container start time for the specific container + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -n "$start_time" ]; then + echo " Pod $pod_name (container: $container_name) started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi +done + +if [ -n "$latest_start_time" ]; then + # Validate that start time is recent (within last 30 minutes) + # This ensures we captured the newly deployed containers, not old ones + current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + current_epoch=$(date -u -d "$current_time" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$current_time" +%s 2>/dev/null) + start_epoch=$(date -u -d "$latest_start_time" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$latest_start_time" +%s 2>/dev/null) + time_diff=$((current_epoch - start_epoch)) + time_diff_minutes=$((time_diff / 60)) + + echo "" + echo "Time validation:" + echo " Current UTC time: $current_time" + echo " Latest start time: $latest_start_time" + echo " Time difference: $time_diff_minutes minutes ago" + + if [ $time_diff_minutes -gt 30 ]; then + echo "" + echo "⚠ WARNING: Container start time is $time_diff_minutes minutes old!" + echo "This suggests the containers may not have been restarted with the new images." + echo "Expected: Within ~2-5 minutes (time for pods to restart after patching)" + echo "Consider investigating if the image patch actually triggered pod restarts." 
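The epoch conversion in this staleness check is written twice on purpose: GNU date parses arbitrary dates with -d, while BSD/macOS date needs -j -f plus an explicit format, so each call tries the GNU form and falls back. The same dance in isolation (timestamp invented):

#!/bin/bash
iso="2025-12-05T18:02:11Z"
# GNU form first, BSD form as fallback; whichever succeeds wins.
epoch=$(date -u -d "$iso" +%s 2>/dev/null \
  || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$iso" +%s 2>/dev/null)
now=$(date -u +%s)
echo "container started $(( (now - epoch) / 60 )) minutes ago"

On a GNU/Linux agent only the first branch ever runs; the fallback keeps the script usable on macOS, where date is the BSD variant.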
+ else + echo " ✓ Start time is recent (within expected range)" + fi + + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + echo "" + exit 0 +else + echo "✗ ERROR: Could not determine container start times" + exit 1 +fi diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh b/.pipelines/e2e-test/verify-ci-images-after-test.sh similarity index 100% rename from .pipelines/e2e-test/post-test-verify-pods.sh rename to .pipelines/e2e-test/verify-ci-images-after-test.sh diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/verify-ci-images-before-test.sh similarity index 78% rename from .pipelines/e2e-test/pre-test-verify-pods.sh rename to .pipelines/e2e-test/verify-ci-images-before-test.sh index 14d8caa7ec..d0f4a4f25b 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/verify-ci-images-before-test.sh @@ -29,50 +29,6 @@ MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60)) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/util.sh" -# Function to capture container start times -capture_container_start_times() { - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." - - local latest_start_time="" - - for config in "${pod_configs[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - - # Get container start time for the specific container - local start_time - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - exit 1 - fi -} - echo "================================" echo "Pre-Test Pod Verification" echo "================================" @@ -186,11 +142,6 @@ while [ $attempt -le $MAX_RETRIES ]; do echo "================================" echo "Total attempts: $attempt" echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" - echo "" - - # Capture container start times using local function - capture_container_start_times - echo "" echo "Final pod status:" kubectl get pods -n kube-system | grep ama-logs diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh deleted file mode 100644 index 
f82e1a2e3c..0000000000
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ /dev/null
@@ -1,397 +0,0 @@
-#!/bin/bash
-# Script to verify AKS pod images match expected tags
-# Can be used for both pre-test and post-test verification
-
-set -e
-
-# Parse command line arguments
-MODE="${1:-pre-test}" # pre-test or post-test
-LINUX_IMAGE_TAG="${2}"
-WINDOWS_IMAGE_TAG="${3}"
-LINUX_MCR_REPO="${4}"
-WINDOWS_MCR_REPO="${5}"
-
-# Validate MODE parameter
-if [[ "$MODE" != "pre-test" && "$MODE" != "post-test" ]]; then
-  echo "Error: Invalid mode '$MODE'"
-  echo "MODE must be either 'pre-test' or 'post-test'"
-  echo "Usage: $0 <mode> <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
-  exit 1
-fi
-
-if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then
-  echo "Error: Missing required parameters"
-  echo "Usage: $0 <mode> <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
-  exit 1
-fi
-
-LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG"
-WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG"
-
-if [ "$MODE" = "pre-test" ]; then
-  echo "================================"
-  echo "Pre-Test Image Verification"
-  echo "================================"
-  echo "Verifying pods are running with new images and are ready..."
-else
-  echo "================================"
-  echo "Post-Test Image Verification"
-  echo "================================"
-  echo "Verifying pods still have the correct images after test execution..."
-fi
-
-echo ""
-echo "Repository Configuration:"
-echo " Linux MCR repo: $LINUX_MCR_REPO"
-echo " Windows MCR repo: $WINDOWS_MCR_REPO"
-echo ""
-echo "Expected Images:"
-echo " Linux image: $LINUX_IMAGE"
-echo " Windows image: $WINDOWS_IMAGE"
-echo ""
-
-# Unified function to check all pods (with optional retry attempts)
-# max_retries of 0 means instant check (no wait), otherwise retries up to max_retries times
-check_all_pods() {
-  local -n configs_ref=$1 # Use different name to avoid circular reference
-  local max_retries=${2:-0} # Default to 0 (instant check, no retry)
-  local check_interval=60 # Wait 60 seconds between retries
-
-  if [ $max_retries -gt 0 ]; then
-    # Wait mode (pre-test): Monitor pods with retries
-    local attempt=1
-
-    echo "================================"
-    echo "Waiting for all pods to be ready"
-    echo "================================"
-    echo "Total pods to check: ${#configs_ref[@]}"
-    echo "Maximum retries: $max_retries"
-    echo "Check interval: ${check_interval}s"
-    echo "Maximum wait time: $(((max_retries * check_interval) / 60)) minutes"
-    echo ""
-
-    # Track ready status for each pod
-    declare -A pod_ready_status
-    for config in "${configs_ref[@]}"; do
-      pod_name=$(echo "$config" | cut -d'|' -f1)
-      pod_ready_status["$pod_name"]=false
-    done
-
-    while [ $attempt -le $max_retries ]; do
-      local has_not_ready_pod=false
-      local ready_count=0
-      local total_count=${#configs_ref[@]}
-
-      # Check each pod in this iteration
-      for config in "${configs_ref[@]}"; do
-        echo " Raw config string: '$config'"
-        IFS='|' read -r pod_name expected_image container_name <<< "$config"
-        echo " Parsed values:"
-        echo "   pod_name='$pod_name'"
-        echo "   expected_image='$expected_image'"
-        echo "   container_name='$container_name'"
-
-        # Skip if already marked as ready
-        if [ "${pod_ready_status[$pod_name]}" = "true" ]; then
-          ((ready_count++))
-          continue
-        fi
-
-        current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "")
-        pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown")
-
container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - - # Check if pod is ready - if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then - pod_ready_status["$pod_name"]=true - ((ready_count++)) - echo " ✓ $pod_name - Ready" - else - has_not_ready_pod=true - echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" - if [[ "$current_image" != "$expected_image" ]]; then - echo " Image mismatch: expected $expected_image, got $current_image" - fi - fi - done - - # Show progress summary - local elapsed_seconds=$(((attempt - 1) * check_interval)) - local minutes_elapsed=$((elapsed_seconds / 60)) - local seconds_elapsed=$((elapsed_seconds % 60)) - local remaining_retries=$((max_retries - attempt)) - local remaining_seconds=$((remaining_retries * check_interval)) - local minutes_remaining=$((remaining_seconds / 60)) - local seconds_remaining=$((remaining_seconds % 60)) - - echo "" - echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" - echo "Progress: $ready_count/$total_count pods ready" - echo "" - - # Exit early if all pods are ready - if [ "$has_not_ready_pod" = false ]; then - echo "================================" - echo "✓ SUCCESS: All pods are ready!" - echo "================================" - echo "Total attempts: $attempt" - echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" - echo "" - return 0 - fi - - # Don't sleep after the last attempt - if [ $attempt -lt $max_retries ]; then - sleep $check_interval - fi - - ((attempt++)) - done - - # Max retries reached - report which pods failed - echo "================================" - echo "✗ MAX RETRIES REACHED: Not all pods became ready after $max_retries attempts" - echo "================================" - echo "" - echo "Failed pods:" - for config in "${configs_ref[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - if [ "${pod_ready_status[$pod_name]}" != "true" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") - pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - - echo " ✗ $pod_name" - echo " Expected image: $expected_image" - echo " Current image: $current_image" - echo " Pod status: $pod_status" - echo " Container ready: $container_ready" - fi - done - echo "" - - return 1 - else - # Instant check mode (post-test): Single check, no waiting - local mismatches=() - - echo "Performing instant verification of all pods..." 
- echo "" - - for config in "${configs_ref[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - - # Get pod details - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") - pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - - echo "Pod: $pod_name" - echo " Container: $container_name" - echo " Expected image: $expected_image" - echo " Current image: $current_image" - echo " Pod status: $pod_status" - echo " Container ready: $container_ready" - - # Check for any issues - local has_issue=false - - if [[ "$current_image" != "$expected_image" ]]; then - echo " ✗ IMAGE MISMATCH!" - mismatches+=("$pod_name: expected image '$expected_image' but found '$current_image'") - has_issue=true - fi - - if [[ "$pod_status" != "Running" ]]; then - echo " ✗ POD NOT RUNNING!" - mismatches+=("$pod_name: pod status is '$pod_status' (expected 'Running')") - has_issue=true - fi - - if [[ "$container_ready" != "true" ]]; then - echo " ✗ CONTAINER NOT READY!" - mismatches+=("$pod_name: container '$container_name' is not ready") - has_issue=true - fi - - if [[ "$has_issue" = false ]]; then - echo " ✓ All checks passed" - fi - echo "" - done - - # Return mismatches via global array (bash limitation workaround) - image_mismatches=("${mismatches[@]}") - - if [ ${#mismatches[@]} -eq 0 ]; then - return 0 - else - return 1 - fi - fi -} - -# Get all ama-logs pods -echo "Getting list of ama-logs pods..." -pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') - -# Build configurations for all pods -pod_configs=() -image_mismatches=() - -for pod_name in $pod_list; do - # Determine expected image based on pod type - if [[ "$pod_name" =~ ^ama-logs-windows ]]; then - expected_image="$WINDOWS_IMAGE" - container_name="ama-logs-windows" - elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then - # Matches both ReplicaSet pods (ama-logs-rs-*) and DaemonSet pods (ama-logs-xxxxx) - expected_image="$LINUX_IMAGE" - container_name="ama-logs" - else - echo "✗ ERROR: Unknown pod pattern: $pod_name" - echo "Expected pod names to match one of:" - echo " - ama-logs-windows-* (Windows pods)" - echo " - ama-logs-rs-* (Linux ReplicaSet pods)" - echo " - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)" - exit 1 - fi - - # Add to configurations for parallel checking - # Use | as delimiter since colons appear in image tags (e.g., ciprod:3.1.31) - pod_configs+=("$pod_name|$expected_image|$container_name") -done - -echo "Found ${#pod_configs[@]} pods to verify" -echo "" - -# Use different check based on mode -if [ "$MODE" = "pre-test" ]; then - # Pre-test: Wait for all pods to be ready (15 retries × 60s = 15 minutes max) - if ! 
check_all_pods pod_configs 15; then - failed_pods=true - else - failed_pods=false - fi -else - # Post-test: Instant check of all pods (no retry) - check_all_pods pod_configs 0 -fi - -echo "" -echo "================================" -if [ "$MODE" = "pre-test" ]; then - echo "Pre-Test Verification Summary" -else - echo "Post-Test Verification Summary" -fi -echo "================================" - -# Report results based on mode -if [ "$MODE" = "pre-test" ]; then - if [ "$failed_pods" = false ]; then - echo "✓ All pods are running with the correct images and are ready!" - echo "" - echo "Final pod status:" - kubectl get pods -n kube-system | grep ama-logs - echo "" - capture_container_start_times - - exit 0 - else - echo "✗ Pod verification failed (see details above)" - echo "" - echo "Final pod status:" - kubectl get pods -n kube-system | grep ama-logs - exit 1 - fi -else - # Post-test mode - if [ ${#image_mismatches[@]} -eq 0 ]; then - echo "✓ SUCCESS: All pods maintained the correct images throughout the test execution!" - echo "" - echo "Final pod status:" - kubectl get pods -n kube-system | grep ama-logs - exit 0 - else - echo "✗ FAILURE: Some pods changed images during test execution!" - echo "" - echo "Pods with image mismatches:" - printf ' - %s\n' "${image_mismatches[@]}" - echo "" - echo "This indicates the pods may have been restarted or updated during testing." - echo "This could cause test instability or false results." - echo "" - echo "Current pod status:" - kubectl get pods -n kube-system | grep ama-logs - echo "" - echo "Detailed pod information:" - for mismatch in "${image_mismatches[@]}"; do - pod=$(echo "$mismatch" | cut -d: -f1) - echo "" - echo "--- Details for $pod ---" - kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30 - done - exit 1 - fi -fi - -# Function to capture container start times for Log Analytics query filtering -capture_container_start_times() { - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." 
- - # Get all container start times and find the LATEST one - local latest_start_time="" - - local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') - for pod_name in $pod_list; do - # Get container name based on pod type - local container_name - if [[ "$pod_name" =~ ^ama-logs-windows ]]; then - container_name="ama-logs-windows" - elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then - container_name="ama-logs" - else - continue - fi - - # Get container start time - try first container if filter doesn't work - local start_time - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -z "$start_time" ]; then - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") - fi - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - echo "This is required for Log Analytics query filtering" - exit 1 - fi -} From 6d513d2f99b8ef25ec618074a1ab50bae850c933 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 14:48:18 -0800 Subject: [PATCH 22/40] add delay --- .pipelines/e2e-test/capture-container-start-time.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh index 733bb77962..b9e25b0d67 100644 --- a/.pipelines/e2e-test/capture-container-start-time.sh +++ b/.pipelines/e2e-test/capture-container-start-time.sh @@ -29,6 +29,10 @@ echo "Container Start Time Capture" echo "================================" echo "Capturing LATEST container start time for Log Analytics queries..." echo "" +echo "Waiting 60 seconds for Kubernetes API to update container status..." +sleep 60 +echo "Proceeding with container start time capture..." 
+ # Build pod configurations using shared function declare -a pod_configs From db6732ea654b9702ee64c7cb96f5515246045c20 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 15:08:06 -0800 Subject: [PATCH 23/40] wait time 15 mins --- .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index db1c9a2a73..a9f744da61 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -216,7 +216,7 @@ jobs: echo "This ensures queries will find logs from the newly deployed containers." echo "" - wait_time=60 #TODO: change back to 1200 (20 minutes) after testing + wait_time=900 #TODO: change back to 1200 (20 minutes) after testing interval=60 elapsed=0 From 93057f7caff41547c61331070a14433d1791d28b Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 15:40:14 -0800 Subject: [PATCH 24/40] minor fix --- .config/guardian/.gdnbaselines | 1 + .pipelines/azure_pipeline_mergedbranches.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines index eff01b8012..a7976d8fde 100644 --- a/.config/guardian/.gdnbaselines +++ b/.config/guardian/.gdnbaselines @@ -155,3 +155,4 @@ } } } + diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index db5c896550..9fa66fbe49 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -889,6 +889,7 @@ extends: FileDirPath: '$(Build.ArtifactStagingDirectory)' DisableRemediation: false AcceptableOutdatedSignatureInHours: 72 + - stage: Deploy_and_Test_Images_In_Dev_Clusters displayName: Deploy and Test Images in Dev Clusters lockBehavior: sequential From 01783abdee184b5a25bee0db8771a7828eb987ff Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:04:06 -0800 Subject: [PATCH 25/40] remove doc --- ...I-Agent-Auto-Deploy-Implementation-Plan.md | 417 ------------------ 1 file changed, 417 deletions(-) delete mode 100644 Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md diff --git a/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md b/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md deleted file mode 100644 index 98bbaf4319..0000000000 --- a/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md +++ /dev/null @@ -1,417 +0,0 @@ -# CI Agent Auto-Deploy Implementation Plan - -## Overview -This document outlines the implementation plan for enabling auto-deployment of CI Agent to a dev cluster on every PR merge to main branch, following the Prom Agent pattern. - -**Goal:** Automatically deploy freshly built CI agent images to a dev cluster after each successful build on main branch. - -**Pattern:** Based on Prom Agent's `azure-pipeline-build.yml` approach - sequential deployments using `helm upgrade --install`. 
- ---- - -## Key Findings - -### ✅ No Chart Modifications Needed -- **ServiceAccount**: Hardcoded `ama-logs` works fine for sequential deployments -- **Image Tags**: Can be overridden via `--set` flags at deployment time -- **Release Name**: Using same release name (`ama-logs-dev`) for all deployments allows Helm to upgrade in place - -### ✅ Prom Agent Pattern -- Uses `helm upgrade --install` with same release name every time -- Deploys to different clusters (not multiple releases per cluster) -- Each cluster has exactly ONE release -- No ServiceAccount conflicts with sequential deployments - ---- - -## Implementation Changes - -### 1. Pipeline Modification - -**File:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` - -**Add Deployment Stage** after existing build stages: - -```yaml -- stage: Deploy_Dev_Cluster - displayName: Deploy to Dev Cluster - dependsOn: - - BuildLinuxImages - - BuildWindowsImages - # Only deploy on main branch merges (not PRs) - condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - - jobs: - - deployment: Deploy_AKS_Chart - displayName: "Deploy: AKS dev cluster" - environment: CI-Agent-Dev # Create this environment in Azure DevOps - pool: - name: Azure-Pipelines-CI-Test-EO - - variables: - # Get image tags from build stages - linuxImageTag: $[ stageDependencies.BuildLinuxImages.Build.outputs['setImageTag.linuxTag'] ] - windowsImageTag: $[ stageDependencies.BuildWindowsImages.Build.outputs['setImageTag.windowsTag'] ] - - strategy: - runOnce: - deploy: - steps: - - checkout: self - - - task: HelmDeploy@0 - displayName: "Deploy to dev cluster" - inputs: - connectionType: 'Azure Resource Manager' - azureSubscription: 'ContainerInsights_Build_Subscription(9b96ebbd-c57a-42d1-bbe9-b69296e4c7fb)' - azureResourceGroup: 'YOUR-DEV-CLUSTER-RG' - kubernetesCluster: 'YOUR-DEV-CLUSTER-NAME' - useClusterAdmin: true - namespace: 'kube-system' - command: 'upgrade' - chartType: 'FilePath' - chartPath: '$(Build.SourcesDirectory)/charts/azuremonitor-containers/' - releaseName: 'ama-logs-dev' - overrideValues: | - amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev - amalogs.image.tag=$(linuxImageTag) - amalogs.image.tagWindows=$(windowsImageTag) - arguments: '--install --create-namespace' -``` - ---- - -### 2. Ensure Build Stages Export Image Tags - -**Verify in BuildLinuxImages stage:** - -```yaml -- stage: BuildLinuxImages - jobs: - - job: Build - steps: - # ... existing build steps ... - - # Add this step to export tag - - script: | - echo "##vso[task.setvariable variable=linuxTag;isOutput=true]$(IMAGE_TAG)" - name: setImageTag - displayName: Export Linux image tag -``` - -**Verify in BuildWindowsImages stage:** - -```yaml -- stage: BuildWindowsImages - jobs: - - job: Build - steps: - # ... existing build steps ... - - # Add this step to export tag - - script: | - echo "##vso[task.setvariable variable=windowsTag;isOutput=true]$(IMAGE_TAG)" - name: setImageTag - displayName: Export Windows image tag -``` - ---- - -### 3. 
Configuration Updates - -**Replace these placeholders with actual values:** - -| Placeholder | Description | Example Value | -|-------------|-------------|---------------| -| `YOUR-DEV-CLUSTER-RG` | Resource group containing dev cluster | `ci-dev-aks-rg` | -| `YOUR-DEV-CLUSTER-NAME` | Name of dev AKS cluster | `ci-dev-aks-eus` | - -**Optional: Add more overrides for dev-specific configuration:** - -```yaml -overrideValues: | - amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev - amalogs.image.tag=$(linuxImageTag) - amalogs.image.tagWindows=$(windowsImageTag) - amalogs.secret.wsid=YOUR-DEV-WORKSPACE-ID - amalogs.secret.key=YOUR-DEV-WORKSPACE-KEY - amalogs.env.clusterName=ci-dev-cluster - amalogs.ISTEST=true -``` - ---- - -### 4. Azure DevOps Environment Setup - -**Create deployment environment:** -1. Navigate to: Azure DevOps → Pipelines → Environments -2. Click "New environment" -3. Name: `CI-Agent-Dev` -4. Resource: None (environment-only) -5. (Optional) Add approval gates if needed - ---- - -## Chart Details - No Modifications Required - -### ServiceAccount Handling -- **Current:** Hardcoded as `ama-logs` -- **Works because:** Sequential deployments reuse same ServiceAccount -- **Pattern:** `helm upgrade` updates existing resources, doesn't recreate - -### Image Tag Handling -- **Current:** Hardcoded in `values.yaml` -- **Override:** Via `--set` flags at deployment time -- **Files affected:** None (pure runtime override) - -### Files with ServiceAccount References (No changes needed) -1. `templates/ama-logs-rbac.yaml` - Creates ServiceAccount `ama-logs` -2. `templates/ama-logs-daemonset.yaml` - References `serviceAccountName: ama-logs` -3. `templates/ama-logs-daemonset-windows.yaml` - References `serviceAccountName: ama-logs` -4. `templates/ama-logs-deployment.yaml` - References `serviceAccountName: ama-logs` - ---- - -## How It Works - -### Deployment Flow - -``` -┌─────────────────────────────────────────────────────────────┐ -│ 1. PR Merged to Main Branch │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 2. Build Pipeline Triggered │ -│ - BuildLinuxImages stage → produces linuxImageTag │ -│ - BuildWindowsImages stage → produces windowsImageTag │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 3. Deploy_Dev_Cluster Stage │ -│ - Gets image tags from build stages │ -│ - Runs: helm upgrade ama-logs-dev --install │ -│ - Overrides: image.tag=$(linuxImageTag) │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 4. Helm Deployment on Dev Cluster │ -│ - First run: Creates new release "ama-logs-dev" │ -│ - Subsequent runs: Updates existing release │ -│ - ServiceAccount "ama-logs" reused (no conflicts) │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 5. 
Dev Cluster Running Latest Build │ -│ - DaemonSet updated with new image tags │ -│ - Windows DaemonSet updated with new image tags │ -│ - Deployment (ReplicaSet) updated with new image tags │ -└─────────────────────────────────────────────────────────────┘ -``` - -### Sequential Deployment Example - -```bash -# Build 1 - Creates initial deployment -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=3.1.30-20231101 \ - --set amalogs.image.tagWindows=win-3.1.30-20231101 -# Result: New release created, ServiceAccount "ama-logs" created - -# Build 2 - Updates existing deployment -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=3.1.30-20231102 \ - --set amalogs.image.tagWindows=win-3.1.30-20231102 -# Result: Release updated, ServiceAccount "ama-logs" reused ✅ - -# Build 3 - Updates existing deployment -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=3.1.30-20231103 \ - --set amalogs.image.tagWindows=win-3.1.30-20231103 -# Result: Release updated, ServiceAccount "ama-logs" reused ✅ -``` - ---- - -## Testing Plan - -### Pre-Deployment Testing - -1. **Validate Chart Templates:** -```bash -cd Docker-Provider/charts/azuremonitor-containers -helm template ama-logs-dev . \ - --set amalogs.image.tag=test-tag \ - --set amalogs.image.tagWindows=test-tag-win \ - --debug -``` - -2. **Dry Run Deployment:** -```bash -helm upgrade ama-logs-dev . --install \ - --namespace kube-system \ - --set amalogs.image.tag=test-tag \ - --dry-run --debug -``` - -### Post-Deployment Validation - -1. **Check Pipeline Execution:** - - Verify Deploy_Dev_Cluster stage runs - - Check image tags are passed correctly - - Confirm Helm deployment succeeds - -2. **Verify Cluster Deployment:** -```bash -# Check pods are running -kubectl get pods -n kube-system | grep ama-logs - -# Verify DaemonSet -kubectl describe daemonset ama-logs -n kube-system - -# Verify Windows DaemonSet -kubectl describe daemonset ama-logs-win -n kube-system - -# Verify Deployment (ReplicaSet) -kubectl describe deployment ama-logs-rs -n kube-system - -# Check image tags match build -kubectl get daemonset ama-logs -n kube-system -o jsonpath='{.spec.template.spec.containers[0].image}' -``` - -3. 
**Verify ServiceAccount:** -```bash -# Confirm ServiceAccount exists and is used -kubectl get serviceaccount ama-logs -n kube-system -kubectl get pods -n kube-system -l dsName=ama-logs-ds -o jsonpath='{.items[0].spec.serviceAccountName}' -``` - ---- - -## Rollback Plan - -If deployment fails or causes issues: - -### Option 1: Rollback via Helm -```bash -# List releases -helm list -n kube-system - -# Rollback to previous version -helm rollback ama-logs-dev -n kube-system -``` - -### Option 2: Manual Revert -```bash -# Revert to specific image version -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=PREVIOUS-WORKING-TAG \ - --set amalogs.image.tagWindows=PREVIOUS-WORKING-TAG-win -``` - -### Option 3: Remove Pipeline Stage -- Comment out `Deploy_Dev_Cluster` stage in pipeline -- Commit and push -- Cluster remains at current version - ---- - -## Comparison: CI Agent vs Prom Agent - -| Aspect | Prom Agent | CI Agent (This Plan) | -|--------|-----------|---------------------| -| **Chart Changes** | None | None | -| **ServiceAccount** | Hardcoded `ama-metrics-serviceaccount` | Hardcoded `ama-logs` | -| **Deployment Method** | `helm upgrade --install` | `helm upgrade --install` | -| **Release Name** | `ama-metrics` | `ama-logs-dev` | -| **Image Override** | `--set image.tag=...` | `--set amalogs.image.tag=...` | -| **Multiple Versions** | ❌ Not supported | ❌ Not supported (sequential only) | -| **Cluster Strategy** | One release per cluster | One release per cluster | - ---- - -## Estimated Effort - -| Task | Effort | Notes | -|------|--------|-------| -| Add deployment stage to pipeline | 30 min | Copy from Prom agent pattern | -| Update cluster name/RG variables | 5 min | Simple config update | -| Create Azure DevOps environment | 5 min | One-time setup | -| Verify build tag exports | 15 min | May already exist | -| Test dry-run deployment | 15 min | Validate before merge | -| Deploy and validate | 30 min | First deployment + verification | -| **Total** | **~2 hours** | Including testing and validation | - ---- - -## Future Enhancements (Optional) - -### 1. Add E2E Tests Post-Deployment -Similar to Prom agent's TestKube integration: -```yaml -- job: Run_E2E_Tests - dependsOn: Deploy_AKS_Chart - steps: - - script: kubectl testkube run testsuite ci-agent-e2e-tests -``` - -### 2. Deploy to Multiple Dev Clusters -Add additional deployment jobs for different regions: -```yaml -- deployment: Deploy_EUS_Cluster - cluster: ci-dev-aks-eus - -- deployment: Deploy_WUS_Cluster - cluster: ci-dev-aks-wus -``` - -### 3. Slack/Teams Notifications -Notify team of successful deployments: -```yaml -- task: SlackNotification@1 - inputs: - message: "✅ CI Agent $(linuxImageTag) deployed to dev cluster" -``` - ---- - -## References - -- **Prom Agent Build Pipeline:** `prometheus-collector/.pipelines/azure-pipeline-build.yml` -- **CI Agent Current Pipeline:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` -- **Helm Chart:** `Docker-Provider/charts/azuremonitor-containers/` -- **Prom Agent Chart:** `prometheus-collector/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/` - ---- - -## Questions & Answers - -### Q: Why not use Release.Name for ServiceAccount? -**A:** Not needed for sequential deployments. Same release name = same ServiceAccount = no conflicts. Only needed for parallel deployments (multiple versions simultaneously). - -### Q: Can we deploy multiple versions to same cluster? -**A:** No, with current approach (hardcoded ServiceAccount). 
Would require chart modifications to use `{{ .Release.Name }}` pattern. Not recommended unless specifically needed. - -### Q: What if build fails? -**A:** Deploy stage has `condition: succeeded()` - won't run if build fails. Cluster stays at previous version. - -### Q: How to deploy to production? -**A:** This plan is for dev cluster only. Production deployments should continue using existing release pipeline with proper approvals and phased rollouts. - ---- - -## Status - -- [x] Research Prom agent pattern -- [x] Document findings -- [x] Create implementation plan -- [ ] Update pipeline with deployment stage -- [ ] Test deployment to dev cluster -- [ ] Validate with team -- [ ] Merge to main branch - ---- - -**Last Updated:** 2025-11-07 -**Author:** Implementation plan based on Prom agent analysis -**Status:** Ready for implementation From 6a09f91eca9cc69dea72a529cce937acaf146e63 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:06:08 -0800 Subject: [PATCH 26/40] minor --- test/testkube/helm-testkube-values.yaml | 4 ---- test/testkube/install-and-execute-testkube-tests.sh | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index 29727dcfba..bd8d5f5796 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -1304,7 +1304,3 @@ testkube-operator: # ref: https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#node-affinity-multi-arch-arm # -- Tolerations to schedule a workload to nodes with any architecture type. Required for deployment to GKE cluster. tolerations: [] -<<<<<<< HEAD - -======= ->>>>>>> efd34efac (add bebugging) diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index 1df4bccf47..7f5c8334d3 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -126,3 +126,4 @@ EOF # Explicitly fail the ADO task since at least one test failed exit 1 fi + From eb7c9a942fe4ce3b0aae15881fb13b9505d9dc75 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:07:54 -0800 Subject: [PATCH 27/40] pull from ci_prod --- .config/guardian/.gdnbaselines | 3 +-- test/testkube/helm-testkube-values.yaml | 7 ++++--- .../install-and-execute-testkube-tests.sh | 18 ++---------------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines index a7976d8fde..2b12b418dd 100644 --- a/.config/guardian/.gdnbaselines +++ b/.config/guardian/.gdnbaselines @@ -154,5 +154,4 @@ "justification": "This error is baselined with an expiration date of 180 days from 2025-05-20 23:41:13Z" } } -} - +} \ No newline at end of file diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index bd8d5f5796..8039c50e56 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -110,11 +110,11 @@ mongodb: # Currently Bitnami doesn't support ARM: https://github.com/bitnami/charts/issues/7305 image: # -- MongoDB image registry - registry: docker.io + registry: mcr.microsoft.com # -- MongoDB image repository - repository: bitnami/mongodb + repository: azuremonitor/containerinsights/cidev # -- MongoDB image tag - tag: latest + tag: mongodb_6.0.5-debian-11-r64 # -- MongoDB image pull Secret pullSecrets: [] nodeSelector: @@ -1304,3 +1304,4 @@ 
testkube-operator: # ref: https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#node-affinity-multi-arch-arm # -- Tolerations to schedule a workload to nodes with any architecture type. Required for deployment to GKE cluster. tolerations: [] + diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index 7f5c8334d3..fb8f55b89c 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -24,19 +24,6 @@ echo "deb https://repo.testkube.io/linux linux main" | sudo tee -a /etc/apt/sour sudo apt-get update sudo apt-get install -y testkube -echo "Checking for existing Testkube installation..." -if helm list -n testkube 2>/dev/null | grep -q testkube; then - echo "Found existing Testkube installation. Cleaning up..." - helm uninstall testkube -n testkube || true - echo "Deleting testkube namespace..." - kubectl delete namespace testkube --wait=true --timeout=120s || true - echo "Waiting for namespace to fully terminate..." - sleep 30 - echo "Cleanup complete!" -else - echo "No existing Testkube installation found." -fi - echo "Install testkube on the cluster" helm repo add kubeshop https://kubeshop.github.io/helm-charts helm repo update @@ -52,7 +39,7 @@ envsubst < ./testkube-test-crs.yaml > ./testkube-test-crs-updated.yaml kubectl apply -f ./testkube-test-crs-updated.yaml echo "Wait for cluster to be ready" -sleep 200 +sleep 120 echo "Run testkube tests" execution_id="" @@ -125,5 +112,4 @@ EOF # Explicitly fail the ADO task since at least one test failed exit 1 -fi - +fi \ No newline at end of file From abb61eb5552f8543ca3ab1eb99d447c6d142561c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:12:31 -0800 Subject: [PATCH 28/40] rebase --- test/testkube/helm-testkube-values.yaml | 6 +++--- .../install-and-execute-testkube-tests.sh | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index 8039c50e56..04a273fe69 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -110,11 +110,11 @@ mongodb: # Currently Bitnami doesn't support ARM: https://github.com/bitnami/charts/issues/7305 image: # -- MongoDB image registry - registry: mcr.microsoft.com + registry: docker.io # -- MongoDB image repository - repository: azuremonitor/containerinsights/cidev + repository: bitnami/mongodb # -- MongoDB image tag - tag: mongodb_6.0.5-debian-11-r64 + tag: latest # -- MongoDB image pull Secret pullSecrets: [] nodeSelector: diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index fb8f55b89c..164be7fe69 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -24,6 +24,19 @@ echo "deb https://repo.testkube.io/linux linux main" | sudo tee -a /etc/apt/sour sudo apt-get update sudo apt-get install -y testkube +echo "Checking for existing Testkube installation..." +if helm list -n testkube 2>/dev/null | grep -q testkube; then + echo "Found existing Testkube installation. Cleaning up..." + helm uninstall testkube -n testkube || true + echo "Deleting testkube namespace..." + kubectl delete namespace testkube --wait=true --timeout=120s || true + echo "Waiting for namespace to fully terminate..." + sleep 30 + echo "Cleanup complete!" 
+else + echo "No existing Testkube installation found." +fi + echo "Install testkube on the cluster" helm repo add kubeshop https://kubeshop.github.io/helm-charts helm repo update @@ -39,7 +52,7 @@ envsubst < ./testkube-test-crs.yaml > ./testkube-test-crs-updated.yaml kubectl apply -f ./testkube-test-crs-updated.yaml echo "Wait for cluster to be ready" -sleep 120 +sleep 200 echo "Run testkube tests" execution_id="" From 5c2b3e376acd75b81897bec909fe8fd5fbda9769 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 19:50:23 -0800 Subject: [PATCH 29/40] wait 5 minutes --- ...eploy-and-test-ci-image-in-aks-cluster.yml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index a9f744da61..9d6c5483eb 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -161,6 +161,34 @@ jobs: echo "Current ama-logs pods:" kubectl get pods -n kube-system | grep ama-logs + # Wait for Kubernetes API to update and pods to begin restarting + - task: Bash@3 + displayName: 'Wait for pod patch to propagate (5 minutes)' + inputs: + targetType: 'inline' + script: | + echo "========================================" + echo "Waiting for Pod Patch Propagation" + echo "========================================" + echo "" + echo "Waiting 5 minutes" + echo "" + + wait_time=300 + interval=30 + elapsed=0 + + while [ $elapsed -lt $wait_time ]; do + remaining=$((wait_time - elapsed)) + echo "⏳ Waiting... ($elapsed/$wait_time seconds elapsed, $remaining seconds remaining)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "" + echo "✓ Wait complete! Now checking actual pod readiness status..." + echo "========================================" + # Pre-test verification: Wait for pods to be ready with new images - task: Bash@3 displayName: 'Pre-Test: Wait for pods to be ready with new images' From bf5126beb2404ad953970fcecccb1df9ecbe7a57 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 20:39:05 -0800 Subject: [PATCH 30/40] remove container start time check wait --- .pipelines/e2e-test/capture-container-start-time.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh index b9e25b0d67..b3c9b4322d 100644 --- a/.pipelines/e2e-test/capture-container-start-time.sh +++ b/.pipelines/e2e-test/capture-container-start-time.sh @@ -29,9 +29,6 @@ echo "Container Start Time Capture" echo "================================" echo "Capturing LATEST container start time for Log Analytics queries..." echo "" -echo "Waiting 60 seconds for Kubernetes API to update container status..." -sleep 60 -echo "Proceeding with container start time capture..." 
# Build pod configurations using shared function From 97d2d8ca73b0d67938ea7280bc7b3f30caf60048 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 20:43:04 -0800 Subject: [PATCH 31/40] 20 min wait time --- .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index 9d6c5483eb..22af526ac2 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -244,7 +244,7 @@ jobs: echo "This ensures queries will find logs from the newly deployed containers." echo "" - wait_time=900 #TODO: change back to 1200 (20 minutes) after testing + wait_time=1200 #TODO: change back to 1200 (20 minutes) after testing interval=60 elapsed=0 From b0ea65e2af6e132f89a91423fc0759eb1027c22c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 20:48:07 -0800 Subject: [PATCH 32/40] echo utc timing --- .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index 22af526ac2..f25bdc793d 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -158,6 +158,7 @@ jobs: echo "" echo "Pod patching complete!" + echo "Current UTC time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" echo "Current ama-logs pods:" kubectl get pods -n kube-system | grep ama-logs From 8bcad01b9ed648fd0f25a8b594c52195602344eb Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 12:34:02 -0800 Subject: [PATCH 33/40] fix trigger condition --- .pipelines/azure_pipeline_mergedbranches.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 9fa66fbe49..19c4e240f3 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -896,14 +896,19 @@ extends: dependsOn: - Build_And_Publish_Images # Deploy runs when Build succeeds OR when Build is skipped with valid overrides - # TODO: remove eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), - # this stage runs when Build_And_Publish_Images succeeds or is skipped with valid overrides. + # This stage runs when: + # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e' + # 2. 
PR from zane/ci-agent-auto-deploy branch condition: | and( or( eq(variables['Build.SourceBranch'], 'refs/heads/ci_prod'), eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), - contains(variables['Build.SourceBranch'], 'run-e2e') + contains(variables['Build.SourceBranch'], 'run-e2e'), + and( + eq(variables['Build.Reason'], 'PullRequest'), + eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy') + ) ), or( eq(dependencies.Build_And_Publish_Images.result, 'Succeeded'), From d1379607c4bda628432b7aa78d3f2f760d895088 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 13:45:41 -0800 Subject: [PATCH 34/40] trigger when PR branch contains run-e2e --- .pipelines/azure_pipeline_mergedbranches.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 19c4e240f3..1284da3eae 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -898,7 +898,7 @@ extends: # Deploy runs when Build succeeds OR when Build is skipped with valid overrides # This stage runs when: # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e' - # 2. PR from zane/ci-agent-auto-deploy branch + # 2. PR from zane/ci-agent-auto-deploy branch OR PR from branch containing 'run-e2e' condition: | and( or( @@ -907,7 +907,10 @@ extends: contains(variables['Build.SourceBranch'], 'run-e2e'), and( eq(variables['Build.Reason'], 'PullRequest'), - eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy') + or( + eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + contains(variables['System.PullRequest.SourceBranch'], 'run-e2e') + ) ) ), or( From 8c7d4cb4bf3c0cb6382dee14536e5dac6cd37040 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 14:08:45 -0800 Subject: [PATCH 35/40] fix trigger --- .pipelines/azure_pipeline_mergedbranches.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 1284da3eae..4f28c9b6aa 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -908,7 +908,7 @@ extends: and( eq(variables['Build.Reason'], 'PullRequest'), or( - eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + eq(variables['System.PullRequest.SourceBranch'], 'zane/ci-agent-auto-deploy'), contains(variables['System.PullRequest.SourceBranch'], 'run-e2e') ) ) From 8a1bc5a3a98beaffac4f0dba74200376c66c13d8 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 23:29:38 -0800 Subject: [PATCH 36/40] rename stage name --- .pipelines/azure_pipeline_mergedbranches.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 4f28c9b6aa..aba3589b76 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -44,7 +44,7 @@ extends: stages: # This stage will be skipped when LinuxImageOverride and WindowsImageOverride are both set # This feature allows bypassing the build stage when using pre-built images for testing, which saves time and resources. 
-  - stage: Build_And_Publish_Images
+  - stage: stage
     displayName: 'Build and Publish Container Images'
     condition: |
       or(
         eq(variables['LinuxImageOverride'], ''),
         eq(variables['WindowsImageOverride'], '')
       )
@@ -894,7 +894,7 @@ extends:
     displayName: Deploy and Test Images in Dev Clusters
     lockBehavior: sequential
     dependsOn:
-    - Build_And_Publish_Images
+    - stage
     # Deploy runs when Build succeeds OR when Build is skipped with valid overrides
     # This stage runs when:
     # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e'
@@ -914,9 +914,9 @@ extends:
         )
       ),
       or(
-        eq(dependencies.Build_And_Publish_Images.result, 'Succeeded'),
+        eq(dependencies.stage.result, 'Succeeded'),
        and(
-          eq(dependencies.Build_And_Publish_Images.result, 'Skipped'),
+          eq(dependencies.stage.result, 'Skipped'),
          ne(variables['LinuxImageOverride'], ''),
          ne(variables['WindowsImageOverride'], '')
        )
@@ -925,8 +925,8 @@ extends:
     variables:
       # Use images built from previous build stage by default
       # To override: Set pipeline variables 'LinuxImageOverride' and 'WindowsImageOverride' when queuing
-      linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.linuxImagetag'])]
-      windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.windowsImageTag'])]
+      linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.stage.common.outputs['setup.linuxImagetag'])]
+      windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.stage.common.outputs['setup.windowsImageTag'])]
     jobs:
       # TODO: gradually add more clusters from test automation framework when the tests are stable
       # TODO: TeamsWebhookUri to be added

From 96ef037d77a9a747726cfd70f63ff35976ea2b71 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Fri, 12 Dec 2025 13:24:18 -0800
Subject: [PATCH 37/40] push image to acr during pr for particular branch

---
 .pipelines/azure_pipeline_mergedbranches.yaml | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml
index aba3589b76..a578302d33 100644
--- a/.pipelines/azure_pipeline_mergedbranches.yaml
+++ b/.pipelines/azure_pipeline_mergedbranches.yaml
@@ -193,7 +193,17 @@ extends:
               docker pull mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1
               docker buildx create --name dockerbuilder --driver docker-container --driver-opt image=mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1 --use
               docker buildx inspect --bootstrap
+              # Determine if we should push to ACR
+              # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e')
+              SHOULD_PUSH="false"
               if [ "$(Build.Reason)" != "PullRequest" ]; then
+                SHOULD_PUSH="true"
+              elif [[ "$(System.PullRequest.SourceBranch)" == "zane/ci-agent-auto-deploy" ]] || [[ "$(System.PullRequest.SourceBranch)" == *"run-e2e"* ]]; then
+                SHOULD_PUSH="true"
+                echo "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing"
+              fi
+
+              if [ "$SHOULD_PUSH" == "true" ]; then
                 docker buildx build --platform $(BUILD_PLATFORMS) --tag ${{ variables.repoImageName }}:$(linuxImagetag) -f kubernetes/linux/Dockerfile.multiarch --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --build-arg IMAGE_TAG=$(linuxTelemetryTag) --build-arg GOLANG_BASE_IMAGE=$(GOLANG_BASE_IMAGE) --build-arg 
CI_BASE_IMAGE=$(CI_BASE_IMAGE) --push --provenance=false . echo "##vso[task.logissue type=warning]Linux image built with tag: ${{ variables.repoImageName }}:$(linuxImagetag)" docker pull ${{ variables.repoImageName }}:$(linuxImagetag) @@ -551,7 +561,16 @@ extends: inputs: targetType: 'inline' script: | + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + } + + if ($shouldPush) { docker push ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2019BaseImageVersion) } - task: CodeQL3000Finalize@0 @@ -759,7 +778,16 @@ extends: inputs: targetType: 'inline' script: | + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + } + + if ($shouldPush) { docker push ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2022BaseImageVersion) } - task: CodeQL3000Finalize@0 @@ -800,7 +828,16 @@ extends: az account set -s ${{ variables.subscription }} az acr login -n ${{ variables.containerRegistry }} @{"image.name"="${{ variables.repoImageName }}:$(windowsImageTag)"} | ConvertTo-Json -Compress | Out-File -Encoding ascii $(Build.ArtifactStagingDirectory)/windows/metadata.json + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push multi-arch image to ACR for E2E testing" + } + + if ($shouldPush) { docker manifest create ${{ variables.repoImageName }}:$(windowsImageTag) ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2019BaseImageVersion) ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2022BaseImageVersion) docker manifest push ${{ variables.repoImageName }}:$(windowsImageTag) Write-Host "##vso[task.logissue type=warning]Windows image built with tag: ${{ variables.repoImageName }}:$(windowsImageTag)" From c3a811cba2bb1d771ddcfa9e044d8410e50a4062 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 12 Dec 2025 15:18:55 -0800 Subject: [PATCH 38/40] pin fluentd version --- kubernetes/linux/setup.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index ed037d598c..37a46d487c 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -78,6 +78,12 @@ echo "$(fluent-bit --version)" >> packages_version.txt # install fluentd fluentd_version="1.16.3" + +# Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) +if [ "$ARCH" == 
"arm64" ]; then + gem install cool.io -v "1.8.0" --no-document +fi + gem install fluentd -v $fluentd_version --no-document # remove the test directory from fluentd From ca3cb540a1ba79edace9945a9fc30bfe4ca30e52 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 12 Dec 2025 16:21:51 -0800 Subject: [PATCH 39/40] pin cool.io to 1.9.0 --- kubernetes/linux/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 37a46d487c..71bf8f38ff 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -81,7 +81,7 @@ fluentd_version="1.16.3" # Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) if [ "$ARCH" == "arm64" ]; then - gem install cool.io -v "1.8.0" --no-document + gem install cool.io -v "1.9.0" --no-document fi gem install fluentd -v $fluentd_version --no-document From bea28083a22487987b93db476b856d4256fa09b4 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 12 Dec 2025 17:15:31 -0800 Subject: [PATCH 40/40] remove cool.io install --- kubernetes/linux/setup.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 71bf8f38ff..31883f858c 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -80,9 +80,9 @@ echo "$(fluent-bit --version)" >> packages_version.txt fluentd_version="1.16.3" # Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) -if [ "$ARCH" == "arm64" ]; then - gem install cool.io -v "1.9.0" --no-document -fi +# if [ "$ARCH" == "arm64" ]; then +# gem install cool.io -v "1.9.0" --no-document +# fi gem install fluentd -v $fluentd_version --no-document