From 17e48339a30958302c4e54bb809135af6762f90b Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 15:17:46 -0800 Subject: [PATCH 01/40] add bebugging --- .config/guardian/.gdnbaselines | 2 +- .pipelines/azure_pipeline_mergedbranches.yaml | 69 ++- ...eploy-and-test-ci-image-in-aks-cluster.yml | 263 +++++++++++ .pipelines/e2e-test/verify-pod-images.sh | 388 ++++++++++++++++ ...I-Agent-Auto-Deploy-Implementation-Plan.md | 417 ++++++++++++++++++ test/testkube/helm-testkube-values.yaml | 3 + .../install-and-execute-testkube-tests.sh | 2 +- 7 files changed, 1140 insertions(+), 4 deletions(-) create mode 100644 .pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml create mode 100644 .pipelines/e2e-test/verify-pod-images.sh create mode 100644 Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines index 2b12b418dd..eff01b8012 100644 --- a/.config/guardian/.gdnbaselines +++ b/.config/guardian/.gdnbaselines @@ -154,4 +154,4 @@ "justification": "This error is baselined with an expiration date of 180 days from 2025-05-20 23:41:13Z" } } -} \ No newline at end of file +} diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 291772961a..db5c896550 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -42,7 +42,15 @@ extends: customBuildTags: - ES365AIMigrationTooling stages: - - stage: stage + # This stage will be skipped when LinuxImageOverride and WindowsImageOverride are both set + # This feature allows bypassing the build stage when using pre-built images for testing, which saves time and resources. + - stage: Build_And_Publish_Images + displayName: 'Build and Publish Container Images' + condition: | + or( + eq(variables['LinuxImageOverride'], ''), + eq(variables['WindowsImageOverride'], '') + ) jobs: - job: common pool: @@ -880,4 +888,61 @@ extends: ScanType: CustomScan FileDirPath: '$(Build.ArtifactStagingDirectory)' DisableRemediation: false - AcceptableOutdatedSignatureInHours: 72 \ No newline at end of file + AcceptableOutdatedSignatureInHours: 72 + - stage: Deploy_and_Test_Images_In_Dev_Clusters + displayName: Deploy and Test Images in Dev Clusters + lockBehavior: sequential + dependsOn: + - Build_And_Publish_Images + # Deploy runs when Build succeeds OR when Build is skipped with valid overrides + # TODO: remove eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + # this stage runs when Build_And_Publish_Images succeeds or is skipped with valid overrides. 
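+      # Example (hypothetical tag values) of queue-time overrides that skip the
+      # build stage and deploy pre-built images instead:
+      #   LinuxImageOverride   = 3.1.32-g1a2b3c4d5-20251204120000
+      #   WindowsImageOverride = win-3.1.32-g1a2b3c4d5-20251204120000
+      # If only one override is set, the build stage still runs and the coalesce()
+      # expressions below fall back to the freshly built tag for the other platform.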
+ condition: | + and( + or( + eq(variables['Build.SourceBranch'], 'refs/heads/ci_prod'), + eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + contains(variables['Build.SourceBranch'], 'run-e2e') + ), + or( + eq(dependencies.Build_And_Publish_Images.result, 'Succeeded'), + and( + eq(dependencies.Build_And_Publish_Images.result, 'Skipped'), + ne(variables['LinuxImageOverride'], ''), + ne(variables['WindowsImageOverride'], '') + ) + ) + ) + variables: + # Use images built from previous build stage by default + # To override: Set pipeline variables 'LinuxImageOverride' and 'WindowsImageOverride' when queuing + linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.linuxImagetag'])] + windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.windowsImageTag'])] + jobs: + # TODO: gradually add more clusters from test automation framework when the tests are stable + # TODO: TeamsWebhookUri to be added + # Cluster 1: zane-test Cluster + - template: /.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml@self + parameters: + clusterName: 'zane-test' + resourceGroup: 'zane-test' + azureSubscription: 'ContainerInsights_Build_Subscription_CI' + environmentName: 'CI-Agent-Dev' + linuxImageTag: $(linuxImageTagUnderTest) + windowsImageTag: $(windowsImageTagUnderTest) + azureClientId: $(AksZaneTestClientId) + azureTenantId: $(AzureZaneTestTenantId) + teamsWebhookUri: $(TeamsWebhookUri) + + # Cluster 2: zane-test2 Cluster + - template: /.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml@self + parameters: + clusterName: 'zane-test2' + resourceGroup: 'zane-test' + azureSubscription: 'ContainerInsights_Build_Subscription_CI' + environmentName: 'CI-Agent-Dev2' + linuxImageTag: $(linuxImageTagUnderTest) + windowsImageTag: $(windowsImageTagUnderTest) + azureClientId: $(AksZaneTest2ClientId) + azureTenantId: $(AzureZaneTestTenantId) + teamsWebhookUri: $(TeamsWebhookUri) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml new file mode 100644 index 0000000000..69f557c8b4 --- /dev/null +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -0,0 +1,263 @@ +parameters: +- name: clusterName + type: string +- name: resourceGroup + type: string +- name: azureSubscription + type: string + default: 'ContainerInsights_Build_Subscription_CI' +- name: environmentName + type: string +- name: linuxImageTag + type: string +- name: windowsImageTag + type: string +- name: azureClientId + type: string +- name: azureTenantId + type: string +- name: teamsWebhookUri + type: string + default: '$(TeamsWebhookUri)' +- name: additionalTestParams + type: string + default: '' + +jobs: +- deployment: Deploy_${{ replace(parameters.clusterName, '-', '_') }} + displayName: 'Deploy & Test: ${{ parameters.clusterName }}' + environment: ${{ parameters.environmentName }} + pool: + name: Azure-Pipelines-CI-Test-EO + image: ci-1es-managed-ubuntu-2204 + os: linux + variables: + skipComponentGovernanceDetection: true + strategy: + runOnce: + deploy: + steps: + # Log deployment start + - bash: | + set -euo pipefail + + echo "=========================================" + echo "CLUSTER DEPLOYMENT STARTING" + echo "=========================================" + echo 
"Cluster: ${{ parameters.clusterName }}" + echo "Environment: ${{ parameters.environmentName }}" + echo "Build ID: $(Build.BuildId)" + echo "Pipeline Run: $(Build.BuildNumber)" + echo "" + echo "✓ Sequential deployment locking enabled at stage level" + echo "✓ Multiple pipeline runs will execute sequentially" + echo "=========================================" + displayName: 'Deployment Start' + + - checkout: self + persistCredentials: true + + - script: | + set -euo pipefail + echo "Ensuring kubectl & helm are installed" + if ! command -v kubectl >/dev/null 2>&1; then + echo "Installing kubectl" + sudo az aks install-cli + else + echo "kubectl already installed: $(kubectl version --client --short || true)" + fi + if ! command -v helm >/dev/null 2>&1; then + echo "Installing Helm 3" + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + else + echo "Helm already installed: $(helm version --short || true)" + fi + displayName: 'Install kubectl and Helm' + + - task: AzureCLI@2 + displayName: 'Get credentials for ${{ parameters.clusterName }}' + inputs: + azureSubscription: ${{ parameters.azureSubscription }} + scriptLocation: 'inlineScript' + scriptType: 'bash' + inlineScript: 'az aks get-credentials -g ${{ parameters.resourceGroup }} -n ${{ parameters.clusterName }}' + + # Determine MCR repository paths based on image tags. + - task: Bash@3 + name: DetermineMcrRepo + displayName: 'Determine MCR Repository Paths' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + inputs: + targetType: 'inline' + script: | + # Function to determine registry path based on image tag + # CI dev builds contain git hash pattern (e.g., -gbdc2f3f42-20250701203056) + # Production releases are simple versions (e.g., 3.1.32) + get_mcr_repo() { + local image_tag="$1" + if [[ "$image_tag" =~ -g[a-f0-9]+-[0-9]+ ]]; then + echo "mcr.microsoft.com/azuremonitor/containerinsights/cidev" + else + echo "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" + fi + } + + LINUX_MCR_REPO=$(get_mcr_repo "$LINUX_IMAGE_TAG") + WINDOWS_MCR_REPO=$(get_mcr_repo "$WINDOWS_IMAGE_TAG") + + echo "Repository Path Detection:" + echo " Linux image tag: $LINUX_IMAGE_TAG" + echo " → Linux MCR repo: $LINUX_MCR_REPO" + echo " Windows image tag: $WINDOWS_IMAGE_TAG" + echo " → Windows MCR repo: $WINDOWS_MCR_REPO" + + # Export for subsequent steps + echo "##vso[task.setvariable variable=linuxMcrRepo;isOutput=true]$LINUX_MCR_REPO" + echo "##vso[task.setvariable variable=windowsMcrRepo;isOutput=true]$WINDOWS_MCR_REPO" + + # TODO: consider to use helm chart when it is ready for aks deployment + - task: Bash@3 + displayName: 'Patch ama-logs pods with new images' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + echo "Deploying to cluster: ${{ parameters.clusterName }}" + echo " Linux image: $LINUX_MCR_REPO:$LINUX_IMAGE_TAG" + echo " Windows image: $WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + echo "" + echo "Finding and patching ama-logs pods in kube-system namespace..." 
+ + kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}' | while read pod_name; do + echo "Processing pod: $pod_name" + + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + IMG_URL="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + IMG_URL="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" + container_name="ama-logs" + else + echo " ⚠ Unknown pod pattern: $pod_name - skipping" + continue + fi + + echo " → Patching with image: $IMG_URL (container: $container_name)" + + kubectl patch pod "$pod_name" -n kube-system \ + --patch "{\"spec\": {\"containers\": [{\"name\": \"$container_name\", \"image\": \"$IMG_URL\"}]}}" \ + && echo " ✓ Successfully patched $pod_name" \ + || echo " ✗ Failed to patch $pod_name" + done + + echo "" + echo "Pod patching complete!" + echo "Current ama-logs pods:" + kubectl get pods -n kube-system | grep ama-logs + + # verify ci agent gets the new images + # output container start time for log analytics filtering + - task: Bash@3 + name: VerifyPods + displayName: 'Wait for pods to be ready with new images' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh pre-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Export container start time for use in tests + if [ -f /tmp/container-deployment-time.env ]; then + source /tmp/container-deployment-time.env + echo "Container start time captured: $CONTAINER_START_TIME" + echo "##vso[task.setvariable variable=CONTAINER_START_TIME;isOutput=true]$CONTAINER_START_TIME" + else + echo "ERROR: Container start time not found at /tmp/container-deployment-time.env" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + + - task: Bash@3 + displayName: 'Wait for logs to be ingested into Log Analytics (20 min)' + inputs: + targetType: 'inline' + script: | + echo "========================================" + echo "Waiting for Log Analytics Ingestion" + echo "========================================" + echo "Cluster: ${{ parameters.clusterName }}" + echo "Container start time: $(VerifyPods.CONTAINER_START_TIME)" + echo "" + echo "Waiting 20 minutes to allow logs to be ingested..." + echo "This ensures queries will find logs from the newly deployed containers." + echo "" + + wait_time=1200 + interval=60 + elapsed=0 + + while [ $elapsed -lt $wait_time ]; do + remaining=$((wait_time - elapsed)) + minutes_elapsed=$((elapsed / 60)) + minutes_remaining=$((remaining / 60)) + echo "⏳ Waiting... ($minutes_elapsed/$((wait_time / 60)) minutes elapsed, $minutes_remaining minutes remaining)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "" + echo "✓ Wait complete! Logs should now be available in Log Analytics." + echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(VerifyPods.CONTAINER_START_TIME)')" + echo "========================================" + # TODO (improvement): container start time is captured in previous step, but not used for now. 
Consider passing container start time to test script to use in log queries + - bash: | + # Pass container start time to tests + export CONTAINER_START_TIME="$(VerifyPods.CONTAINER_START_TIME)" + echo "Running tests for cluster: ${{ parameters.clusterName }}" + echo "Container start time: $CONTAINER_START_TIME" + + chmod +x ./install-and-execute-testkube-tests.sh + ./install-and-execute-testkube-tests.sh \ + AzureClientId=${{ parameters.azureClientId }} \ + AzureTenantId=${{ parameters.azureTenantId }} \ + TeamsWebhookUri=${{ parameters.teamsWebhookUri }} \ + ${{ parameters.additionalTestParams }} + workingDirectory: $(Build.SourcesDirectory)/test/testkube/ + displayName: 'Install Testkube and run E2E tests' + + - task: Bash@3 + displayName: 'Verify images remained stable after tests' + condition: always() + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh post-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Log deployment completion + - bash: | + echo "=========================================" + echo "DEPLOYMENT COMPLETE" + echo "=========================================" + echo "Cluster: ${{ parameters.clusterName }}" + echo "Build ID: $(Build.BuildId)" + echo "✓ Deployment finished for: ${{ parameters.clusterName }}" + echo "=========================================" + displayName: 'Deployment Completion' + condition: always() diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh new file mode 100644 index 0000000000..c7b4df2eb4 --- /dev/null +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -0,0 +1,388 @@ +#!/bin/bash +# Script to verify AKS pod images match expected tags +# Can be used for both pre-test and post-test verification + +set -e + +# Parse command line arguments +MODE="${1:-pre-test}" # pre-test or post-test +LINUX_IMAGE_TAG="${2}" +WINDOWS_IMAGE_TAG="${3}" +LINUX_MCR_REPO="${4}" +WINDOWS_MCR_REPO="${5}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +if [ "$MODE" = "pre-test" ]; then + echo "================================" + echo "Pre-Test Image Verification" + echo "================================" + echo "Verifying pods are running with new images and are ready..." +else + echo "================================" + echo "Post-Test Image Verification" + echo "================================" + echo "Verifying pods still have the correct images after test execution..." 
+fi + +echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Unified function to check all pods (with optional retry attempts) +# max_retries of 0 means instant check (no wait), otherwise retries up to max_retries times +check_all_pods() { + local -n configs_ref=$1 # Use different name to avoid circular reference + local max_retries=${2:-0} # Default to 0 (instant check, no retry) + local check_interval=15 # Wait 15 seconds between retries + + if [ $max_retries -gt 0 ]; then + # Wait mode (pre-test): Monitor pods with retries + local attempt=1 + + echo "================================" + echo "Waiting for all pods to be ready" + echo "================================" + echo "Total pods to check: ${#configs_ref[@]}" + echo "Maximum retries: $max_retries" + echo "Check interval: ${check_interval}s" + echo "Maximum wait time: $(((max_retries * check_interval) / 60)) minutes" + echo "" + + # Track ready status for each pod + declare -A pod_ready_status + for config in "${configs_ref[@]}"; do + pod_name=$(echo "$config" | cut -d: -f1) + pod_ready_status["$pod_name"]=false + done + + while [ $attempt -le $max_retries ]; do + local all_ready=true + local ready_count=0 + local total_count=${#configs_ref[@]} + + # Check each pod in this iteration + for config in "${configs_ref[@]}"; do + IFS=':' read -r pod_name expected_image container_name <<< "$config" + + # Skip if already marked as ready + if [ "${pod_ready_status[$pod_name]}" = "true" ]; then + ((ready_count++)) + continue + fi + + # DEBUG: Try alternative methods to get the image + # Method 1: Original jsonpath (what we've been using) + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") + + # Method 2: If method 1 is empty, try getting first container image + if [ -z "$current_image" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "") + echo " [DEBUG] Method 1 (jsonpath filter) returned empty, trying method 2 (first container)" + echo " [DEBUG] Method 2 result: $current_image" + fi + + # Method 3: If still empty, try go-template + if [ -z "$current_image" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o go-template='{{range .spec.containers}}{{if eq .name "'"$container_name"'"}}{{.image}}{{end}}{{end}}' 2>/dev/null || echo "") + echo " [DEBUG] Method 2 also empty, trying method 3 (go-template)" + echo " [DEBUG] Method 3 result: $current_image" + fi + + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + + # Try similar methods for container ready + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "") + if [ -z "$container_ready" ]; then + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") + fi + + # Check if pod is ready + if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then + pod_ready_status["$pod_name"]=true + ((ready_count++)) + echo " ✓ $pod_name - Ready" + else + all_ready=false + + # Show status for 
pods that aren't ready yet + if [ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ]; then # Log every 60 seconds + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + if [[ "$current_image" != "$expected_image" ]]; then + echo " Image mismatch: expected $expected_image, got $current_image" + fi + fi + fi + done + + # Show progress summary + local elapsed_seconds=$(((attempt - 1) * check_interval)) + local minutes_elapsed=$((elapsed_seconds / 60)) + local seconds_elapsed=$((elapsed_seconds % 60)) + local remaining_retries=$((max_retries - attempt)) + local remaining_seconds=$((remaining_retries * check_interval)) + local minutes_remaining=$((remaining_seconds / 60)) + local seconds_remaining=$((remaining_seconds % 60)) + + if [ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ] || [ "$all_ready" = true ]; then + echo "" + echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" + echo "Progress: $ready_count/$total_count pods ready" + echo "" + fi + + # Exit early if all pods are ready + if [ "$all_ready" = true ]; then + echo "================================" + echo "✓ SUCCESS: All pods are ready!" + echo "================================" + echo "Total attempts: $attempt" + echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" + echo "" + return 0 + fi + + # Don't sleep after the last attempt + if [ $attempt -lt $max_retries ]; then + sleep $check_interval + fi + + ((attempt++)) + done + + # Max retries reached - report which pods failed + echo "================================" + echo "✗ MAX RETRIES REACHED: Not all pods became ready after $max_retries attempts" + echo "================================" + echo "" + echo "Failed pods:" + for config in "${configs_ref[@]}"; do + IFS=':' read -r pod_name expected_image container_name <<< "$config" + if [ "${pod_ready_status[$pod_name]}" != "true" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") + + echo " ✗ $pod_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + fi + done + echo "" + + return 1 + else + # Instant check mode (post-test): Single check, no waiting + local mismatches=() + + echo "Performing instant verification of all pods..." + echo "" + + for config in "${configs_ref[@]}"; do + IFS=':' read -r pod_name expected_image container_name <<< "$config" + + # Use first container image as fallback + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + + echo "Pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + + if [[ "$current_image" != "$expected_image" ]]; then + echo " ✗ IMAGE MISMATCH DETECTED!" 
+ mismatches+=("$pod_name: expected '$expected_image' but found '$current_image'") + else + echo " ✓ Image is correct" + fi + echo "" + done + + # Return mismatches via global array (bash limitation workaround) + image_mismatches=("${mismatches[@]}") + + if [ ${#mismatches[@]} -eq 0 ]; then + return 0 + else + return 1 + fi + fi +} + +# Get all ama-logs pods +echo "Getting list of ama-logs pods..." +pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + +# Build configurations for all pods +pod_configs=() +image_mismatches=() + +for pod_name in $pod_list; do + # Determine expected image based on pod type + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + expected_image="$WINDOWS_IMAGE" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + # Matches both ReplicaSet pods (ama-logs-rs-*) and DaemonSet pods (ama-logs-xxxxx) + expected_image="$LINUX_IMAGE" + container_name="ama-logs" + else + echo "⚠ Unknown pod pattern: $pod_name - skipping verification" + continue + fi + + # Add to configurations for parallel checking + pod_configs+=("$pod_name:$expected_image:$container_name") +done + +echo "Found ${#pod_configs[@]} pods to verify" +echo "" + +# Use different check based on mode +if [ "$MODE" = "pre-test" ]; then + # Pre-test: Wait for all pods to be ready (60 retries × 15s = 15 minutes max) + if ! check_all_pods pod_configs 60; then + # Function already reports which pods failed + failed_pods=true + else + failed_pods=false + fi +else + # Post-test: Instant check of all pods (no retry) + check_all_pods pod_configs 0 +fi + +echo "" +echo "================================" +if [ "$MODE" = "pre-test" ]; then + echo "Pre-Test Verification Summary" +else + echo "Post-Test Verification Summary" +fi +echo "================================" + +# Report results based on mode +if [ "$MODE" = "pre-test" ]; then + if [ "$failed_pods" = false ]; then + echo "✓ All pods are running with the correct images and are ready!" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Image verification:" + kubectl get pods -n kube-system -l component=ama-logs-agent -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].image}{"\t"}{.status.phase}{"\t"}{.status.containerStatuses[0].ready}{"\n"}{end}' | column -t 2>/dev/null || true + + echo "" + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
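+    # Assumption: kubectl reports startedAt as an RFC 3339 UTC timestamp
+    # (e.g. "2025-12-04T23:41:13Z"), so the plain string comparison used below
+    # also orders timestamps chronologically; no date parsing is needed.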
+ + # Get all container start times and find the LATEST one + latest_start_time="" + + pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + for pod_name in $pod_list; do + # Get container name based on pod type + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + container_name="ama-logs" + else + continue + fi + + # Get container start time - try first container if filter doesn't work + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -z "$start_time" ]; then + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") + fi + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + + exit 0 + else + echo "✗ Pod verification failed (see details above)" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 1 + fi +else + # Post-test mode + if [ ${#image_mismatches[@]} -eq 0 ]; then + echo "✓ SUCCESS: All pods maintained the correct images throughout the test execution!" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Image summary:" + kubectl get pods -n kube-system -l component=ama-logs-agent -o custom-columns=NAME:.metadata.name,IMAGE:.spec.containers[0].image,STATUS:.status.phase,READY:.status.containerStatuses[0].ready 2>/dev/null || true + exit 0 + else + echo "✗ FAILURE: Some pods changed images during test execution!" + echo "" + echo "Pods with image mismatches:" + printf ' - %s\n' "${image_mismatches[@]}" + echo "" + echo "This indicates the pods may have been restarted or updated during testing." + echo "This could cause test instability or false results." 
+ echo "" + echo "Current pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Detailed pod information:" + for mismatch in "${image_mismatches[@]}"; do + pod=$(echo "$mismatch" | cut -d: -f1) + echo "" + echo "--- Details for $pod ---" + kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30 + done + exit 1 + fi +fi diff --git a/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md b/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md new file mode 100644 index 0000000000..98bbaf4319 --- /dev/null +++ b/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md @@ -0,0 +1,417 @@ +# CI Agent Auto-Deploy Implementation Plan + +## Overview +This document outlines the implementation plan for enabling auto-deployment of CI Agent to a dev cluster on every PR merge to main branch, following the Prom Agent pattern. + +**Goal:** Automatically deploy freshly built CI agent images to a dev cluster after each successful build on main branch. + +**Pattern:** Based on Prom Agent's `azure-pipeline-build.yml` approach - sequential deployments using `helm upgrade --install`. + +--- + +## Key Findings + +### ✅ No Chart Modifications Needed +- **ServiceAccount**: Hardcoded `ama-logs` works fine for sequential deployments +- **Image Tags**: Can be overridden via `--set` flags at deployment time +- **Release Name**: Using same release name (`ama-logs-dev`) for all deployments allows Helm to upgrade in place + +### ✅ Prom Agent Pattern +- Uses `helm upgrade --install` with same release name every time +- Deploys to different clusters (not multiple releases per cluster) +- Each cluster has exactly ONE release +- No ServiceAccount conflicts with sequential deployments + +--- + +## Implementation Changes + +### 1. Pipeline Modification + +**File:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` + +**Add Deployment Stage** after existing build stages: + +```yaml +- stage: Deploy_Dev_Cluster + displayName: Deploy to Dev Cluster + dependsOn: + - BuildLinuxImages + - BuildWindowsImages + # Only deploy on main branch merges (not PRs) + condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + + jobs: + - deployment: Deploy_AKS_Chart + displayName: "Deploy: AKS dev cluster" + environment: CI-Agent-Dev # Create this environment in Azure DevOps + pool: + name: Azure-Pipelines-CI-Test-EO + + variables: + # Get image tags from build stages + linuxImageTag: $[ stageDependencies.BuildLinuxImages.Build.outputs['setImageTag.linuxTag'] ] + windowsImageTag: $[ stageDependencies.BuildWindowsImages.Build.outputs['setImageTag.windowsTag'] ] + + strategy: + runOnce: + deploy: + steps: + - checkout: self + + - task: HelmDeploy@0 + displayName: "Deploy to dev cluster" + inputs: + connectionType: 'Azure Resource Manager' + azureSubscription: 'ContainerInsights_Build_Subscription(9b96ebbd-c57a-42d1-bbe9-b69296e4c7fb)' + azureResourceGroup: 'YOUR-DEV-CLUSTER-RG' + kubernetesCluster: 'YOUR-DEV-CLUSTER-NAME' + useClusterAdmin: true + namespace: 'kube-system' + command: 'upgrade' + chartType: 'FilePath' + chartPath: '$(Build.SourcesDirectory)/charts/azuremonitor-containers/' + releaseName: 'ama-logs-dev' + overrideValues: | + amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev + amalogs.image.tag=$(linuxImageTag) + amalogs.image.tagWindows=$(windowsImageTag) + arguments: '--install --create-namespace' +``` + +--- + +### 2. 
Ensure Build Stages Export Image Tags + +**Verify in BuildLinuxImages stage:** + +```yaml +- stage: BuildLinuxImages + jobs: + - job: Build + steps: + # ... existing build steps ... + + # Add this step to export tag + - script: | + echo "##vso[task.setvariable variable=linuxTag;isOutput=true]$(IMAGE_TAG)" + name: setImageTag + displayName: Export Linux image tag +``` + +**Verify in BuildWindowsImages stage:** + +```yaml +- stage: BuildWindowsImages + jobs: + - job: Build + steps: + # ... existing build steps ... + + # Add this step to export tag + - script: | + echo "##vso[task.setvariable variable=windowsTag;isOutput=true]$(IMAGE_TAG)" + name: setImageTag + displayName: Export Windows image tag +``` + +--- + +### 3. Configuration Updates + +**Replace these placeholders with actual values:** + +| Placeholder | Description | Example Value | +|-------------|-------------|---------------| +| `YOUR-DEV-CLUSTER-RG` | Resource group containing dev cluster | `ci-dev-aks-rg` | +| `YOUR-DEV-CLUSTER-NAME` | Name of dev AKS cluster | `ci-dev-aks-eus` | + +**Optional: Add more overrides for dev-specific configuration:** + +```yaml +overrideValues: | + amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev + amalogs.image.tag=$(linuxImageTag) + amalogs.image.tagWindows=$(windowsImageTag) + amalogs.secret.wsid=YOUR-DEV-WORKSPACE-ID + amalogs.secret.key=YOUR-DEV-WORKSPACE-KEY + amalogs.env.clusterName=ci-dev-cluster + amalogs.ISTEST=true +``` + +--- + +### 4. Azure DevOps Environment Setup + +**Create deployment environment:** +1. Navigate to: Azure DevOps → Pipelines → Environments +2. Click "New environment" +3. Name: `CI-Agent-Dev` +4. Resource: None (environment-only) +5. (Optional) Add approval gates if needed + +--- + +## Chart Details - No Modifications Required + +### ServiceAccount Handling +- **Current:** Hardcoded as `ama-logs` +- **Works because:** Sequential deployments reuse same ServiceAccount +- **Pattern:** `helm upgrade` updates existing resources, doesn't recreate + +### Image Tag Handling +- **Current:** Hardcoded in `values.yaml` +- **Override:** Via `--set` flags at deployment time +- **Files affected:** None (pure runtime override) + +### Files with ServiceAccount References (No changes needed) +1. `templates/ama-logs-rbac.yaml` - Creates ServiceAccount `ama-logs` +2. `templates/ama-logs-daemonset.yaml` - References `serviceAccountName: ama-logs` +3. `templates/ama-logs-daemonset-windows.yaml` - References `serviceAccountName: ama-logs` +4. `templates/ama-logs-deployment.yaml` - References `serviceAccountName: ama-logs` + +--- + +## How It Works + +### Deployment Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ 1. PR Merged to Main Branch │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 2. Build Pipeline Triggered │ +│ - BuildLinuxImages stage → produces linuxImageTag │ +│ - BuildWindowsImages stage → produces windowsImageTag │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 3. Deploy_Dev_Cluster Stage │ +│ - Gets image tags from build stages │ +│ - Runs: helm upgrade ama-logs-dev --install │ +│ - Overrides: image.tag=$(linuxImageTag) │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 4. 
Helm Deployment on Dev Cluster │ +│ - First run: Creates new release "ama-logs-dev" │ +│ - Subsequent runs: Updates existing release │ +│ - ServiceAccount "ama-logs" reused (no conflicts) │ +└─────────────────────────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 5. Dev Cluster Running Latest Build │ +│ - DaemonSet updated with new image tags │ +│ - Windows DaemonSet updated with new image tags │ +│ - Deployment (ReplicaSet) updated with new image tags │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Sequential Deployment Example + +```bash +# Build 1 - Creates initial deployment +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=3.1.30-20231101 \ + --set amalogs.image.tagWindows=win-3.1.30-20231101 +# Result: New release created, ServiceAccount "ama-logs" created + +# Build 2 - Updates existing deployment +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=3.1.30-20231102 \ + --set amalogs.image.tagWindows=win-3.1.30-20231102 +# Result: Release updated, ServiceAccount "ama-logs" reused ✅ + +# Build 3 - Updates existing deployment +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=3.1.30-20231103 \ + --set amalogs.image.tagWindows=win-3.1.30-20231103 +# Result: Release updated, ServiceAccount "ama-logs" reused ✅ +``` + +--- + +## Testing Plan + +### Pre-Deployment Testing + +1. **Validate Chart Templates:** +```bash +cd Docker-Provider/charts/azuremonitor-containers +helm template ama-logs-dev . \ + --set amalogs.image.tag=test-tag \ + --set amalogs.image.tagWindows=test-tag-win \ + --debug +``` + +2. **Dry Run Deployment:** +```bash +helm upgrade ama-logs-dev . --install \ + --namespace kube-system \ + --set amalogs.image.tag=test-tag \ + --dry-run --debug +``` + +### Post-Deployment Validation + +1. **Check Pipeline Execution:** + - Verify Deploy_Dev_Cluster stage runs + - Check image tags are passed correctly + - Confirm Helm deployment succeeds + +2. **Verify Cluster Deployment:** +```bash +# Check pods are running +kubectl get pods -n kube-system | grep ama-logs + +# Verify DaemonSet +kubectl describe daemonset ama-logs -n kube-system + +# Verify Windows DaemonSet +kubectl describe daemonset ama-logs-win -n kube-system + +# Verify Deployment (ReplicaSet) +kubectl describe deployment ama-logs-rs -n kube-system + +# Check image tags match build +kubectl get daemonset ama-logs -n kube-system -o jsonpath='{.spec.template.spec.containers[0].image}' +``` + +3. 
**Verify ServiceAccount:** +```bash +# Confirm ServiceAccount exists and is used +kubectl get serviceaccount ama-logs -n kube-system +kubectl get pods -n kube-system -l dsName=ama-logs-ds -o jsonpath='{.items[0].spec.serviceAccountName}' +``` + +--- + +## Rollback Plan + +If deployment fails or causes issues: + +### Option 1: Rollback via Helm +```bash +# List releases +helm list -n kube-system + +# Rollback to previous version +helm rollback ama-logs-dev -n kube-system +``` + +### Option 2: Manual Revert +```bash +# Revert to specific image version +helm upgrade ama-logs-dev ./chart --install \ + --set amalogs.image.tag=PREVIOUS-WORKING-TAG \ + --set amalogs.image.tagWindows=PREVIOUS-WORKING-TAG-win +``` + +### Option 3: Remove Pipeline Stage +- Comment out `Deploy_Dev_Cluster` stage in pipeline +- Commit and push +- Cluster remains at current version + +--- + +## Comparison: CI Agent vs Prom Agent + +| Aspect | Prom Agent | CI Agent (This Plan) | +|--------|-----------|---------------------| +| **Chart Changes** | None | None | +| **ServiceAccount** | Hardcoded `ama-metrics-serviceaccount` | Hardcoded `ama-logs` | +| **Deployment Method** | `helm upgrade --install` | `helm upgrade --install` | +| **Release Name** | `ama-metrics` | `ama-logs-dev` | +| **Image Override** | `--set image.tag=...` | `--set amalogs.image.tag=...` | +| **Multiple Versions** | ❌ Not supported | ❌ Not supported (sequential only) | +| **Cluster Strategy** | One release per cluster | One release per cluster | + +--- + +## Estimated Effort + +| Task | Effort | Notes | +|------|--------|-------| +| Add deployment stage to pipeline | 30 min | Copy from Prom agent pattern | +| Update cluster name/RG variables | 5 min | Simple config update | +| Create Azure DevOps environment | 5 min | One-time setup | +| Verify build tag exports | 15 min | May already exist | +| Test dry-run deployment | 15 min | Validate before merge | +| Deploy and validate | 30 min | First deployment + verification | +| **Total** | **~2 hours** | Including testing and validation | + +--- + +## Future Enhancements (Optional) + +### 1. Add E2E Tests Post-Deployment +Similar to Prom agent's TestKube integration: +```yaml +- job: Run_E2E_Tests + dependsOn: Deploy_AKS_Chart + steps: + - script: kubectl testkube run testsuite ci-agent-e2e-tests +``` + +### 2. Deploy to Multiple Dev Clusters +Add additional deployment jobs for different regions: +```yaml +- deployment: Deploy_EUS_Cluster + cluster: ci-dev-aks-eus + +- deployment: Deploy_WUS_Cluster + cluster: ci-dev-aks-wus +``` + +### 3. Slack/Teams Notifications +Notify team of successful deployments: +```yaml +- task: SlackNotification@1 + inputs: + message: "✅ CI Agent $(linuxImageTag) deployed to dev cluster" +``` + +--- + +## References + +- **Prom Agent Build Pipeline:** `prometheus-collector/.pipelines/azure-pipeline-build.yml` +- **CI Agent Current Pipeline:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` +- **Helm Chart:** `Docker-Provider/charts/azuremonitor-containers/` +- **Prom Agent Chart:** `prometheus-collector/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/` + +--- + +## Questions & Answers + +### Q: Why not use Release.Name for ServiceAccount? +**A:** Not needed for sequential deployments. Same release name = same ServiceAccount = no conflicts. Only needed for parallel deployments (multiple versions simultaneously). + +### Q: Can we deploy multiple versions to same cluster? +**A:** No, with current approach (hardcoded ServiceAccount). 
Would require chart modifications to use `{{ .Release.Name }}` pattern. Not recommended unless specifically needed. + +### Q: What if build fails? +**A:** Deploy stage has `condition: succeeded()` - won't run if build fails. Cluster stays at previous version. + +### Q: How to deploy to production? +**A:** This plan is for dev cluster only. Production deployments should continue using existing release pipeline with proper approvals and phased rollouts. + +--- + +## Status + +- [x] Research Prom agent pattern +- [x] Document findings +- [x] Create implementation plan +- [ ] Update pipeline with deployment stage +- [ ] Test deployment to dev cluster +- [ ] Validate with team +- [ ] Merge to main branch + +--- + +**Last Updated:** 2025-11-07 +**Author:** Implementation plan based on Prom agent analysis +**Status:** Ready for implementation diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index 04a273fe69..29727dcfba 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -1304,4 +1304,7 @@ testkube-operator: # ref: https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#node-affinity-multi-arch-arm # -- Tolerations to schedule a workload to nodes with any architecture type. Required for deployment to GKE cluster. tolerations: [] +<<<<<<< HEAD +======= +>>>>>>> efd34efac (add bebugging) diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index 164be7fe69..1df4bccf47 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -125,4 +125,4 @@ EOF # Explicitly fail the ADO task since at least one test failed exit 1 -fi \ No newline at end of file +fi From 0586eec790d6bd270622e07e204b5eada50999da Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 15:55:52 -0800 Subject: [PATCH 02/40] debug --- .pipelines/e2e-test/verify-pod-images.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index c7b4df2eb4..86ac0f3410 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -76,7 +76,12 @@ check_all_pods() { # Check each pod in this iteration for config in "${configs_ref[@]}"; do + echo " [DEBUG] Raw config string: '$config'" IFS=':' read -r pod_name expected_image container_name <<< "$config" + echo " [DEBUG] Parsed values:" + echo " pod_name='$pod_name'" + echo " expected_image='$expected_image'" + echo " container_name='$container_name'" # Skip if already marked as ready if [ "${pod_ready_status[$pod_name]}" = "true" ]; then @@ -86,7 +91,9 @@ check_all_pods() { # DEBUG: Try alternative methods to get the image # Method 1: Original jsonpath (what we've been using) + echo " [DEBUG] Attempting kubectl jsonpath query..." 
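+            # jsonpath filter [?(@.name=='$container_name')] selects the
+            # containers[] entry whose .name matches, then .image extracts
+            # that container's image reference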
current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "")
+            echo "    [DEBUG] Method 1 result: '$current_image'"

             # Method 2: If method 1 is empty, try getting first container image
             if [ -z "$current_image" ]; then

From e89108833dbeaf29859f38f2b32f7ffb5392e7a8 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 16:13:41 -0800
Subject: [PATCH 03/40] fix IFS splitting

---
 .pipelines/e2e-test/verify-pod-images.sh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh
index 86ac0f3410..97fdb67ca7 100644
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ b/.pipelines/e2e-test/verify-pod-images.sh
@@ -65,7 +65,7 @@ check_all_pods() {
         # Track ready status for each pod
         declare -A pod_ready_status
         for config in "${configs_ref[@]}"; do
-            pod_name=$(echo "$config" | cut -d: -f1)
+            pod_name=$(echo "$config" | cut -d'|' -f1)
             pod_ready_status["$pod_name"]=false
         done
@@ -77,7 +77,7 @@ check_all_pods() {
             # Check each pod in this iteration
             for config in "${configs_ref[@]}"; do
                 echo "    [DEBUG] Raw config string: '$config'"
-                IFS=':' read -r pod_name expected_image container_name <<< "$config"
+                IFS='|' read -r pod_name expected_image container_name <<< "$config"
                 echo "    [DEBUG] Parsed values:"
                 echo "      pod_name='$pod_name'"
                 echo "      expected_image='$expected_image'"
                 echo "      container_name='$container_name'"
@@ -177,7 +177,7 @@ check_all_pods() {
         echo ""
         echo "Failed pods:"
         for config in "${configs_ref[@]}"; do
-            IFS=':' read -r pod_name expected_image container_name <<< "$config"
+            IFS='|' read -r pod_name expected_image container_name <<< "$config"
             if [ "${pod_ready_status[$pod_name]}" != "true" ]; then
                 current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR")
                 pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown")
@@ -201,7 +201,7 @@ check_all_pods() {
         echo ""

         for config in "${configs_ref[@]}"; do
-            IFS=':' read -r pod_name expected_image container_name <<< "$config"
+            IFS='|' read -r pod_name expected_image container_name <<< "$config"

             # Use first container image as fallback
             current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR")
@@ -256,7 +256,8 @@ for pod_name in $pod_list; do
     fi

     # Add to configurations for parallel checking
-    pod_configs+=("$pod_name:$expected_image:$container_name")
+    # Use | as delimiter since colons appear in image tags (e.g., ciprod:3.1.31)
+    pod_configs+=("$pod_name|$expected_image|$container_name")
 done

From a2e61c995156c4e0e68dc273f33d36a9ffc7dd27 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 16:54:50 -0800
Subject: [PATCH 04/40] more parsing fixes

---
 .pipelines/e2e-test/verify-pod-images.sh | 31 +++++------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh
index 97fdb67ca7..db16ff7da9 100644
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ b/.pipelines/e2e-test/verify-pod-images.sh
@@ -76,9 +76,9 @@ check_all_pods() {

             # Check each pod in this iteration
             for config in "${configs_ref[@]}"; do
-                echo "    [DEBUG] Raw config string: '$config'"
+                echo "    Raw config string: '$config'"
                 IFS='|' read -r pod_name expected_image container_name <<< 
"$config" - echo " [DEBUG] Parsed values:" + echo " Parsed values:" echo " pod_name='$pod_name'" echo " expected_image='$expected_image'" echo " container_name='$container_name'" @@ -89,29 +89,10 @@ check_all_pods() { continue fi - # DEBUG: Try alternative methods to get the image - # Method 1: Original jsonpath (what we've been using) - echo " [DEBUG] Attempting kubectl jsonpath query..." current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") - echo " [DEBUG] Method 1 result: '$current_image'" - - # Method 2: If method 1 is empty, try getting first container image - if [ -z "$current_image" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "") - echo " [DEBUG] Method 1 (jsonpath filter) returned empty, trying method 2 (first container)" - echo " [DEBUG] Method 2 result: $current_image" - fi - - # Method 3: If still empty, try go-template - if [ -z "$current_image" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o go-template='{{range .spec.containers}}{{if eq .name "'"$container_name"'"}}{{.image}}{{end}}{{end}}' 2>/dev/null || echo "") - echo " [DEBUG] Method 2 also empty, trying method 3 (go-template)" - echo " [DEBUG] Method 3 result: $current_image" - fi pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - # Try similar methods for container ready container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "") if [ -z "$container_ready" ]; then container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") @@ -179,9 +160,9 @@ check_all_pods() { for config in "${configs_ref[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" if [ "${pod_ready_status[$pod_name]}" != "true" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") echo " ✗ $pod_name" echo " Expected image: $expected_image" @@ -203,8 +184,8 @@ check_all_pods() { for config in "${configs_ref[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - # Use first container image as fallback - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[0].image}" 2>/dev/null || echo "ERROR") + # Use correct container name from config + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") echo "Pod: $pod_name" From 
c0b44d53e72d114531a4378fb82b87b4d01d6fa2 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 18:02:58 -0800
Subject: [PATCH 05/40] fix image verification container name

---
 .pipelines/e2e-test/verify-pod-images.sh | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh
index db16ff7da9..b3be17596a 100644
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ b/.pipelines/e2e-test/verify-pod-images.sh
@@ -274,10 +274,6 @@ if [ "$MODE" = "pre-test" ]; then
         echo ""
         echo "Final pod status:"
         kubectl get pods -n kube-system | grep ama-logs
-        echo ""
-        echo "Image verification:"
-        kubectl get pods -n kube-system -l component=ama-logs-agent -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].image}{"\t"}{.status.phase}{"\t"}{.status.containerStatuses[0].ready}{"\n"}{end}' | column -t 2>/dev/null || true
-
         echo ""
         echo "================================"
         echo "Container Start Time Capture"
         echo "================================"
@@ -349,9 +345,6 @@ else
         echo ""
         echo "Final pod status:"
         kubectl get pods -n kube-system | grep ama-logs
-        echo ""
-        echo "Image summary:"
-        kubectl get pods -n kube-system -l component=ama-logs-agent -o custom-columns=NAME:.metadata.name,IMAGE:.spec.containers[0].image,STATUS:.status.phase,READY:.status.containerStatuses[0].ready 2>/dev/null || true
         exit 0
     else
         echo "✗ FAILURE: Some pods changed images during test execution!"

From 039301f5807792db164aef3b8fb465eb217f9155 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Thu, 4 Dec 2025 18:14:16 -0800
Subject: [PATCH 06/40] use short wait time temporarily

---
 .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml
index 69f557c8b4..82e4718237 100644
--- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml
+++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml
@@ -203,7 +203,7 @@ jobs:
               echo "Waiting 20 minutes to allow logs to be ingested..."
               echo "This ensures queries will find logs from the newly deployed containers."
echo "" - wait_time=1200 + wait_time=60 #TODO: change back to 1200 (20 minutes) after testing interval=60 elapsed=0 From 33f8b32c116c14d836fe2a8886a75ba9c1e6f55b Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 20:05:38 -0800 Subject: [PATCH 07/40] simplify --- .pipelines/e2e-test/verify-pod-images.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index b3be17596a..8485f85eb3 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -90,13 +90,8 @@ check_all_pods() { fi current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") - pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "") - if [ -z "$container_ready" ]; then - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[0].ready}" 2>/dev/null || echo "false") - fi + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then From 4fb699db1ab69e15a75156b8a54209a27c9fdfa0 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 20:18:49 -0800 Subject: [PATCH 08/40] refactor --- .pipelines/e2e-test/verify-pod-images.sh | 32 ++++++++++-------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index 8485f85eb3..0b6de6995f 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -47,7 +47,7 @@ echo "" check_all_pods() { local -n configs_ref=$1 # Use different name to avoid circular reference local max_retries=${2:-0} # Default to 0 (instant check, no retry) - local check_interval=15 # Wait 15 seconds between retries + local check_interval=60 # Wait 60 seconds between retries if [ $max_retries -gt 0 ]; then # Wait mode (pre-test): Monitor pods with retries @@ -70,7 +70,7 @@ check_all_pods() { done while [ $attempt -le $max_retries ]; do - local all_ready=true + local has_not_ready_pod=false local ready_count=0 local total_count=${#configs_ref[@]} @@ -99,14 +99,10 @@ check_all_pods() { ((ready_count++)) echo " ✓ $pod_name - Ready" else - all_ready=false - - # Show status for pods that aren't ready yet - if [ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ]; then # Log every 60 seconds - echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" - if [[ "$current_image" != "$expected_image" ]]; then - echo " Image mismatch: expected $expected_image, got $current_image" - fi + has_not_ready_pod=true + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + if [[ "$current_image" != "$expected_image" ]]; then + echo " Image mismatch: expected $expected_image, got $current_image" fi fi done @@ -120,15 +116,13 @@ check_all_pods() { local minutes_remaining=$((remaining_seconds / 60)) local seconds_remaining=$((remaining_seconds % 60)) - if 
[ $((attempt % 4)) -eq 1 ] || [ $attempt -eq 1 ] || [ "$all_ready" = true ]; then - echo "" - echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" - echo "Progress: $ready_count/$total_count pods ready" - echo "" - fi + echo "" + echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" + echo "Progress: $ready_count/$total_count pods ready" + echo "" # Exit early if all pods are ready - if [ "$all_ready" = true ]; then + if [ "$has_not_ready_pod" = false ]; then echo "================================" echo "✓ SUCCESS: All pods are ready!" echo "================================" @@ -241,8 +235,8 @@ echo "" # Use different check based on mode if [ "$MODE" = "pre-test" ]; then - # Pre-test: Wait for all pods to be ready (60 retries × 15s = 15 minutes max) - if ! check_all_pods pod_configs 60; then + # Pre-test: Wait for all pods to be ready (15 retries × 60s = 15 minutes max) + if ! check_all_pods pod_configs 15; then # Function already reports which pods failed failed_pods=true else From 354b53c57613fda21d36fa1f32df5277904d7f35 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Thu, 4 Dec 2025 23:04:06 -0800 Subject: [PATCH 09/40] post check improve --- .pipelines/e2e-test/verify-pod-images.sh | 30 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index 0b6de6995f..39165ef700 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -173,21 +173,41 @@ check_all_pods() { for config in "${configs_ref[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - # Use correct container name from config + # Get pod details current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") echo "Pod: $pod_name" echo " Container: $container_name" echo " Expected image: $expected_image" echo " Current image: $current_image" echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + + # Check for any issues + local has_issue=false if [[ "$current_image" != "$expected_image" ]]; then - echo " ✗ IMAGE MISMATCH DETECTED!" - mismatches+=("$pod_name: expected '$expected_image' but found '$current_image'") - else - echo " ✓ Image is correct" + echo " ✗ IMAGE MISMATCH!" + mismatches+=("$pod_name: expected image '$expected_image' but found '$current_image'") + has_issue=true + fi + + if [[ "$pod_status" != "Running" ]]; then + echo " ✗ POD NOT RUNNING!" + mismatches+=("$pod_name: pod status is '$pod_status' (expected 'Running')") + has_issue=true + fi + + if [[ "$container_ready" != "true" ]]; then + echo " ✗ CONTAINER NOT READY!" 
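The post-test check now gates on three independent signals per pod: the spec image, the pod phase, and the container's ready flag. Each currently costs its own kubectl round-trip; below is a minimal sketch of the same predicate served from a single `kubectl get -o json` call, assuming jq is available on the agent image (the pod, container, and image names are placeholders):

#!/bin/bash
# Hypothetical consolidation of the three per-pod queries into one API call.
pod_healthy() {
  local pod="$1" container="$2" expected_image="$3" json
  json=$(kubectl get pod "$pod" -n kube-system -o json 2>/dev/null) || return 1
  local image phase ready
  image=$(jq -r --arg c "$container" '.spec.containers[] | select(.name==$c) | .image' <<<"$json")
  phase=$(jq -r '.status.phase' <<<"$json")
  ready=$(jq -r --arg c "$container" '.status.containerStatuses[] | select(.name==$c) | .ready' <<<"$json")
  [[ "$image" == "$expected_image" && "$phase" == "Running" && "$ready" == "true" ]]
}

pod_healthy ama-logs-k4n2x ama-logs "mcr.example.com/ama-logs:testtag" && echo "healthy"

Reading the pod once also means the three fields come from the same object snapshot rather than three racing reads.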
+ mismatches+=("$pod_name: container '$container_name' is not ready") + has_issue=true + fi + + if [[ "$has_issue" = false ]]; then + echo " ✓ All checks passed" fi echo "" done From 744de21bf53e4f899beeccb3928efa59818abf71 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 10:30:33 -0800 Subject: [PATCH 10/40] refactor pod verify script --- ...eploy-and-test-ci-image-in-aks-cluster.yml | 17 +- .pipelines/e2e-test/post-test-verify-pods.sh | 123 ++++++++++ .pipelines/e2e-test/pre-test-verify-pods.sh | 217 ++++++++++++++++++ .pipelines/e2e-test/util.sh | 46 ++++ .pipelines/e2e-test/verify-pod-images.sh | 132 ++++++----- 5 files changed, 470 insertions(+), 65 deletions(-) create mode 100644 .pipelines/e2e-test/post-test-verify-pods.sh create mode 100644 .pipelines/e2e-test/pre-test-verify-pods.sh create mode 100644 .pipelines/e2e-test/util.sh diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index 82e4718237..ce9ae339f4 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -161,11 +161,11 @@ jobs: echo "Current ama-logs pods:" kubectl get pods -n kube-system | grep ama-logs - # verify ci agent gets the new images - # output container start time for log analytics filtering + # Pre-test verification: Wait for pods to be ready with new images + # Outputs container start time for Log Analytics query filtering - task: Bash@3 name: VerifyPods - displayName: 'Wait for pods to be ready with new images' + displayName: 'Pre-Test: Wait for pods to be ready with new images' env: LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} @@ -174,8 +174,8 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh pre-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Export container start time for use in tests if [ -f /tmp/container-deployment-time.env ]; then @@ -236,8 +236,9 @@ jobs: workingDirectory: $(Build.SourcesDirectory)/test/testkube/ displayName: 'Install Testkube and run E2E tests' + # Post-test verification: Check pods are still healthy after test execution - task: Bash@3 - displayName: 'Verify images remained stable after tests' + displayName: 'Post-Test: Verify pods remained stable after tests' condition: always() env: LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} @@ -247,8 +248,8 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-pod-images.sh post-test "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Log deployment completion - bash: | diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh 
b/.pipelines/e2e-test/post-test-verify-pods.sh new file mode 100644 index 0000000000..61170d4321 --- /dev/null +++ b/.pipelines/e2e-test/post-test-verify-pods.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# Post-Test Pod Verification +# Performs a quick health check to ensure pods maintained correct images and are still healthy +# This script is used AFTER running E2E tests to detect any pod restarts or issues during testing + +set -e + +# Parse command line arguments +LINUX_IMAGE_TAG="${1}" +WINDOWS_IMAGE_TAG="${2}" +LINUX_MCR_REPO="${3}" +WINDOWS_MCR_REPO="${4}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +# Source shared functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/pod-verification-common.sh" + +echo "================================" +echo "Post-Test Pod Verification" +echo "================================" +echo "Verifying pods maintained correct images and are still healthy..." +echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +# Perform instant health check on all pods +echo "Performing instant health check on all pods..." +echo "" + +declare -a issues +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get pod details + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + echo "Pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + + # Check for any issues + has_issue=false + + if [[ "$current_image" != "$expected_image" ]]; then + echo " ✗ IMAGE MISMATCH!" + issues+=("$pod_name: expected image '$expected_image' but found '$current_image'") + has_issue=true + fi + + if [[ "$pod_status" != "Running" ]]; then + echo " ✗ POD NOT RUNNING!" + issues+=("$pod_name: pod status is '$pod_status' (expected 'Running')") + has_issue=true + fi + + if [[ "$container_ready" != "true" ]]; then + echo " ✗ CONTAINER NOT READY!" + issues+=("$pod_name: container '$container_name' is not ready") + has_issue=true + fi + + if [[ "$has_issue" = false ]]; then + echo " ✓ All checks passed" + fi + echo "" +done + +# Report results +echo "================================" +echo "Post-Test Verification Summary" +echo "================================" + +if [ ${#issues[@]} -eq 0 ]; then + echo "✓ SUCCESS: All pods maintained the correct images and are healthy!" 
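The failure path below collects one human-readable entry per problem and defers reporting to the summary. The accumulation pattern in isolation, with invented entries:

#!/bin/bash
# Standalone sketch of the issues-array pattern used by this script.
declare -a issues=()
issues+=("ama-logs-k4n2x: expected image 'repo:new' but found 'repo:old'")
issues+=("ama-logs-windows-h7k2p: container 'ama-logs-windows' is not ready")

if [ ${#issues[@]} -eq 0 ]; then
    echo "✓ no issues"
else
    # printf repeats its format string once per remaining argument,
    # so one call prints the whole list.
    printf ' - %s\n' "${issues[@]}"
fi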
+ echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 0 +else + echo "✗ FAILURE: Some pods have issues after test execution!" + echo "" + echo "Issues detected:" + printf ' - %s\n' "${issues[@]}" + echo "" + echo "This indicates the pods may have been restarted or updated during testing." + echo "This could cause test instability or false results." + echo "" + echo "Current pod status:" + kubectl get pods -n kube-system | grep ama-logs + echo "" + echo "Detailed pod information:" + for issue in "${issues[@]}"; do + pod=$(echo "$issue" | cut -d: -f1) + echo "" + echo "--- Details for $pod ---" + kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30 + done + exit 1 +fi diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh new file mode 100644 index 0000000000..89a26158c7 --- /dev/null +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# Pre-Test Pod Verification +# Waits for all ama-logs pods to be running with the correct images and ready +# This script is used BEFORE running E2E tests to ensure the new agent version is deployed + +set -e + +# Parse command line arguments +LINUX_IMAGE_TAG="${1}" +WINDOWS_IMAGE_TAG="${2}" +LINUX_MCR_REPO="${3}" +WINDOWS_MCR_REPO="${4}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +# Configuration +MAX_RETRIES=15 +CHECK_INTERVAL=60 # seconds +MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60)) + +# Source shared functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/util.sh" + +echo "================================" +echo "Pre-Test Pod Verification" +echo "================================" +echo "Waiting for pods to be running with new images and ready..." 
+echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +# Wait for all pods to be ready +echo "================================" +echo "Waiting for all pods to be ready" +echo "================================" +echo "Total pods to check: ${#pod_configs[@]}" +echo "Maximum retries: $MAX_RETRIES" +echo "Check interval: ${CHECK_INTERVAL}s" +echo "Maximum wait time: $MAX_WAIT_MINUTES minutes" +echo "" + +# Track ready status for each pod +declare -A pod_ready_status +for config in "${pod_configs[@]}"; do + pod_name=$(echo "$config" | cut -d'|' -f1) + pod_ready_status["$pod_name"]=false +done + +attempt=1 +while [ $attempt -le $MAX_RETRIES ]; do + has_not_ready_pod=false + ready_count=0 + total_count=${#pod_configs[@]} + + # Check each pod + for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Skip if already marked as ready + if [ "${pod_ready_status[$pod_name]}" = "true" ]; then + ((ready_count++)) + continue + fi + + # Get pod details + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + # Check if pod is ready + if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then + pod_ready_status["$pod_name"]=true + ((ready_count++)) + echo " ✓ $pod_name - Ready" + else + has_not_ready_pod=true + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + if [[ "$current_image" != "$expected_image" ]]; then + echo " Image mismatch: expected $expected_image, got $current_image" + fi + fi + done + + # Show progress summary + elapsed_seconds=$(((attempt - 1) * CHECK_INTERVAL)) + minutes_elapsed=$((elapsed_seconds / 60)) + seconds_elapsed=$((elapsed_seconds % 60)) + remaining_retries=$((MAX_RETRIES - attempt)) + remaining_seconds=$((remaining_retries * CHECK_INTERVAL)) + minutes_remaining=$((remaining_seconds / 60)) + seconds_remaining=$((remaining_seconds % 60)) + + echo "" + echo "Attempt $attempt/$MAX_RETRIES (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" + echo "Progress: $ready_count/$total_count pods ready" + echo "" + + # Exit early if all pods are ready + if [ "$has_not_ready_pod" = false ]; then + echo "================================" + echo "✓ SUCCESS: All pods are ready!" 
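The early exit above works because readiness is remembered across attempts: pod_ready_status is an associative array (bash 4+) keyed by pod name, so a pod that passed once is never re-queried. The same mechanic reduced to a runnable toy, where ready_after stands in for the real kubectl checks and all names are invented:

#!/bin/bash
# Toy version of the retry loop's ready-tracking.
declare -a pods=(pod-a pod-b pod-c)
declare -A ready_after=([pod-a]=1 [pod-b]=2 [pod-c]=3)  # attempt when each becomes ready
declare -A is_ready

for attempt in 1 2 3; do
  pending=0
  for p in "${pods[@]}"; do
    [ "${is_ready[$p]:-}" = "true" ] && continue        # skip pods already done
    if [ "$attempt" -ge "${ready_after[$p]}" ]; then
      is_ready[$p]=true
    else
      pending=$((pending + 1))
    fi
  done
  echo "attempt $attempt: $(( ${#pods[@]} - pending ))/${#pods[@]} ready"
  [ "$pending" -eq 0 ] && break
done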
+ echo "================================" + echo "Total attempts: $attempt" + echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" + echo "" + + # Capture container start times using local function + capture_container_start_times + + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 0 + fi + + # Sleep before next retry (except after last attempt) + if [ $attempt -lt $MAX_RETRIES ]; then + sleep $CHECK_INTERVAL + fi + + ((attempt++)) +done + +# Max retries reached - report failed pods +echo "================================" +echo "✗ TIMEOUT: Not all pods became ready after $MAX_RETRIES attempts" +echo "================================" +echo "" +echo "Failed pods:" +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + if [ "${pod_ready_status[$pod_name]}" != "true" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + echo " ✗ $pod_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + fi +done +echo "" +echo "Final pod status:" +kubectl get pods -n kube-system | grep ama-logs +exit 1 + + + + +# Function to capture container start times +capture_container_start_times() { + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
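The "latest" selection that follows leans on a property of the timestamps kubectl returns: RFC 3339 UTC strings have fixed field widths, so lexicographic order equals chronological order and a plain string comparison suffices. A self-contained check (sample values invented):

#!/bin/bash
# String max over ISO 8601 UTC timestamps.
times=("2025-12-05T18:02:11Z" "2025-12-05T17:59:40Z" "2025-12-05T18:14:03Z")
latest=""
for t in "${times[@]}"; do
  if [ -z "$latest" ] || [[ "$t" > "$latest" ]]; then
    latest="$t"
  fi
done
echo "$latest"   # -> 2025-12-05T18:14:03Z

Note this only holds while every timestamp shares the same zone and format; mixing offsets would require real date parsing.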
+ + local latest_start_time="" + + for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get container start time + local start_time + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -z "$start_time" ]; then + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") + fi + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + exit 1 + fi +} diff --git a/.pipelines/e2e-test/util.sh b/.pipelines/e2e-test/util.sh new file mode 100644 index 0000000000..b157451c54 --- /dev/null +++ b/.pipelines/e2e-test/util.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Shared functions for pod verification scripts +# This file should be sourced by pre-test and post-test verification scripts + +# Function to build pod configurations +# Parameters: +# $1 - LINUX_IMAGE (full image path with tag) +# $2 - WINDOWS_IMAGE (full image path with tag) +# Returns: +# pod_configs array populated with "pod_name|expected_image|container_name" +build_pod_configs() { + local LINUX_IMAGE="$1" + local WINDOWS_IMAGE="$2" + + echo "Getting list of ama-logs pods..." 
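Each entry build_pod_configs emits packs three fields into one pipe-delimited string, and every consumer unpacks it with a per-command IFS. The round trip in miniature (values are placeholders):

#!/bin/bash
config="ama-logs-k4n2x|mcr.example.com/ama-logs:3.1.29|ama-logs"

# Setting IFS only on the read keeps the split local to this one command.
IFS='|' read -r pod_name expected_image container_name <<< "$config"
echo "pod=$pod_name container=$container_name image=$expected_image"

The separator just has to be a character that can never occur in the fields; '|' is safe because Kubernetes names and image references cannot contain it.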
+ local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + + # Clear the global pod_configs array + pod_configs=() + + for pod_name in $pod_list; do + local expected_image + local container_name + + # Determine expected image and container name based on pod type + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + expected_image="$WINDOWS_IMAGE" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + expected_image="$LINUX_IMAGE" + container_name="ama-logs" + else + echo "✗ ERROR: Unknown pod pattern: $pod_name" + echo "Expected pod names to match one of:" + echo " - ama-logs-windows-* (Windows pods)" + echo " - ama-logs-rs-* (Linux ReplicaSet pods)" + echo " - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)" + exit 1 + fi + + pod_configs+=("$pod_name|$expected_image|$container_name") + done + + echo "Found ${#pod_configs[@]} pods to verify" + echo "" +} diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh index 39165ef700..f82e1a2e3c 100644 --- a/.pipelines/e2e-test/verify-pod-images.sh +++ b/.pipelines/e2e-test/verify-pod-images.sh @@ -11,6 +11,14 @@ WINDOWS_IMAGE_TAG="${3}" LINUX_MCR_REPO="${4}" WINDOWS_MCR_REPO="${5}" +# Validate MODE parameter +if [[ "$MODE" != "pre-test" && "$MODE" != "post-test" ]]; then + echo "Error: Invalid mode '$MODE'" + echo "MODE must be either 'pre-test' or 'post-test'" + echo "Usage: $0 " + exit 1 +fi + if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then echo "Error: Missing required parameters" echo "Usage: $0 " @@ -241,8 +249,12 @@ for pod_name in $pod_list; do expected_image="$LINUX_IMAGE" container_name="ama-logs" else - echo "⚠ Unknown pod pattern: $pod_name - skipping verification" - continue + echo "✗ ERROR: Unknown pod pattern: $pod_name" + echo "Expected pod names to match one of:" + echo " - ama-logs-windows-* (Windows pods)" + echo " - ama-logs-rs-* (Linux ReplicaSet pods)" + echo " - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)" + exit 1 fi # Add to configurations for parallel checking @@ -257,7 +269,6 @@ echo "" if [ "$MODE" = "pre-test" ]; then # Pre-test: Wait for all pods to be ready (15 retries × 60s = 15 minutes max) if ! check_all_pods pod_configs 15; then - # Function already reports which pods failed failed_pods=true else failed_pods=false @@ -284,60 +295,7 @@ if [ "$MODE" = "pre-test" ]; then echo "Final pod status:" kubectl get pods -n kube-system | grep ama-logs echo "" - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." 
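The pod-name classification util.sh applies above is worth seeing in isolation: the unanchored `^ama-logs-windows` matches any Windows pod prefix, while Linux pods are either ReplicaSet pods (`ama-logs-rs-*`) or DaemonSet pods, whose generated suffix is exactly five lowercase alphanumerics. A sketch with invented pod names:

#!/bin/bash
classify() {
  if [[ "$1" =~ ^ama-logs-windows ]]; then
    echo windows
  elif [[ "$1" =~ ^ama-logs-rs ]] || [[ "$1" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then
    echo linux
  else
    echo unknown
  fi
}
classify ama-logs-windows-h7k2p        # -> windows
classify ama-logs-rs-6d9f8b7c4-x2v1q   # -> linux (ReplicaSet pod)
classify ama-logs-k4n2x                # -> linux (DaemonSet pod)
classify coredns-76f75df574-abcde      # -> unknown (the real helper treats this as fatal)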
- - # Get all container start times and find the LATEST one - latest_start_time="" - - pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') - for pod_name in $pod_list; do - # Get container name based on pod type - if [[ "$pod_name" =~ ^ama-logs-windows ]]; then - container_name="ama-logs-windows" - elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then - container_name="ama-logs" - else - continue - fi - - # Get container start time - try first container if filter doesn't work - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -z "$start_time" ]; then - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") - fi - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - echo "This is required for Log Analytics query filtering" - exit 1 - fi + capture_container_start_times exit 0 else @@ -377,3 +335,63 @@ else exit 1 fi fi + +# Function to capture container start times for Log Analytics query filtering +capture_container_start_times() { + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
+ + # Get all container start times and find the LATEST one + local latest_start_time="" + + local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') + for pod_name in $pod_list; do + # Get container name based on pod type + local container_name + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + container_name="ama-logs" + else + continue + fi + + # Get container start time - try first container if filter doesn't work + local start_time + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -z "$start_time" ]; then + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") + fi + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + echo "This is required for Log Analytics query filtering" + exit 1 + fi +} From c0d8d7293f34758c2374f3c21c4d33d12b200e04 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 10:47:00 -0800 Subject: [PATCH 11/40] move func --- .pipelines/e2e-test/pre-test-verify-pods.sh | 96 ++++++++++----------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 89a26158c7..48789ae056 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -29,6 +29,50 @@ MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60)) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/util.sh" +# Function to capture container start times +capture_container_start_times() { + echo "================================" + echo "Container Start Time Capture" + echo "================================" + echo "Capturing LATEST container start time for Log Analytics queries..." 
+ + local latest_start_time="" + + for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get container start time for the specific container + local start_time + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -n "$start_time" ]; then + echo " Pod $pod_name container started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi + done + + if [ -n "$latest_start_time" ]; then + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + else + echo "✗ ERROR: Could not determine container start times" + exit 1 + fi +} + echo "================================" echo "Pre-Test Pod Verification" echo "================================" @@ -163,55 +207,3 @@ echo "" echo "Final pod status:" kubectl get pods -n kube-system | grep ama-logs exit 1 - - - - -# Function to capture container start times -capture_container_start_times() { - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." 
- - local latest_start_time="" - - for config in "${pod_configs[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - - # Get container start time - local start_time - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -z "$start_time" ]; then - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") - fi - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - exit 1 - fi -} From ffb397fdca4f8903e6cba32b3fa01962f6eafa11 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 10:47:56 -0800 Subject: [PATCH 12/40] fix --- .pipelines/e2e-test/post-test-verify-pods.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh b/.pipelines/e2e-test/post-test-verify-pods.sh index 61170d4321..1f20cd3c56 100644 --- a/.pipelines/e2e-test/post-test-verify-pods.sh +++ b/.pipelines/e2e-test/post-test-verify-pods.sh @@ -22,7 +22,7 @@ WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" # Source shared functions SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/pod-verification-common.sh" +source "$SCRIPT_DIR/util.sh" echo "================================" echo "Post-Test Pod Verification" From 89950523d576f4e19d4a56b1c46ad13ca9576028 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:12:34 -0800 Subject: [PATCH 13/40] debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 48789ae056..28b7ae5496 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -91,6 +91,13 @@ echo "" declare -a pod_configs build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" +# Validate array was populated +if [ ${#pod_configs[@]} -eq 0 ]; then + echo "✗ ERROR: No pods found to verify!" + echo "This likely means no ama-logs pods exist in the kube-system namespace." 
+ exit 1 +fi + # Wait for all pods to be ready echo "================================" echo "Waiting for all pods to be ready" @@ -105,9 +112,11 @@ echo "" declare -A pod_ready_status for config in "${pod_configs[@]}"; do pod_name=$(echo "$config" | cut -d'|' -f1) + echo "DEBUG: Initializing pod $pod_name to not ready" pod_ready_status["$pod_name"]=false done +echo "DEBUG: All pods initialized, starting retry loop" attempt=1 while [ $attempt -le $MAX_RETRIES ]; do has_not_ready_pod=false From 71e9d642fb8a22283f300362ec481541e95775d0 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:23:05 -0800 Subject: [PATCH 14/40] add debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 28b7ae5496..d243cf9128 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -41,11 +41,15 @@ capture_container_start_times() { for config in "${pod_configs[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" + echo "DEBUG: Querying start time for pod $pod_name, container $container_name" + # Get container start time for the specific container local start_time start_time=$(kubectl get pod "$pod_name" -n kube-system \ -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + echo "DEBUG: Got start_time='$start_time'" + if [ -n "$start_time" ]; then echo " Pod $pod_name container started at: $start_time" @@ -118,7 +122,10 @@ done echo "DEBUG: All pods initialized, starting retry loop" attempt=1 +echo "DEBUG: attempt=$attempt, MAX_RETRIES=$MAX_RETRIES" +echo "DEBUG: Condition check: [ $attempt -le $MAX_RETRIES ] = $([ $attempt -le $MAX_RETRIES ] && echo true || echo false)" while [ $attempt -le $MAX_RETRIES ]; do + echo "DEBUG: Inside while loop, attempt=$attempt" has_not_ready_pod=false ready_count=0 total_count=${#pod_configs[@]} From 8b4c392bafbc7a0b871a7ba3af3246402f196b9c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:33:00 -0800 Subject: [PATCH 15/40] add debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index d243cf9128..a15f79194d 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -129,10 +129,13 @@ while [ $attempt -le $MAX_RETRIES ]; do has_not_ready_pod=false ready_count=0 total_count=${#pod_configs[@]} + echo "DEBUG: Initialized loop variables, checking $total_count pods" # Check each pod for config in "${pod_configs[@]}"; do + echo "DEBUG: Processing config: $config" IFS='|' read -r pod_name expected_image container_name <<< "$config" + echo "DEBUG: Parsed - pod=$pod_name, container=$container_name" # Skip if already marked as ready if [ "${pod_ready_status[$pod_name]}" = "true" ]; then From 231ff81db14932462511c8d8a6b28254126b3570 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 11:52:08 -0800 Subject: [PATCH 16/40] more debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index a15f79194d..504f36cb4b 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ 
b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -126,8 +126,11 @@ echo "DEBUG: attempt=$attempt, MAX_RETRIES=$MAX_RETRIES" echo "DEBUG: Condition check: [ $attempt -le $MAX_RETRIES ] = $([ $attempt -le $MAX_RETRIES ] && echo true || echo false)" while [ $attempt -le $MAX_RETRIES ]; do echo "DEBUG: Inside while loop, attempt=$attempt" + echo "DEBUG: About to set has_not_ready_pod" has_not_ready_pod=false + echo "DEBUG: About to set ready_count" ready_count=0 + echo "DEBUG: About to set total_count" total_count=${#pod_configs[@]} echo "DEBUG: Initialized loop variables, checking $total_count pods" From 20c9267c1dc09c894f3605ccc8d7542fb6efab62 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 12:01:06 -0800 Subject: [PATCH 17/40] more debugging --- .pipelines/e2e-test/pre-test-verify-pods.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 504f36cb4b..ecdf04843a 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -141,15 +141,20 @@ while [ $attempt -le $MAX_RETRIES ]; do echo "DEBUG: Parsed - pod=$pod_name, container=$container_name" # Skip if already marked as ready + echo "DEBUG: Checking if $pod_name already marked ready" if [ "${pod_ready_status[$pod_name]}" = "true" ]; then ((ready_count++)) continue fi + echo "DEBUG: Getting pod details for $pod_name" # Get pod details current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") + echo "DEBUG: current_image='$current_image'" pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + echo "DEBUG: pod_status='$pod_status'" container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + echo "DEBUG: container_ready='$container_ready'" # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then From 623099fb3f2c6bf87723baf86d54d4989b767616 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 12:48:31 -0800 Subject: [PATCH 18/40] more debug --- .pipelines/e2e-test/pre-test-verify-pods.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index ecdf04843a..2fcea46e7d 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -143,7 +143,7 @@ while [ $attempt -le $MAX_RETRIES ]; do # Skip if already marked as ready echo "DEBUG: Checking if $pod_name already marked ready" if [ "${pod_ready_status[$pod_name]}" = "true" ]; then - ((ready_count++)) + ready_count=$((ready_count + 1)) continue fi @@ -158,8 +158,9 @@ while [ $attempt -le $MAX_RETRIES ]; do # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then + echo "DEBUG: Marking $pod_name as ready" pod_ready_status["$pod_name"]=true - ((ready_count++)) + ready_count=$((ready_count + 1)) echo " ✓ $pod_name - Ready" else has_not_ready_pod=true From b397fc0c6ef154dcf3e4dd892ed7707ee6201ac5 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 13:23:39 -0800 Subject: [PATCH 19/40] refactor.. 
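The substantive fix buried in the previous patch is the switch from ((ready_count++)) to ready_count=$((ready_count + 1)), a classic set -e trap that fits the symptom the DEBUG lines were chasing: post-increment evaluates to the old value, and an arithmetic command whose value is 0 exits with status 1, so the very first ((ready_count++)) with ready_count=0 silently aborts the script. A minimal reproduction:

#!/bin/bash
set -e
count=0
# ((count++))        # would abort here: expression value 0 => exit status 1
count=$((count + 1)) # safe: an assignment always exits 0
((++count)) || true  # also safe: pre-increment yields 1; '|| true' masks a 0 value
echo "count=$count"  # -> count=2

With the increments rewritten as assignments, the retry loop survives its first not-yet-ready pass.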
--- .pipelines/e2e-test/post-test-verify-pods.sh | 4 ++-- .pipelines/e2e-test/pre-test-verify-pods.sh | 25 ++++---------------- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh b/.pipelines/e2e-test/post-test-verify-pods.sh index 1f20cd3c56..d3e5cf04a4 100644 --- a/.pipelines/e2e-test/post-test-verify-pods.sh +++ b/.pipelines/e2e-test/post-test-verify-pods.sh @@ -55,7 +55,7 @@ for config in "${pod_configs[@]}"; do pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - echo "Pod: $pod_name" + echo "Check pod: $pod_name" echo " Container: $container_name" echo " Expected image: $expected_image" echo " Current image: $current_image" @@ -84,7 +84,7 @@ for config in "${pod_configs[@]}"; do fi if [[ "$has_issue" = false ]]; then - echo " ✓ All checks passed" + echo " ✓ Pod: $pod_name passed checks" fi echo "" done diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 2fcea46e7d..816a50a498 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -41,15 +41,11 @@ capture_container_start_times() { for config in "${pod_configs[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - echo "DEBUG: Querying start time for pod $pod_name, container $container_name" - # Get container start time for the specific container local start_time start_time=$(kubectl get pod "$pod_name" -n kube-system \ -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - echo "DEBUG: Got start_time='$start_time'" - if [ -n "$start_time" ]; then echo " Pod $pod_name container started at: $start_time" @@ -116,49 +112,36 @@ echo "" declare -A pod_ready_status for config in "${pod_configs[@]}"; do pod_name=$(echo "$config" | cut -d'|' -f1) - echo "DEBUG: Initializing pod $pod_name to not ready" pod_ready_status["$pod_name"]=false done -echo "DEBUG: All pods initialized, starting retry loop" attempt=1 -echo "DEBUG: attempt=$attempt, MAX_RETRIES=$MAX_RETRIES" -echo "DEBUG: Condition check: [ $attempt -le $MAX_RETRIES ] = $([ $attempt -le $MAX_RETRIES ] && echo true || echo false)" while [ $attempt -le $MAX_RETRIES ]; do - echo "DEBUG: Inside while loop, attempt=$attempt" - echo "DEBUG: About to set has_not_ready_pod" has_not_ready_pod=false - echo "DEBUG: About to set ready_count" ready_count=0 - echo "DEBUG: About to set total_count" total_count=${#pod_configs[@]} - echo "DEBUG: Initialized loop variables, checking $total_count pods" # Check each pod for config in "${pod_configs[@]}"; do - echo "DEBUG: Processing config: $config" IFS='|' read -r pod_name expected_image container_name <<< "$config" - echo "DEBUG: Parsed - pod=$pod_name, container=$container_name" + echo " Checking pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" # Skip if already marked as ready - echo "DEBUG: Checking if $pod_name already marked ready" if [ "${pod_ready_status[$pod_name]}" = "true" ]; then + echo " Pod: $pod_name has expected image ready. Skipping check." 
ready_count=$((ready_count + 1)) continue fi - echo "DEBUG: Getting pod details for $pod_name" # Get pod details current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "") - echo "DEBUG: current_image='$current_image'" pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - echo "DEBUG: pod_status='$pod_status'" container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - echo "DEBUG: container_ready='$container_ready'" # Check if pod is ready if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then - echo "DEBUG: Marking $pod_name as ready" pod_ready_status["$pod_name"]=true ready_count=$((ready_count + 1)) echo " ✓ $pod_name - Ready" From 6f0f06374be9bb20bcaef77958d99d4338aa82e4 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 13:43:02 -0800 Subject: [PATCH 20/40] add comments --- .pipelines/e2e-test/pre-test-verify-pods.sh | 23 +++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/pre-test-verify-pods.sh index 816a50a498..14d8caa7ec 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/pre-test-verify-pods.sh @@ -124,13 +124,17 @@ while [ $attempt -le $MAX_RETRIES ]; do # Check each pod for config in "${pod_configs[@]}"; do IFS='|' read -r pod_name expected_image container_name <<< "$config" - echo " Checking pod: $pod_name" + echo "" + echo "" + echo " Start checking pod: $pod_name" echo " Container: $container_name" echo " Expected image: $expected_image" - + # Skip if already marked as ready if [ "${pod_ready_status[$pod_name]}" = "true" ]; then - echo " Pod: $pod_name has expected image ready. Skipping check." + echo " Finished checking pod: $pod_name" + echo " Pod: $pod_name has expected image ready. Skipping check." 
+ echo " ✓ $pod_name - Ready" ready_count=$((ready_count + 1)) continue fi @@ -144,13 +148,20 @@ while [ $attempt -le $MAX_RETRIES ]; do if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then pod_ready_status["$pod_name"]=true ready_count=$((ready_count + 1)) - echo " ✓ $pod_name - Ready" + echo " Finished checking pod: $pod_name" + echo " Image: $current_image" + echo " Expected image: $expected_image" + echo " Status: $pod_status" + echo " Container ready: $container_ready" + echo " ✓ $pod_name - Ready" else has_not_ready_pod=true - echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" + echo " Finished checking pod: $pod_name" + echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" if [[ "$current_image" != "$expected_image" ]]; then - echo " Image mismatch: expected $expected_image, got $current_image" + echo " Image mismatch: expected $expected_image, got $current_image" fi + echo " x $pod_name - NOT Ready" fi done From f2f8839152d9be53db1011ecca539d89424bd70a Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 14:19:49 -0800 Subject: [PATCH 21/40] move container start time capture --- ...eploy-and-test-ci-image-in-aks-cluster.yml | 31 +- .../e2e-test/capture-container-start-time.sh | 101 +++++ ...pods.sh => verify-ci-images-after-test.sh} | 0 ...ods.sh => verify-ci-images-before-test.sh} | 49 --- .pipelines/e2e-test/verify-pod-images.sh | 397 ------------------ 5 files changed, 123 insertions(+), 455 deletions(-) create mode 100644 .pipelines/e2e-test/capture-container-start-time.sh rename .pipelines/e2e-test/{post-test-verify-pods.sh => verify-ci-images-after-test.sh} (100%) rename .pipelines/e2e-test/{pre-test-verify-pods.sh => verify-ci-images-before-test.sh} (78%) delete mode 100644 .pipelines/e2e-test/verify-pod-images.sh diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index ce9ae339f4..db1c9a2a73 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -162,9 +162,7 @@ jobs: kubectl get pods -n kube-system | grep ama-logs # Pre-test verification: Wait for pods to be ready with new images - # Outputs container start time for Log Analytics query filtering - task: Bash@3 - name: VerifyPods displayName: 'Pre-Test: Wait for pods to be ready with new images' env: LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} @@ -174,8 +172,23 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/pre-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-before-test.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-before-test.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Capture container start times for Log Analytics query filtering + - task: Bash@3 + name: CaptureStartTime + displayName: 'Capture container start times for Log Analytics filtering' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: 
$(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/capture-container-start-time.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/capture-container-start-time.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Export container start time for use in tests if [ -f /tmp/container-deployment-time.env ]; then @@ -197,7 +210,7 @@ jobs: echo "Waiting for Log Analytics Ingestion" echo "========================================" echo "Cluster: ${{ parameters.clusterName }}" - echo "Container start time: $(VerifyPods.CONTAINER_START_TIME)" + echo "Container start time: $(CaptureStartTime.CONTAINER_START_TIME)" echo "" echo "Waiting 20 minutes to allow logs to be ingested..." echo "This ensures queries will find logs from the newly deployed containers." @@ -218,12 +231,12 @@ jobs: echo "" echo "✓ Wait complete! Logs should now be available in Log Analytics." - echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(VerifyPods.CONTAINER_START_TIME)')" + echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(CaptureStartTime.CONTAINER_START_TIME)')" echo "========================================" # TODO (improvement): container start time is captured in previous step, but not used for now. Consider passing container start time to test script to use in log queries - bash: | # Pass container start time to tests - export CONTAINER_START_TIME="$(VerifyPods.CONTAINER_START_TIME)" + export CONTAINER_START_TIME="$(CaptureStartTime.CONTAINER_START_TIME)" echo "Running tests for cluster: ${{ parameters.clusterName }}" echo "Container start time: $CONTAINER_START_TIME" @@ -248,8 +261,8 @@ jobs: inputs: targetType: 'inline' script: | - chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh - $(Build.SourcesDirectory)/.pipelines/e2e-test/post-test-verify-pods.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-after-test.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-after-test.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" # Log deployment completion - bash: | diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh new file mode 100644 index 0000000000..733bb77962 --- /dev/null +++ b/.pipelines/e2e-test/capture-container-start-time.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Capture Container Start Times +# Captures the LATEST container start time across all ama-logs pods +# This is used to filter Log Analytics queries to only show logs from the newly deployed containers + +set -e + +# Parse command line arguments +LINUX_IMAGE_TAG="${1}" +WINDOWS_IMAGE_TAG="${2}" +LINUX_MCR_REPO="${3}" +WINDOWS_MCR_REPO="${4}" + +if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then + echo "Error: Missing required parameters" + echo "Usage: $0 " + exit 1 +fi + +LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" +WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + +# Source shared functions +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/util.sh" + +echo "================================" +echo "Container Start Time Capture" +echo "================================" +echo "Capturing LATEST container 
start time for Log Analytics queries..." +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +if [ ${#pod_configs[@]} -eq 0 ]; then + echo "✗ ERROR: No pods found!" + exit 1 +fi + +latest_start_time="" + +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get container start time for the specific container + start_time=$(kubectl get pod "$pod_name" -n kube-system \ + -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") + + if [ -n "$start_time" ]; then + echo " Pod $pod_name (container: $container_name) started at: $start_time" + + # Track LATEST time (lexicographically later in ISO 8601 format) + if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then + latest_start_time="$start_time" + fi + else + echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" + echo "This is required for Log Analytics query filtering" + exit 1 + fi +done + +if [ -n "$latest_start_time" ]; then + # Validate that start time is recent (within last 30 minutes) + # This ensures we captured the newly deployed containers, not old ones + current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + current_epoch=$(date -u -d "$current_time" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$current_time" +%s 2>/dev/null) + start_epoch=$(date -u -d "$latest_start_time" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$latest_start_time" +%s 2>/dev/null) + time_diff=$((current_epoch - start_epoch)) + time_diff_minutes=$((time_diff / 60)) + + echo "" + echo "Time validation:" + echo " Current UTC time: $current_time" + echo " Latest start time: $latest_start_time" + echo " Time difference: $time_diff_minutes minutes ago" + + if [ $time_diff_minutes -gt 30 ]; then + echo "" + echo "⚠ WARNING: Container start time is $time_diff_minutes minutes old!" + echo "This suggests the containers may not have been restarted with the new images." + echo "Expected: Within ~2-5 minutes (time for pods to restart after patching)" + echo "Consider investigating if the image patch actually triggered pod restarts." 
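The epoch conversion in this staleness check is written twice on purpose: GNU date parses arbitrary dates with -d, while BSD/macOS date needs -j -f plus an explicit format, so each call tries the GNU form and falls back. The same dance in isolation (timestamp invented):

#!/bin/bash
iso="2025-12-05T18:02:11Z"
# GNU form first, BSD form as fallback; whichever succeeds wins.
epoch=$(date -u -d "$iso" +%s 2>/dev/null \
  || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$iso" +%s 2>/dev/null)
now=$(date -u +%s)
echo "container started $(( (now - epoch) / 60 )) minutes ago"

On a GNU/Linux agent only the first branch ever runs; the fallback keeps the script usable on macOS, where date is the BSD variant.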
+ else + echo " ✓ Start time is recent (within expected range)" + fi + + # Export for use in tests + echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env + echo "" + echo "✓ LATEST container start time: $latest_start_time" + echo "✓ Saved to /tmp/container-deployment-time.env" + echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" + echo "" + exit 0 +else + echo "✗ ERROR: Could not determine container start times" + exit 1 +fi diff --git a/.pipelines/e2e-test/post-test-verify-pods.sh b/.pipelines/e2e-test/verify-ci-images-after-test.sh similarity index 100% rename from .pipelines/e2e-test/post-test-verify-pods.sh rename to .pipelines/e2e-test/verify-ci-images-after-test.sh diff --git a/.pipelines/e2e-test/pre-test-verify-pods.sh b/.pipelines/e2e-test/verify-ci-images-before-test.sh similarity index 78% rename from .pipelines/e2e-test/pre-test-verify-pods.sh rename to .pipelines/e2e-test/verify-ci-images-before-test.sh index 14d8caa7ec..d0f4a4f25b 100644 --- a/.pipelines/e2e-test/pre-test-verify-pods.sh +++ b/.pipelines/e2e-test/verify-ci-images-before-test.sh @@ -29,50 +29,6 @@ MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60)) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/util.sh" -# Function to capture container start times -capture_container_start_times() { - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." - - local latest_start_time="" - - for config in "${pod_configs[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - - # Get container start time for the specific container - local start_time - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - exit 1 - fi -} - echo "================================" echo "Pre-Test Pod Verification" echo "================================" @@ -186,11 +142,6 @@ while [ $attempt -le $MAX_RETRIES ]; do echo "================================" echo "Total attempts: $attempt" echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" - echo "" - - # Capture container start times using local function - capture_container_start_times - echo "" echo "Final pod status:" kubectl get pods -n kube-system | grep ama-logs diff --git a/.pipelines/e2e-test/verify-pod-images.sh b/.pipelines/e2e-test/verify-pod-images.sh deleted file mode 100644 index 
f82e1a2e3c..0000000000
--- a/.pipelines/e2e-test/verify-pod-images.sh
+++ /dev/null
@@ -1,397 +0,0 @@
-#!/bin/bash
-# Script to verify AKS pod images match expected tags
-# Can be used for both pre-test and post-test verification
-
-set -e
-
-# Parse command line arguments
-MODE="${1:-pre-test}" # pre-test or post-test
-LINUX_IMAGE_TAG="${2}"
-WINDOWS_IMAGE_TAG="${3}"
-LINUX_MCR_REPO="${4}"
-WINDOWS_MCR_REPO="${5}"
-
-# Validate MODE parameter
-if [[ "$MODE" != "pre-test" && "$MODE" != "post-test" ]]; then
-  echo "Error: Invalid mode '$MODE'"
-  echo "MODE must be either 'pre-test' or 'post-test'"
-  echo "Usage: $0 <mode> <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
-  exit 1
-fi
-
-if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then
-  echo "Error: Missing required parameters"
-  echo "Usage: $0 <mode> <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
-  exit 1
-fi
-
-LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG"
-WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG"
-
-if [ "$MODE" = "pre-test" ]; then
-  echo "================================"
-  echo "Pre-Test Image Verification"
-  echo "================================"
-  echo "Verifying pods are running with new images and are ready..."
-else
-  echo "================================"
-  echo "Post-Test Image Verification"
-  echo "================================"
-  echo "Verifying pods still have the correct images after test execution..."
-fi
-
-echo ""
-echo "Repository Configuration:"
-echo " Linux MCR repo: $LINUX_MCR_REPO"
-echo " Windows MCR repo: $WINDOWS_MCR_REPO"
-echo ""
-echo "Expected Images:"
-echo " Linux image: $LINUX_IMAGE"
-echo " Windows image: $WINDOWS_IMAGE"
-echo ""
-
-# Unified function to check all pods (with optional retry attempts)
-# max_retries of 0 means instant check (no wait), otherwise retries up to max_retries times
-check_all_pods() {
-  local -n configs_ref=$1 # Use different name to avoid circular reference
-  local max_retries=${2:-0} # Default to 0 (instant check, no retry)
-  local check_interval=60 # Wait 60 seconds between retries
-
-  if [ $max_retries -gt 0 ]; then
-    # Wait mode (pre-test): Monitor pods with retries
-    local attempt=1
-
-    echo "================================"
-    echo "Waiting for all pods to be ready"
-    echo "================================"
-    echo "Total pods to check: ${#configs_ref[@]}"
-    echo "Maximum retries: $max_retries"
-    echo "Check interval: ${check_interval}s"
-    echo "Maximum wait time: $(((max_retries * check_interval) / 60)) minutes"
-    echo ""
-
-    # Track ready status for each pod
-    declare -A pod_ready_status
-    for config in "${configs_ref[@]}"; do
-      pod_name=$(echo "$config" | cut -d'|' -f1)
-      pod_ready_status["$pod_name"]=false
-    done
-
-    while [ $attempt -le $max_retries ]; do
-      local has_not_ready_pod=false
-      local ready_count=0
-      local total_count=${#configs_ref[@]}
-
-      # Check each pod in this iteration
-      for config in "${configs_ref[@]}"; do
-        echo " Raw config string: '$config'"
-        IFS='|' read -r pod_name expected_image container_name <<< "$config"
-        echo " Parsed values:"
-        echo "   pod_name='$pod_name'"
-        echo "   expected_image='$expected_image'"
-        echo "   container_name='$container_name'"
-
-        # Skip if already marked as ready
-        if [ "${pod_ready_status[$pod_name]}" = "true" ]; then
-          ((ready_count++))
-          continue
-        fi
-
-        current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "")
-        pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown")
-
container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - - # Check if pod is ready - if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then - pod_ready_status["$pod_name"]=true - ((ready_count++)) - echo " ✓ $pod_name - Ready" - else - has_not_ready_pod=true - echo " ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)" - if [[ "$current_image" != "$expected_image" ]]; then - echo " Image mismatch: expected $expected_image, got $current_image" - fi - fi - done - - # Show progress summary - local elapsed_seconds=$(((attempt - 1) * check_interval)) - local minutes_elapsed=$((elapsed_seconds / 60)) - local seconds_elapsed=$((elapsed_seconds % 60)) - local remaining_retries=$((max_retries - attempt)) - local remaining_seconds=$((remaining_retries * check_interval)) - local minutes_remaining=$((remaining_seconds / 60)) - local seconds_remaining=$((remaining_seconds % 60)) - - echo "" - echo "Attempt $attempt/$max_retries (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)" - echo "Progress: $ready_count/$total_count pods ready" - echo "" - - # Exit early if all pods are ready - if [ "$has_not_ready_pod" = false ]; then - echo "================================" - echo "✓ SUCCESS: All pods are ready!" - echo "================================" - echo "Total attempts: $attempt" - echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" - echo "" - return 0 - fi - - # Don't sleep after the last attempt - if [ $attempt -lt $max_retries ]; then - sleep $check_interval - fi - - ((attempt++)) - done - - # Max retries reached - report which pods failed - echo "================================" - echo "✗ MAX RETRIES REACHED: Not all pods became ready after $max_retries attempts" - echo "================================" - echo "" - echo "Failed pods:" - for config in "${configs_ref[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - if [ "${pod_ready_status[$pod_name]}" != "true" ]; then - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") - pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - - echo " ✗ $pod_name" - echo " Expected image: $expected_image" - echo " Current image: $current_image" - echo " Pod status: $pod_status" - echo " Container ready: $container_ready" - fi - done - echo "" - - return 1 - else - # Instant check mode (post-test): Single check, no waiting - local mismatches=() - - echo "Performing instant verification of all pods..." 
- echo "" - - for config in "${configs_ref[@]}"; do - IFS='|' read -r pod_name expected_image container_name <<< "$config" - - # Get pod details - current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") - pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") - container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") - - echo "Pod: $pod_name" - echo " Container: $container_name" - echo " Expected image: $expected_image" - echo " Current image: $current_image" - echo " Pod status: $pod_status" - echo " Container ready: $container_ready" - - # Check for any issues - local has_issue=false - - if [[ "$current_image" != "$expected_image" ]]; then - echo " ✗ IMAGE MISMATCH!" - mismatches+=("$pod_name: expected image '$expected_image' but found '$current_image'") - has_issue=true - fi - - if [[ "$pod_status" != "Running" ]]; then - echo " ✗ POD NOT RUNNING!" - mismatches+=("$pod_name: pod status is '$pod_status' (expected 'Running')") - has_issue=true - fi - - if [[ "$container_ready" != "true" ]]; then - echo " ✗ CONTAINER NOT READY!" - mismatches+=("$pod_name: container '$container_name' is not ready") - has_issue=true - fi - - if [[ "$has_issue" = false ]]; then - echo " ✓ All checks passed" - fi - echo "" - done - - # Return mismatches via global array (bash limitation workaround) - image_mismatches=("${mismatches[@]}") - - if [ ${#mismatches[@]} -eq 0 ]; then - return 0 - else - return 1 - fi - fi -} - -# Get all ama-logs pods -echo "Getting list of ama-logs pods..." -pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') - -# Build configurations for all pods -pod_configs=() -image_mismatches=() - -for pod_name in $pod_list; do - # Determine expected image based on pod type - if [[ "$pod_name" =~ ^ama-logs-windows ]]; then - expected_image="$WINDOWS_IMAGE" - container_name="ama-logs-windows" - elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then - # Matches both ReplicaSet pods (ama-logs-rs-*) and DaemonSet pods (ama-logs-xxxxx) - expected_image="$LINUX_IMAGE" - container_name="ama-logs" - else - echo "✗ ERROR: Unknown pod pattern: $pod_name" - echo "Expected pod names to match one of:" - echo " - ama-logs-windows-* (Windows pods)" - echo " - ama-logs-rs-* (Linux ReplicaSet pods)" - echo " - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)" - exit 1 - fi - - # Add to configurations for parallel checking - # Use | as delimiter since colons appear in image tags (e.g., ciprod:3.1.31) - pod_configs+=("$pod_name|$expected_image|$container_name") -done - -echo "Found ${#pod_configs[@]} pods to verify" -echo "" - -# Use different check based on mode -if [ "$MODE" = "pre-test" ]; then - # Pre-test: Wait for all pods to be ready (15 retries × 60s = 15 minutes max) - if ! 
check_all_pods pod_configs 15; then - failed_pods=true - else - failed_pods=false - fi -else - # Post-test: Instant check of all pods (no retry) - check_all_pods pod_configs 0 -fi - -echo "" -echo "================================" -if [ "$MODE" = "pre-test" ]; then - echo "Pre-Test Verification Summary" -else - echo "Post-Test Verification Summary" -fi -echo "================================" - -# Report results based on mode -if [ "$MODE" = "pre-test" ]; then - if [ "$failed_pods" = false ]; then - echo "✓ All pods are running with the correct images and are ready!" - echo "" - echo "Final pod status:" - kubectl get pods -n kube-system | grep ama-logs - echo "" - capture_container_start_times - - exit 0 - else - echo "✗ Pod verification failed (see details above)" - echo "" - echo "Final pod status:" - kubectl get pods -n kube-system | grep ama-logs - exit 1 - fi -else - # Post-test mode - if [ ${#image_mismatches[@]} -eq 0 ]; then - echo "✓ SUCCESS: All pods maintained the correct images throughout the test execution!" - echo "" - echo "Final pod status:" - kubectl get pods -n kube-system | grep ama-logs - exit 0 - else - echo "✗ FAILURE: Some pods changed images during test execution!" - echo "" - echo "Pods with image mismatches:" - printf ' - %s\n' "${image_mismatches[@]}" - echo "" - echo "This indicates the pods may have been restarted or updated during testing." - echo "This could cause test instability or false results." - echo "" - echo "Current pod status:" - kubectl get pods -n kube-system | grep ama-logs - echo "" - echo "Detailed pod information:" - for mismatch in "${image_mismatches[@]}"; do - pod=$(echo "$mismatch" | cut -d: -f1) - echo "" - echo "--- Details for $pod ---" - kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30 - done - exit 1 - fi -fi - -# Function to capture container start times for Log Analytics query filtering -capture_container_start_times() { - echo "================================" - echo "Container Start Time Capture" - echo "================================" - echo "Capturing LATEST container start time for Log Analytics queries..." 
- - # Get all container start times and find the LATEST one - local latest_start_time="" - - local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}') - for pod_name in $pod_list; do - # Get container name based on pod type - local container_name - if [[ "$pod_name" =~ ^ama-logs-windows ]]; then - container_name="ama-logs-windows" - elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then - container_name="ama-logs" - else - continue - fi - - # Get container start time - try first container if filter doesn't work - local start_time - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "") - - if [ -z "$start_time" ]; then - start_time=$(kubectl get pod "$pod_name" -n kube-system \ - -o jsonpath="{.status.containerStatuses[0].state.running.startedAt}" 2>/dev/null || echo "") - fi - - if [ -n "$start_time" ]; then - echo " Pod $pod_name container started at: $start_time" - - # Track LATEST time (lexicographically later in ISO 8601 format) - if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then - latest_start_time="$start_time" - fi - else - echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)" - echo "This is required for Log Analytics query filtering" - exit 1 - fi - done - - if [ -n "$latest_start_time" ]; then - # Export for use in tests - echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env - echo "" - echo "✓ LATEST container start time: $latest_start_time" - echo "✓ Saved to /tmp/container-deployment-time.env" - echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')" - else - echo "✗ ERROR: Could not determine container start times" - echo "This is required for Log Analytics query filtering" - exit 1 - fi -} From 6d513d2f99b8ef25ec618074a1ab50bae850c933 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 14:48:18 -0800 Subject: [PATCH 22/40] add delay --- .pipelines/e2e-test/capture-container-start-time.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh index 733bb77962..b9e25b0d67 100644 --- a/.pipelines/e2e-test/capture-container-start-time.sh +++ b/.pipelines/e2e-test/capture-container-start-time.sh @@ -29,6 +29,10 @@ echo "Container Start Time Capture" echo "================================" echo "Capturing LATEST container start time for Log Analytics queries..." echo "" +echo "Waiting 60 seconds for Kubernetes API to update container status..." +sleep 60 +echo "Proceeding with container start time capture..." 
+ # Build pod configurations using shared function declare -a pod_configs From db6732ea654b9702ee64c7cb96f5515246045c20 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 15:08:06 -0800 Subject: [PATCH 23/40] wait time 15 mins --- .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index db1c9a2a73..a9f744da61 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -216,7 +216,7 @@ jobs: echo "This ensures queries will find logs from the newly deployed containers." echo "" - wait_time=60 #TODO: change back to 1200 (20 minutes) after testing + wait_time=900 #TODO: change back to 1200 (20 minutes) after testing interval=60 elapsed=0 From 93057f7caff41547c61331070a14433d1791d28b Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 15:40:14 -0800 Subject: [PATCH 24/40] minor fix --- .config/guardian/.gdnbaselines | 1 + .pipelines/azure_pipeline_mergedbranches.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines index eff01b8012..a7976d8fde 100644 --- a/.config/guardian/.gdnbaselines +++ b/.config/guardian/.gdnbaselines @@ -155,3 +155,4 @@ } } } + diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index db5c896550..9fa66fbe49 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -889,6 +889,7 @@ extends: FileDirPath: '$(Build.ArtifactStagingDirectory)' DisableRemediation: false AcceptableOutdatedSignatureInHours: 72 + - stage: Deploy_and_Test_Images_In_Dev_Clusters displayName: Deploy and Test Images in Dev Clusters lockBehavior: sequential From 01783abdee184b5a25bee0db8771a7828eb987ff Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:04:06 -0800 Subject: [PATCH 25/40] remove doc --- ...I-Agent-Auto-Deploy-Implementation-Plan.md | 417 ------------------ 1 file changed, 417 deletions(-) delete mode 100644 Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md diff --git a/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md b/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md deleted file mode 100644 index 98bbaf4319..0000000000 --- a/Documentation/CI-Agent-Auto-Deploy-Implementation-Plan.md +++ /dev/null @@ -1,417 +0,0 @@ -# CI Agent Auto-Deploy Implementation Plan - -## Overview -This document outlines the implementation plan for enabling auto-deployment of CI Agent to a dev cluster on every PR merge to main branch, following the Prom Agent pattern. - -**Goal:** Automatically deploy freshly built CI agent images to a dev cluster after each successful build on main branch. - -**Pattern:** Based on Prom Agent's `azure-pipeline-build.yml` approach - sequential deployments using `helm upgrade --install`. 
- ---- - -## Key Findings - -### ✅ No Chart Modifications Needed -- **ServiceAccount**: Hardcoded `ama-logs` works fine for sequential deployments -- **Image Tags**: Can be overridden via `--set` flags at deployment time -- **Release Name**: Using same release name (`ama-logs-dev`) for all deployments allows Helm to upgrade in place - -### ✅ Prom Agent Pattern -- Uses `helm upgrade --install` with same release name every time -- Deploys to different clusters (not multiple releases per cluster) -- Each cluster has exactly ONE release -- No ServiceAccount conflicts with sequential deployments - ---- - -## Implementation Changes - -### 1. Pipeline Modification - -**File:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` - -**Add Deployment Stage** after existing build stages: - -```yaml -- stage: Deploy_Dev_Cluster - displayName: Deploy to Dev Cluster - dependsOn: - - BuildLinuxImages - - BuildWindowsImages - # Only deploy on main branch merges (not PRs) - condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - - jobs: - - deployment: Deploy_AKS_Chart - displayName: "Deploy: AKS dev cluster" - environment: CI-Agent-Dev # Create this environment in Azure DevOps - pool: - name: Azure-Pipelines-CI-Test-EO - - variables: - # Get image tags from build stages - linuxImageTag: $[ stageDependencies.BuildLinuxImages.Build.outputs['setImageTag.linuxTag'] ] - windowsImageTag: $[ stageDependencies.BuildWindowsImages.Build.outputs['setImageTag.windowsTag'] ] - - strategy: - runOnce: - deploy: - steps: - - checkout: self - - - task: HelmDeploy@0 - displayName: "Deploy to dev cluster" - inputs: - connectionType: 'Azure Resource Manager' - azureSubscription: 'ContainerInsights_Build_Subscription(9b96ebbd-c57a-42d1-bbe9-b69296e4c7fb)' - azureResourceGroup: 'YOUR-DEV-CLUSTER-RG' - kubernetesCluster: 'YOUR-DEV-CLUSTER-NAME' - useClusterAdmin: true - namespace: 'kube-system' - command: 'upgrade' - chartType: 'FilePath' - chartPath: '$(Build.SourcesDirectory)/charts/azuremonitor-containers/' - releaseName: 'ama-logs-dev' - overrideValues: | - amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev - amalogs.image.tag=$(linuxImageTag) - amalogs.image.tagWindows=$(windowsImageTag) - arguments: '--install --create-namespace' -``` - ---- - -### 2. Ensure Build Stages Export Image Tags - -**Verify in BuildLinuxImages stage:** - -```yaml -- stage: BuildLinuxImages - jobs: - - job: Build - steps: - # ... existing build steps ... - - # Add this step to export tag - - script: | - echo "##vso[task.setvariable variable=linuxTag;isOutput=true]$(IMAGE_TAG)" - name: setImageTag - displayName: Export Linux image tag -``` - -**Verify in BuildWindowsImages stage:** - -```yaml -- stage: BuildWindowsImages - jobs: - - job: Build - steps: - # ... existing build steps ... - - # Add this step to export tag - - script: | - echo "##vso[task.setvariable variable=windowsTag;isOutput=true]$(IMAGE_TAG)" - name: setImageTag - displayName: Export Windows image tag -``` - ---- - -### 3. 
Configuration Updates - -**Replace these placeholders with actual values:** - -| Placeholder | Description | Example Value | -|-------------|-------------|---------------| -| `YOUR-DEV-CLUSTER-RG` | Resource group containing dev cluster | `ci-dev-aks-rg` | -| `YOUR-DEV-CLUSTER-NAME` | Name of dev AKS cluster | `ci-dev-aks-eus` | - -**Optional: Add more overrides for dev-specific configuration:** - -```yaml -overrideValues: | - amalogs.image.repo=mcr.microsoft.com/azuremonitor/containerinsights/cidev - amalogs.image.tag=$(linuxImageTag) - amalogs.image.tagWindows=$(windowsImageTag) - amalogs.secret.wsid=YOUR-DEV-WORKSPACE-ID - amalogs.secret.key=YOUR-DEV-WORKSPACE-KEY - amalogs.env.clusterName=ci-dev-cluster - amalogs.ISTEST=true -``` - ---- - -### 4. Azure DevOps Environment Setup - -**Create deployment environment:** -1. Navigate to: Azure DevOps → Pipelines → Environments -2. Click "New environment" -3. Name: `CI-Agent-Dev` -4. Resource: None (environment-only) -5. (Optional) Add approval gates if needed - ---- - -## Chart Details - No Modifications Required - -### ServiceAccount Handling -- **Current:** Hardcoded as `ama-logs` -- **Works because:** Sequential deployments reuse same ServiceAccount -- **Pattern:** `helm upgrade` updates existing resources, doesn't recreate - -### Image Tag Handling -- **Current:** Hardcoded in `values.yaml` -- **Override:** Via `--set` flags at deployment time -- **Files affected:** None (pure runtime override) - -### Files with ServiceAccount References (No changes needed) -1. `templates/ama-logs-rbac.yaml` - Creates ServiceAccount `ama-logs` -2. `templates/ama-logs-daemonset.yaml` - References `serviceAccountName: ama-logs` -3. `templates/ama-logs-daemonset-windows.yaml` - References `serviceAccountName: ama-logs` -4. `templates/ama-logs-deployment.yaml` - References `serviceAccountName: ama-logs` - ---- - -## How It Works - -### Deployment Flow - -``` -┌─────────────────────────────────────────────────────────────┐ -│ 1. PR Merged to Main Branch │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 2. Build Pipeline Triggered │ -│ - BuildLinuxImages stage → produces linuxImageTag │ -│ - BuildWindowsImages stage → produces windowsImageTag │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 3. Deploy_Dev_Cluster Stage │ -│ - Gets image tags from build stages │ -│ - Runs: helm upgrade ama-logs-dev --install │ -│ - Overrides: image.tag=$(linuxImageTag) │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 4. Helm Deployment on Dev Cluster │ -│ - First run: Creates new release "ama-logs-dev" │ -│ - Subsequent runs: Updates existing release │ -│ - ServiceAccount "ama-logs" reused (no conflicts) │ -└─────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ 5. 
Dev Cluster Running Latest Build │ -│ - DaemonSet updated with new image tags │ -│ - Windows DaemonSet updated with new image tags │ -│ - Deployment (ReplicaSet) updated with new image tags │ -└─────────────────────────────────────────────────────────────┘ -``` - -### Sequential Deployment Example - -```bash -# Build 1 - Creates initial deployment -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=3.1.30-20231101 \ - --set amalogs.image.tagWindows=win-3.1.30-20231101 -# Result: New release created, ServiceAccount "ama-logs" created - -# Build 2 - Updates existing deployment -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=3.1.30-20231102 \ - --set amalogs.image.tagWindows=win-3.1.30-20231102 -# Result: Release updated, ServiceAccount "ama-logs" reused ✅ - -# Build 3 - Updates existing deployment -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=3.1.30-20231103 \ - --set amalogs.image.tagWindows=win-3.1.30-20231103 -# Result: Release updated, ServiceAccount "ama-logs" reused ✅ -``` - ---- - -## Testing Plan - -### Pre-Deployment Testing - -1. **Validate Chart Templates:** -```bash -cd Docker-Provider/charts/azuremonitor-containers -helm template ama-logs-dev . \ - --set amalogs.image.tag=test-tag \ - --set amalogs.image.tagWindows=test-tag-win \ - --debug -``` - -2. **Dry Run Deployment:** -```bash -helm upgrade ama-logs-dev . --install \ - --namespace kube-system \ - --set amalogs.image.tag=test-tag \ - --dry-run --debug -``` - -### Post-Deployment Validation - -1. **Check Pipeline Execution:** - - Verify Deploy_Dev_Cluster stage runs - - Check image tags are passed correctly - - Confirm Helm deployment succeeds - -2. **Verify Cluster Deployment:** -```bash -# Check pods are running -kubectl get pods -n kube-system | grep ama-logs - -# Verify DaemonSet -kubectl describe daemonset ama-logs -n kube-system - -# Verify Windows DaemonSet -kubectl describe daemonset ama-logs-win -n kube-system - -# Verify Deployment (ReplicaSet) -kubectl describe deployment ama-logs-rs -n kube-system - -# Check image tags match build -kubectl get daemonset ama-logs -n kube-system -o jsonpath='{.spec.template.spec.containers[0].image}' -``` - -3. 
**Verify ServiceAccount:** -```bash -# Confirm ServiceAccount exists and is used -kubectl get serviceaccount ama-logs -n kube-system -kubectl get pods -n kube-system -l dsName=ama-logs-ds -o jsonpath='{.items[0].spec.serviceAccountName}' -``` - ---- - -## Rollback Plan - -If deployment fails or causes issues: - -### Option 1: Rollback via Helm -```bash -# List releases -helm list -n kube-system - -# Rollback to previous version -helm rollback ama-logs-dev -n kube-system -``` - -### Option 2: Manual Revert -```bash -# Revert to specific image version -helm upgrade ama-logs-dev ./chart --install \ - --set amalogs.image.tag=PREVIOUS-WORKING-TAG \ - --set amalogs.image.tagWindows=PREVIOUS-WORKING-TAG-win -``` - -### Option 3: Remove Pipeline Stage -- Comment out `Deploy_Dev_Cluster` stage in pipeline -- Commit and push -- Cluster remains at current version - ---- - -## Comparison: CI Agent vs Prom Agent - -| Aspect | Prom Agent | CI Agent (This Plan) | -|--------|-----------|---------------------| -| **Chart Changes** | None | None | -| **ServiceAccount** | Hardcoded `ama-metrics-serviceaccount` | Hardcoded `ama-logs` | -| **Deployment Method** | `helm upgrade --install` | `helm upgrade --install` | -| **Release Name** | `ama-metrics` | `ama-logs-dev` | -| **Image Override** | `--set image.tag=...` | `--set amalogs.image.tag=...` | -| **Multiple Versions** | ❌ Not supported | ❌ Not supported (sequential only) | -| **Cluster Strategy** | One release per cluster | One release per cluster | - ---- - -## Estimated Effort - -| Task | Effort | Notes | -|------|--------|-------| -| Add deployment stage to pipeline | 30 min | Copy from Prom agent pattern | -| Update cluster name/RG variables | 5 min | Simple config update | -| Create Azure DevOps environment | 5 min | One-time setup | -| Verify build tag exports | 15 min | May already exist | -| Test dry-run deployment | 15 min | Validate before merge | -| Deploy and validate | 30 min | First deployment + verification | -| **Total** | **~2 hours** | Including testing and validation | - ---- - -## Future Enhancements (Optional) - -### 1. Add E2E Tests Post-Deployment -Similar to Prom agent's TestKube integration: -```yaml -- job: Run_E2E_Tests - dependsOn: Deploy_AKS_Chart - steps: - - script: kubectl testkube run testsuite ci-agent-e2e-tests -``` - -### 2. Deploy to Multiple Dev Clusters -Add additional deployment jobs for different regions: -```yaml -- deployment: Deploy_EUS_Cluster - cluster: ci-dev-aks-eus - -- deployment: Deploy_WUS_Cluster - cluster: ci-dev-aks-wus -``` - -### 3. Slack/Teams Notifications -Notify team of successful deployments: -```yaml -- task: SlackNotification@1 - inputs: - message: "✅ CI Agent $(linuxImageTag) deployed to dev cluster" -``` - ---- - -## References - -- **Prom Agent Build Pipeline:** `prometheus-collector/.pipelines/azure-pipeline-build.yml` -- **CI Agent Current Pipeline:** `Docker-Provider/.pipelines/azure_pipeline_mergedbranches.yaml` -- **Helm Chart:** `Docker-Provider/charts/azuremonitor-containers/` -- **Prom Agent Chart:** `prometheus-collector/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/` - ---- - -## Questions & Answers - -### Q: Why not use Release.Name for ServiceAccount? -**A:** Not needed for sequential deployments. Same release name = same ServiceAccount = no conflicts. Only needed for parallel deployments (multiple versions simultaneously). - -### Q: Can we deploy multiple versions to same cluster? -**A:** No, with current approach (hardcoded ServiceAccount). 
Would require chart modifications to use `{{ .Release.Name }}` pattern. Not recommended unless specifically needed. - -### Q: What if build fails? -**A:** Deploy stage has `condition: succeeded()` - won't run if build fails. Cluster stays at previous version. - -### Q: How to deploy to production? -**A:** This plan is for dev cluster only. Production deployments should continue using existing release pipeline with proper approvals and phased rollouts. - ---- - -## Status - -- [x] Research Prom agent pattern -- [x] Document findings -- [x] Create implementation plan -- [ ] Update pipeline with deployment stage -- [ ] Test deployment to dev cluster -- [ ] Validate with team -- [ ] Merge to main branch - ---- - -**Last Updated:** 2025-11-07 -**Author:** Implementation plan based on Prom agent analysis -**Status:** Ready for implementation From 6a09f91eca9cc69dea72a529cce937acaf146e63 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:06:08 -0800 Subject: [PATCH 26/40] minor --- test/testkube/helm-testkube-values.yaml | 4 ---- test/testkube/install-and-execute-testkube-tests.sh | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index 29727dcfba..bd8d5f5796 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -1304,7 +1304,3 @@ testkube-operator: # ref: https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#node-affinity-multi-arch-arm # -- Tolerations to schedule a workload to nodes with any architecture type. Required for deployment to GKE cluster. tolerations: [] -<<<<<<< HEAD - -======= ->>>>>>> efd34efac (add bebugging) diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index 1df4bccf47..7f5c8334d3 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -126,3 +126,4 @@ EOF # Explicitly fail the ADO task since at least one test failed exit 1 fi + From eb7c9a942fe4ce3b0aae15881fb13b9505d9dc75 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:07:54 -0800 Subject: [PATCH 27/40] pull from ci_prod --- .config/guardian/.gdnbaselines | 3 +-- test/testkube/helm-testkube-values.yaml | 7 ++++--- .../install-and-execute-testkube-tests.sh | 18 ++---------------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines index a7976d8fde..2b12b418dd 100644 --- a/.config/guardian/.gdnbaselines +++ b/.config/guardian/.gdnbaselines @@ -154,5 +154,4 @@ "justification": "This error is baselined with an expiration date of 180 days from 2025-05-20 23:41:13Z" } } -} - +} \ No newline at end of file diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index bd8d5f5796..8039c50e56 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -110,11 +110,11 @@ mongodb: # Currently Bitnami doesn't support ARM: https://github.com/bitnami/charts/issues/7305 image: # -- MongoDB image registry - registry: docker.io + registry: mcr.microsoft.com # -- MongoDB image repository - repository: bitnami/mongodb + repository: azuremonitor/containerinsights/cidev # -- MongoDB image tag - tag: latest + tag: mongodb_6.0.5-debian-11-r64 # -- MongoDB image pull Secret pullSecrets: [] nodeSelector: @@ -1304,3 +1304,4 @@ 
testkube-operator: # ref: https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#node-affinity-multi-arch-arm # -- Tolerations to schedule a workload to nodes with any architecture type. Required for deployment to GKE cluster. tolerations: [] + diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index 7f5c8334d3..fb8f55b89c 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -24,19 +24,6 @@ echo "deb https://repo.testkube.io/linux linux main" | sudo tee -a /etc/apt/sour sudo apt-get update sudo apt-get install -y testkube -echo "Checking for existing Testkube installation..." -if helm list -n testkube 2>/dev/null | grep -q testkube; then - echo "Found existing Testkube installation. Cleaning up..." - helm uninstall testkube -n testkube || true - echo "Deleting testkube namespace..." - kubectl delete namespace testkube --wait=true --timeout=120s || true - echo "Waiting for namespace to fully terminate..." - sleep 30 - echo "Cleanup complete!" -else - echo "No existing Testkube installation found." -fi - echo "Install testkube on the cluster" helm repo add kubeshop https://kubeshop.github.io/helm-charts helm repo update @@ -52,7 +39,7 @@ envsubst < ./testkube-test-crs.yaml > ./testkube-test-crs-updated.yaml kubectl apply -f ./testkube-test-crs-updated.yaml echo "Wait for cluster to be ready" -sleep 200 +sleep 120 echo "Run testkube tests" execution_id="" @@ -125,5 +112,4 @@ EOF # Explicitly fail the ADO task since at least one test failed exit 1 -fi - +fi \ No newline at end of file From abb61eb5552f8543ca3ab1eb99d447c6d142561c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 16:12:31 -0800 Subject: [PATCH 28/40] rebase --- test/testkube/helm-testkube-values.yaml | 6 +++--- .../install-and-execute-testkube-tests.sh | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/test/testkube/helm-testkube-values.yaml b/test/testkube/helm-testkube-values.yaml index 8039c50e56..04a273fe69 100644 --- a/test/testkube/helm-testkube-values.yaml +++ b/test/testkube/helm-testkube-values.yaml @@ -110,11 +110,11 @@ mongodb: # Currently Bitnami doesn't support ARM: https://github.com/bitnami/charts/issues/7305 image: # -- MongoDB image registry - registry: mcr.microsoft.com + registry: docker.io # -- MongoDB image repository - repository: azuremonitor/containerinsights/cidev + repository: bitnami/mongodb # -- MongoDB image tag - tag: mongodb_6.0.5-debian-11-r64 + tag: latest # -- MongoDB image pull Secret pullSecrets: [] nodeSelector: diff --git a/test/testkube/install-and-execute-testkube-tests.sh b/test/testkube/install-and-execute-testkube-tests.sh index fb8f55b89c..164be7fe69 100644 --- a/test/testkube/install-and-execute-testkube-tests.sh +++ b/test/testkube/install-and-execute-testkube-tests.sh @@ -24,6 +24,19 @@ echo "deb https://repo.testkube.io/linux linux main" | sudo tee -a /etc/apt/sour sudo apt-get update sudo apt-get install -y testkube +echo "Checking for existing Testkube installation..." +if helm list -n testkube 2>/dev/null | grep -q testkube; then + echo "Found existing Testkube installation. Cleaning up..." + helm uninstall testkube -n testkube || true + echo "Deleting testkube namespace..." + kubectl delete namespace testkube --wait=true --timeout=120s || true + echo "Waiting for namespace to fully terminate..." + sleep 30 + echo "Cleanup complete!" 
+else + echo "No existing Testkube installation found." +fi + echo "Install testkube on the cluster" helm repo add kubeshop https://kubeshop.github.io/helm-charts helm repo update @@ -39,7 +52,7 @@ envsubst < ./testkube-test-crs.yaml > ./testkube-test-crs-updated.yaml kubectl apply -f ./testkube-test-crs-updated.yaml echo "Wait for cluster to be ready" -sleep 120 +sleep 200 echo "Run testkube tests" execution_id="" From 5c2b3e376acd75b81897bec909fe8fd5fbda9769 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 19:50:23 -0800 Subject: [PATCH 29/40] wait 5 minutes --- ...eploy-and-test-ci-image-in-aks-cluster.yml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index a9f744da61..9d6c5483eb 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -161,6 +161,34 @@ jobs: echo "Current ama-logs pods:" kubectl get pods -n kube-system | grep ama-logs + # Wait for Kubernetes API to update and pods to begin restarting + - task: Bash@3 + displayName: 'Wait for pod patch to propagate (5 minutes)' + inputs: + targetType: 'inline' + script: | + echo "========================================" + echo "Waiting for Pod Patch Propagation" + echo "========================================" + echo "" + echo "Waiting 5 minutes" + echo "" + + wait_time=300 + interval=30 + elapsed=0 + + while [ $elapsed -lt $wait_time ]; do + remaining=$((wait_time - elapsed)) + echo "⏳ Waiting... ($elapsed/$wait_time seconds elapsed, $remaining seconds remaining)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "" + echo "✓ Wait complete! Now checking actual pod readiness status..." + echo "========================================" + # Pre-test verification: Wait for pods to be ready with new images - task: Bash@3 displayName: 'Pre-Test: Wait for pods to be ready with new images' From bf5126beb2404ad953970fcecccb1df9ecbe7a57 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 20:39:05 -0800 Subject: [PATCH 30/40] remove container start time check wait --- .pipelines/e2e-test/capture-container-start-time.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh index b9e25b0d67..b3c9b4322d 100644 --- a/.pipelines/e2e-test/capture-container-start-time.sh +++ b/.pipelines/e2e-test/capture-container-start-time.sh @@ -29,9 +29,6 @@ echo "Container Start Time Capture" echo "================================" echo "Capturing LATEST container start time for Log Analytics queries..." echo "" -echo "Waiting 60 seconds for Kubernetes API to update container status..." -sleep 60 -echo "Proceeding with container start time capture..." 
# Build pod configurations using shared function From 97d2d8ca73b0d67938ea7280bc7b3f30caf60048 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 20:43:04 -0800 Subject: [PATCH 31/40] 20 min wait time --- .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index 9d6c5483eb..22af526ac2 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -244,7 +244,7 @@ jobs: echo "This ensures queries will find logs from the newly deployed containers." echo "" - wait_time=900 #TODO: change back to 1200 (20 minutes) after testing + wait_time=1200 #TODO: change back to 1200 (20 minutes) after testing interval=60 elapsed=0 From b0ea65e2af6e132f89a91423fc0759eb1027c22c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 5 Dec 2025 20:48:07 -0800 Subject: [PATCH 32/40] echo utc timing --- .../azure-template-deploy-and-test-ci-image-in-aks-cluster.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml index 22af526ac2..f25bdc793d 100644 --- a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -158,6 +158,7 @@ jobs: echo "" echo "Pod patching complete!" + echo "Current UTC time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" echo "Current ama-logs pods:" kubectl get pods -n kube-system | grep ama-logs From 8bcad01b9ed648fd0f25a8b594c52195602344eb Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 12:34:02 -0800 Subject: [PATCH 33/40] fix trigger condition --- .pipelines/azure_pipeline_mergedbranches.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 9fa66fbe49..19c4e240f3 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -896,14 +896,19 @@ extends: dependsOn: - Build_And_Publish_Images # Deploy runs when Build succeeds OR when Build is skipped with valid overrides - # TODO: remove eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), - # this stage runs when Build_And_Publish_Images succeeds or is skipped with valid overrides. + # This stage runs when: + # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e' + # 2. 
PR from zane/ci-agent-auto-deploy branch condition: | and( or( eq(variables['Build.SourceBranch'], 'refs/heads/ci_prod'), eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), - contains(variables['Build.SourceBranch'], 'run-e2e') + contains(variables['Build.SourceBranch'], 'run-e2e'), + and( + eq(variables['Build.Reason'], 'PullRequest'), + eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy') + ) ), or( eq(dependencies.Build_And_Publish_Images.result, 'Succeeded'), From d1379607c4bda628432b7aa78d3f2f760d895088 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 13:45:41 -0800 Subject: [PATCH 34/40] trigger when PR branch contains run-e2e --- .pipelines/azure_pipeline_mergedbranches.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 19c4e240f3..1284da3eae 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -898,7 +898,7 @@ extends: # Deploy runs when Build succeeds OR when Build is skipped with valid overrides # This stage runs when: # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e' - # 2. PR from zane/ci-agent-auto-deploy branch + # 2. PR from zane/ci-agent-auto-deploy branch OR PR from branch containing 'run-e2e' condition: | and( or( @@ -907,7 +907,10 @@ extends: contains(variables['Build.SourceBranch'], 'run-e2e'), and( eq(variables['Build.Reason'], 'PullRequest'), - eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy') + or( + eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + contains(variables['System.PullRequest.SourceBranch'], 'run-e2e') + ) ) ), or( From 8c7d4cb4bf3c0cb6382dee14536e5dac6cd37040 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 14:08:45 -0800 Subject: [PATCH 35/40] fix trigger --- .pipelines/azure_pipeline_mergedbranches.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 1284da3eae..4f28c9b6aa 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -908,7 +908,7 @@ extends: and( eq(variables['Build.Reason'], 'PullRequest'), or( - eq(variables['System.PullRequest.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + eq(variables['System.PullRequest.SourceBranch'], 'zane/ci-agent-auto-deploy'), contains(variables['System.PullRequest.SourceBranch'], 'run-e2e') ) ) From 8a1bc5a3a98beaffac4f0dba74200376c66c13d8 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Mon, 8 Dec 2025 23:29:38 -0800 Subject: [PATCH 36/40] rename stage name --- .pipelines/azure_pipeline_mergedbranches.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 4f28c9b6aa..aba3589b76 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -44,7 +44,7 @@ extends: stages: # This stage will be skipped when LinuxImageOverride and WindowsImageOverride are both set # This feature allows bypassing the build stage when using pre-built images for testing, which saves time and resources. 
-  - stage: Build_And_Publish_Images
+  - stage: stage
     displayName: 'Build and Publish Container Images'
     condition: |
       or(
         eq(variables['LinuxImageOverride'], ''),
         eq(variables['WindowsImageOverride'], '')
       )
@@ -894,7 +894,7 @@ extends:
     displayName: Deploy and Test Images in Dev Clusters
     lockBehavior: sequential
     dependsOn:
-    - Build_And_Publish_Images
+    - stage
     # Deploy runs when Build succeeds OR when Build is skipped with valid overrides
     # This stage runs when:
     # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e'
@@ -914,9 +914,9 @@ extends:
         )
       ),
       or(
-        eq(dependencies.Build_And_Publish_Images.result, 'Succeeded'),
+        eq(dependencies.stage.result, 'Succeeded'),
        and(
-          eq(dependencies.Build_And_Publish_Images.result, 'Skipped'),
+          eq(dependencies.stage.result, 'Skipped'),
          ne(variables['LinuxImageOverride'], ''),
          ne(variables['WindowsImageOverride'], '')
        )
@@ -925,8 +925,8 @@ extends:
     variables:
       # Use images built from previous build stage by default
       # To override: Set pipeline variables 'LinuxImageOverride' and 'WindowsImageOverride' when queuing
-      linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.linuxImagetag'])]
-      windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.Build_And_Publish_Images.common.outputs['setup.windowsImageTag'])]
+      linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.stage.common.outputs['setup.linuxImagetag'])]
+      windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.stage.common.outputs['setup.windowsImageTag'])]
     jobs:
       # TODO: gradually add more clusters from test automation framework when the tests are stable
       # TODO: TeamsWebhookUri to be added

From 96ef037d77a9a747726cfd70f63ff35976ea2b71 Mon Sep 17 00:00:00 2001
From: zanejohnson-azure
Date: Fri, 12 Dec 2025 13:24:18 -0800
Subject: [PATCH 37/40] push image to acr during pr for particular branch

---
 .pipelines/azure_pipeline_mergedbranches.yaml | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml
index aba3589b76..a578302d33 100644
--- a/.pipelines/azure_pipeline_mergedbranches.yaml
+++ b/.pipelines/azure_pipeline_mergedbranches.yaml
@@ -193,7 +193,17 @@ extends:
               docker pull mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1
               docker buildx create --name dockerbuilder --driver docker-container --driver-opt image=mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1 --use
               docker buildx inspect --bootstrap
+              # Determine if we should push to ACR
+              # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e')
+              SHOULD_PUSH="false"
               if [ "$(Build.Reason)" != "PullRequest" ]; then
+                SHOULD_PUSH="true"
+              elif [[ "$(System.PullRequest.SourceBranch)" == "zane/ci-agent-auto-deploy" ]] || [[ "$(System.PullRequest.SourceBranch)" == *"run-e2e"* ]]; then
+                SHOULD_PUSH="true"
+                echo "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing"
+              fi
+
+              if [ "$SHOULD_PUSH" == "true" ]; then
                 docker buildx build --platform $(BUILD_PLATFORMS) --tag ${{ variables.repoImageName }}:$(linuxImagetag) -f kubernetes/linux/Dockerfile.multiarch --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --build-arg IMAGE_TAG=$(linuxTelemetryTag) --build-arg GOLANG_BASE_IMAGE=$(GOLANG_BASE_IMAGE) --build-arg 
CI_BASE_IMAGE=$(CI_BASE_IMAGE) --push --provenance=false . echo "##vso[task.logissue type=warning]Linux image built with tag: ${{ variables.repoImageName }}:$(linuxImagetag)" docker pull ${{ variables.repoImageName }}:$(linuxImagetag) @@ -551,7 +561,16 @@ extends: inputs: targetType: 'inline' script: | + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + } + + if ($shouldPush) { docker push ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2019BaseImageVersion) } - task: CodeQL3000Finalize@0 @@ -759,7 +778,16 @@ extends: inputs: targetType: 'inline' script: | + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + } + + if ($shouldPush) { docker push ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2022BaseImageVersion) } - task: CodeQL3000Finalize@0 @@ -800,7 +828,16 @@ extends: az account set -s ${{ variables.subscription }} az acr login -n ${{ variables.containerRegistry }} @{"image.name"="${{ variables.repoImageName }}:$(windowsImageTag)"} | ConvertTo-Json -Compress | Out-File -Encoding ascii $(Build.ArtifactStagingDirectory)/windows/metadata.json + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push multi-arch image to ACR for E2E testing" + } + + if ($shouldPush) { docker manifest create ${{ variables.repoImageName }}:$(windowsImageTag) ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2019BaseImageVersion) ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2022BaseImageVersion) docker manifest push ${{ variables.repoImageName }}:$(windowsImageTag) Write-Host "##vso[task.logissue type=warning]Windows image built with tag: ${{ variables.repoImageName }}:$(windowsImageTag)" From c3a811cba2bb1d771ddcfa9e044d8410e50a4062 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 12 Dec 2025 15:18:55 -0800 Subject: [PATCH 38/40] pin fluentd version --- kubernetes/linux/setup.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index ed037d598c..37a46d487c 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -78,6 +78,12 @@ echo "$(fluent-bit --version)" >> packages_version.txt # install fluentd fluentd_version="1.16.3" + +# Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) +if [ "$ARCH" == 
"arm64" ]; then + gem install cool.io -v "1.8.0" --no-document +fi + gem install fluentd -v $fluentd_version --no-document # remove the test directory from fluentd From ca3cb540a1ba79edace9945a9fc30bfe4ca30e52 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 12 Dec 2025 16:21:51 -0800 Subject: [PATCH 39/40] pin cool.io to 1.9.0 --- kubernetes/linux/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 37a46d487c..71bf8f38ff 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -81,7 +81,7 @@ fluentd_version="1.16.3" # Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) if [ "$ARCH" == "arm64" ]; then - gem install cool.io -v "1.8.0" --no-document + gem install cool.io -v "1.9.0" --no-document fi gem install fluentd -v $fluentd_version --no-document From bea28083a22487987b93db476b856d4256fa09b4 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 12 Dec 2025 17:15:31 -0800 Subject: [PATCH 40/40] remove cool.io install --- kubernetes/linux/setup.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 71bf8f38ff..31883f858c 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -80,9 +80,9 @@ echo "$(fluent-bit --version)" >> packages_version.txt fluentd_version="1.16.3" # Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) -if [ "$ARCH" == "arm64" ]; then - gem install cool.io -v "1.9.0" --no-document -fi +# if [ "$ARCH" == "arm64" ]; then +# gem install cool.io -v "1.9.0" --no-document +# fi gem install fluentd -v $fluentd_version --no-document