diff --git a/.pipelines/azure_pipeline_mergedbranches.yaml b/.pipelines/azure_pipeline_mergedbranches.yaml index 291772961..a578302d3 100644 --- a/.pipelines/azure_pipeline_mergedbranches.yaml +++ b/.pipelines/azure_pipeline_mergedbranches.yaml @@ -42,7 +42,15 @@ extends: customBuildTags: - ES365AIMigrationTooling stages: + # This stage will be skipped when LinuxImageOverride and WindowsImageOverride are both set + # This feature allows bypassing the build stage when using pre-built images for testing, which saves time and resources. - stage: stage + displayName: 'Build and Publish Container Images' + condition: | + or( + eq(variables['LinuxImageOverride'], ''), + eq(variables['WindowsImageOverride'], '') + ) jobs: - job: common pool: @@ -185,7 +193,17 @@ extends: docker pull mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1 docker buildx create --name dockerbuilder --driver docker-container --driver-opt image=mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1 --use docker buildx inspect --bootstrap + # Determine if we should push to ACR + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + SHOULD_PUSH="false" if [ "$(Build.Reason)" != "PullRequest" ]; then + SHOULD_PUSH="true" + elif [[ "$(System.PullRequest.SourceBranch)" == "zane/ci-agent-auto-deploy" ]] || [[ "$(System.PullRequest.SourceBranch)" == *"run-e2e"* ]]; then + SHOULD_PUSH="true" + echo "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + fi + + if [ "$SHOULD_PUSH" == "true" ]; then docker buildx build --platform $(BUILD_PLATFORMS) --tag ${{ variables.repoImageName }}:$(linuxImagetag) -f kubernetes/linux/Dockerfile.multiarch --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --build-arg IMAGE_TAG=$(linuxTelemetryTag) --build-arg GOLANG_BASE_IMAGE=$(GOLANG_BASE_IMAGE) --build-arg CI_BASE_IMAGE=$(CI_BASE_IMAGE) --push --provenance=false . 
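# Note: with buildx, --push publishes the multi-arch manifest directly from the
# builder; the pull below re-fetches it from the registry as an availability check.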
echo "##vso[task.logissue type=warning]Linux image built with tag: ${{ variables.repoImageName }}:$(linuxImagetag)" docker pull ${{ variables.repoImageName }}:$(linuxImagetag) @@ -543,7 +561,16 @@ extends: inputs: targetType: 'inline' script: | + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + } + + if ($shouldPush) { docker push ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2019BaseImageVersion) } - task: CodeQL3000Finalize@0 @@ -751,7 +778,16 @@ extends: inputs: targetType: 'inline' script: | + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push image to ACR for E2E testing" + } + + if ($shouldPush) { docker push ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2022BaseImageVersion) } - task: CodeQL3000Finalize@0 @@ -792,7 +828,16 @@ extends: az account set -s ${{ variables.subscription }} az acr login -n ${{ variables.containerRegistry }} @{"image.name"="${{ variables.repoImageName }}:$(windowsImageTag)"} | ConvertTo-Json -Compress | Out-File -Encoding ascii $(Build.ArtifactStagingDirectory)/windows/metadata.json + # Push when: NOT a PR, OR when PR is from specific branches (zane/ci-agent-auto-deploy or branches containing 'run-e2e') + $shouldPush = $false if ("$(Build.Reason)" -ne "PullRequest") { + $shouldPush = $true + } elseif ("$(System.PullRequest.SourceBranch)" -eq "zane/ci-agent-auto-deploy" -or "$(System.PullRequest.SourceBranch)" -like "*run-e2e*") { + $shouldPush = $true + Write-Host "PR from branch $(System.PullRequest.SourceBranch) - will push multi-arch image to ACR for E2E testing" + } + + if ($shouldPush) { docker manifest create ${{ variables.repoImageName }}:$(windowsImageTag) ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2019BaseImageVersion) ${{ variables.repoImageName }}:$(windowsImageTag)-$(windows2022BaseImageVersion) docker manifest push ${{ variables.repoImageName }}:$(windowsImageTag) Write-Host "##vso[task.logissue type=warning]Windows image built with tag: ${{ variables.repoImageName }}:$(windowsImageTag)" @@ -880,4 +925,70 @@ extends: ScanType: CustomScan FileDirPath: '$(Build.ArtifactStagingDirectory)' DisableRemediation: false - AcceptableOutdatedSignatureInHours: 72 \ No newline at end of file + AcceptableOutdatedSignatureInHours: 72 + + - stage: Deploy_and_Test_Images_In_Dev_Clusters + displayName: Deploy and Test Images in Dev Clusters + lockBehavior: sequential + dependsOn: + - stage + # Deploy runs when Build succeeds OR when Build is skipped with valid overrides + # This stage runs when: + # 1. Direct push to ci_prod or zane/ci-agent-auto-deploy or branches containing 'run-e2e' + # 2. 
PR from zane/ci-agent-auto-deploy branch OR PR from branch containing 'run-e2e' + condition: | + and( + or( + eq(variables['Build.SourceBranch'], 'refs/heads/ci_prod'), + eq(variables['Build.SourceBranch'], 'refs/heads/zane/ci-agent-auto-deploy'), + contains(variables['Build.SourceBranch'], 'run-e2e'), + and( + eq(variables['Build.Reason'], 'PullRequest'), + or( + eq(variables['System.PullRequest.SourceBranch'], 'zane/ci-agent-auto-deploy'), + contains(variables['System.PullRequest.SourceBranch'], 'run-e2e') + ) + ) + ), + or( + eq(dependencies.stage.result, 'Succeeded'), + and( + eq(dependencies.stage.result, 'Skipped'), + ne(variables['LinuxImageOverride'], ''), + ne(variables['WindowsImageOverride'], '') + ) + ) + ) + variables: + # Use images built from previous build stage by default + # To override: Set pipeline variables 'LinuxImageOverride' and 'WindowsImageOverride' when queuing + linuxImageTagUnderTest: $[coalesce(variables['LinuxImageOverride'], stageDependencies.stage.common.outputs['setup.linuxImagetag'])] + windowsImageTagUnderTest: $[coalesce(variables['WindowsImageOverride'], stageDependencies.stage.common.outputs['setup.windowsImageTag'])] + jobs: + # TODO: gradually add more clusters from test automation framework when the tests are stable + # TODO: TeamsWebhookUri to be added + # Cluster 1: zane-test Cluster + - template: /.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml@self + parameters: + clusterName: 'zane-test' + resourceGroup: 'zane-test' + azureSubscription: 'ContainerInsights_Build_Subscription_CI' + environmentName: 'CI-Agent-Dev' + linuxImageTag: $(linuxImageTagUnderTest) + windowsImageTag: $(windowsImageTagUnderTest) + azureClientId: $(AksZaneTestClientId) + azureTenantId: $(AzureZaneTestTenantId) + teamsWebhookUri: $(TeamsWebhookUri) + + # Cluster 2: zane-test2 Cluster + - template: /.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml@self + parameters: + clusterName: 'zane-test2' + resourceGroup: 'zane-test' + azureSubscription: 'ContainerInsights_Build_Subscription_CI' + environmentName: 'CI-Agent-Dev2' + linuxImageTag: $(linuxImageTagUnderTest) + windowsImageTag: $(windowsImageTagUnderTest) + azureClientId: $(AksZaneTest2ClientId) + azureTenantId: $(AzureZaneTestTenantId) + teamsWebhookUri: $(TeamsWebhookUri) diff --git a/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml new file mode 100644 index 000000000..f25bdc793 --- /dev/null +++ b/.pipelines/e2e-test/azure-template-deploy-and-test-ci-image-in-aks-cluster.yml @@ -0,0 +1,306 @@ +parameters: +- name: clusterName + type: string +- name: resourceGroup + type: string +- name: azureSubscription + type: string + default: 'ContainerInsights_Build_Subscription_CI' +- name: environmentName + type: string +- name: linuxImageTag + type: string +- name: windowsImageTag + type: string +- name: azureClientId + type: string +- name: azureTenantId + type: string +- name: teamsWebhookUri + type: string + default: '$(TeamsWebhookUri)' +- name: additionalTestParams + type: string + default: '' + +jobs: +- deployment: Deploy_${{ replace(parameters.clusterName, '-', '_') }} + displayName: 'Deploy & Test: ${{ parameters.clusterName }}' + environment: ${{ parameters.environmentName }} + pool: + name: Azure-Pipelines-CI-Test-EO + image: ci-1es-managed-ubuntu-2204 + os: linux + variables: + skipComponentGovernanceDetection: true + strategy: + runOnce: + deploy: 
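+      # runOnce/deploy executes these steps once per pipeline run against the
+      # environment resource declared above, so deployments are tracked per cluster.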
+      steps:
+      # Log deployment start
+      - bash: |
+          set -euo pipefail
+
+          echo "========================================="
+          echo "CLUSTER DEPLOYMENT STARTING"
+          echo "========================================="
+          echo "Cluster: ${{ parameters.clusterName }}"
+          echo "Environment: ${{ parameters.environmentName }}"
+          echo "Build ID: $(Build.BuildId)"
+          echo "Pipeline Run: $(Build.BuildNumber)"
+          echo ""
+          echo "✓ Sequential deployment locking enabled at stage level"
+          echo "✓ Multiple pipeline runs will execute sequentially"
+          echo "========================================="
+        displayName: 'Deployment Start'
+
+      - checkout: self
+        persistCredentials: true
+
+      - script: |
+          set -euo pipefail
+          echo "Ensuring kubectl & helm are installed"
+          if ! command -v kubectl >/dev/null 2>&1; then
+            echo "Installing kubectl"
+            sudo az aks install-cli
+          else
+            echo "kubectl already installed: $(kubectl version --client --short || true)"
+          fi
+          if ! command -v helm >/dev/null 2>&1; then
+            echo "Installing Helm 3"
+            curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+          else
+            echo "Helm already installed: $(helm version --short || true)"
+          fi
+        displayName: 'Install kubectl and Helm'
+
+      - task: AzureCLI@2
+        displayName: 'Get credentials for ${{ parameters.clusterName }}'
+        inputs:
+          azureSubscription: ${{ parameters.azureSubscription }}
+          scriptLocation: 'inlineScript'
+          scriptType: 'bash'
+          inlineScript: 'az aks get-credentials -g ${{ parameters.resourceGroup }} -n ${{ parameters.clusterName }}'
+
+      # Determine MCR repository paths based on image tags.
+      - task: Bash@3
+        name: DetermineMcrRepo
+        displayName: 'Determine MCR Repository Paths'
+        env:
+          LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }}
+          WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }}
+        inputs:
+          targetType: 'inline'
+          script: |
+            # Function to determine registry path based on image tag
+            # CI dev builds contain git hash pattern (e.g., -gbdc2f3f42-20250701203056)
+            # Production releases are simple versions (e.g., 3.1.32)
+            get_mcr_repo() {
+              local image_tag="$1"
+              if [[ "$image_tag" =~ -g[a-f0-9]+-[0-9]+ ]]; then
+                echo "mcr.microsoft.com/azuremonitor/containerinsights/cidev"
+              else
+                echo "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
+              fi
+            }
+
+            LINUX_MCR_REPO=$(get_mcr_repo "$LINUX_IMAGE_TAG")
+            WINDOWS_MCR_REPO=$(get_mcr_repo "$WINDOWS_IMAGE_TAG")
+
+            echo "Repository Path Detection:"
+            echo "  Linux image tag: $LINUX_IMAGE_TAG"
+            echo "  → Linux MCR repo: $LINUX_MCR_REPO"
+            echo "  Windows image tag: $WINDOWS_IMAGE_TAG"
+            echo "  → Windows MCR repo: $WINDOWS_MCR_REPO"
+
+            # Export for subsequent steps
+            echo "##vso[task.setvariable variable=linuxMcrRepo;isOutput=true]$LINUX_MCR_REPO"
+            echo "##vso[task.setvariable variable=windowsMcrRepo;isOutput=true]$WINDOWS_MCR_REPO"
+
+      # TODO: consider using a Helm chart when it is ready for AKS deployment
+      - task: Bash@3
+        displayName: 'Patch ama-logs pods with new images'
+        env:
+          LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }}
+          WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }}
+          LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo)
+          WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo)
+        inputs:
+          targetType: 'inline'
+          script: |
+            echo "Deploying to cluster: ${{ parameters.clusterName }}"
+            echo "  Linux image: $LINUX_MCR_REPO:$LINUX_IMAGE_TAG"
+            echo "  Windows image: $WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG"
+            echo ""
+            echo "Finding and patching ama-logs pods in kube-system namespace..."
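+            # Note: containers[].image is one of the few pod-spec fields that can be
+            # patched in place, so kubelet restarts only the patched container with the
+            # new image. The owning DaemonSet/Deployment template is not modified, so a
+            # pod recreated by its controller reverts to the previously configured image.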
+ + kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}' | while read pod_name; do + echo "Processing pod: $pod_name" + + if [[ "$pod_name" =~ ^ama-logs-windows ]]; then + IMG_URL="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG" + container_name="ama-logs-windows" + elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then + IMG_URL="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG" + container_name="ama-logs" + else + echo " ⚠ Unknown pod pattern: $pod_name - skipping" + continue + fi + + echo " → Patching with image: $IMG_URL (container: $container_name)" + + kubectl patch pod "$pod_name" -n kube-system \ + --patch "{\"spec\": {\"containers\": [{\"name\": \"$container_name\", \"image\": \"$IMG_URL\"}]}}" \ + && echo " ✓ Successfully patched $pod_name" \ + || echo " ✗ Failed to patch $pod_name" + done + + echo "" + echo "Pod patching complete!" + echo "Current UTC time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" + echo "Current ama-logs pods:" + kubectl get pods -n kube-system | grep ama-logs + + # Wait for Kubernetes API to update and pods to begin restarting + - task: Bash@3 + displayName: 'Wait for pod patch to propagate (5 minutes)' + inputs: + targetType: 'inline' + script: | + echo "========================================" + echo "Waiting for Pod Patch Propagation" + echo "========================================" + echo "" + echo "Waiting 5 minutes" + echo "" + + wait_time=300 + interval=30 + elapsed=0 + + while [ $elapsed -lt $wait_time ]; do + remaining=$((wait_time - elapsed)) + echo "⏳ Waiting... ($elapsed/$wait_time seconds elapsed, $remaining seconds remaining)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "" + echo "✓ Wait complete! Now checking actual pod readiness status..." + echo "========================================" + + # Pre-test verification: Wait for pods to be ready with new images + - task: Bash@3 + displayName: 'Pre-Test: Wait for pods to be ready with new images' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-before-test.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-before-test.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Capture container start times for Log Analytics query filtering + - task: Bash@3 + name: CaptureStartTime + displayName: 'Capture container start times for Log Analytics filtering' + env: + LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }} + WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }} + LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo) + WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo) + inputs: + targetType: 'inline' + script: | + chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/capture-container-start-time.sh + $(Build.SourcesDirectory)/.pipelines/e2e-test/capture-container-start-time.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO" + + # Export container start time for use in tests + if [ -f /tmp/container-deployment-time.env ]; then + source /tmp/container-deployment-time.env + echo "Container start time captured: $CONTAINER_START_TIME" + echo "##vso[task.setvariable variable=CONTAINER_START_TIME;isOutput=true]$CONTAINER_START_TIME" + else + echo "ERROR: Container start time not found at 
/tmp/container-deployment-time.env"
+              echo "This is required for Log Analytics query filtering"
+              exit 1
+            fi
+
+      - task: Bash@3
+        displayName: 'Wait for logs to be ingested into Log Analytics (20 min)'
+        inputs:
+          targetType: 'inline'
+          script: |
+            echo "========================================"
+            echo "Waiting for Log Analytics Ingestion"
+            echo "========================================"
+            echo "Cluster: ${{ parameters.clusterName }}"
+            echo "Container start time: $(CaptureStartTime.CONTAINER_START_TIME)"
+            echo ""
+            echo "Waiting 20 minutes to allow logs to be ingested..."
+            echo "This ensures queries will find logs from the newly deployed containers."
+            echo ""
+
+            wait_time=1200  # 20 minutes
+            interval=60
+            elapsed=0
+
+            while [ $elapsed -lt $wait_time ]; do
+              remaining=$((wait_time - elapsed))
+              minutes_elapsed=$((elapsed / 60))
+              minutes_remaining=$((remaining / 60))
+              echo "⏳ Waiting... ($minutes_elapsed/$((wait_time / 60)) minutes elapsed, $minutes_remaining minutes remaining)"
+              sleep $interval
+              elapsed=$((elapsed + interval))
+            done
+
+            echo ""
+            echo "✓ Wait complete! Logs should now be available in Log Analytics."
+            echo "✓ Tests will query logs with filter: TimeGenerated > datetime('$(CaptureStartTime.CONTAINER_START_TIME)')"
+            echo "========================================"
+      # TODO (improvement): the container start time captured above is exported but not yet consumed by the tests. Consider passing it to the test script for use in log queries.
+      - bash: |
+          # Pass container start time to tests
+          export CONTAINER_START_TIME="$(CaptureStartTime.CONTAINER_START_TIME)"
+          echo "Running tests for cluster: ${{ parameters.clusterName }}"
+          echo "Container start time: $CONTAINER_START_TIME"
+
+          chmod +x ./install-and-execute-testkube-tests.sh
+          ./install-and-execute-testkube-tests.sh \
+            AzureClientId=${{ parameters.azureClientId }} \
+            AzureTenantId=${{ parameters.azureTenantId }} \
+            TeamsWebhookUri=${{ parameters.teamsWebhookUri }} \
+            ${{ parameters.additionalTestParams }}
+        workingDirectory: $(Build.SourcesDirectory)/test/testkube/
+        displayName: 'Install Testkube and run E2E tests'
+
+      # Post-test verification: Check pods are still healthy after test execution
+      - task: Bash@3
+        displayName: 'Post-Test: Verify pods remained stable after tests'
+        condition: always()
+        env:
+          LINUX_IMAGE_TAG: ${{ parameters.linuxImageTag }}
+          WINDOWS_IMAGE_TAG: ${{ parameters.windowsImageTag }}
+          LINUX_MCR_REPO: $(DetermineMcrRepo.linuxMcrRepo)
+          WINDOWS_MCR_REPO: $(DetermineMcrRepo.windowsMcrRepo)
+        inputs:
+          targetType: 'inline'
+          script: |
+            chmod +x $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-after-test.sh
+            $(Build.SourcesDirectory)/.pipelines/e2e-test/verify-ci-images-after-test.sh "$LINUX_IMAGE_TAG" "$WINDOWS_IMAGE_TAG" "$LINUX_MCR_REPO" "$WINDOWS_MCR_REPO"
+
+      # Log deployment completion
+      - bash: |
+          echo "========================================="
+          echo "DEPLOYMENT COMPLETE"
+          echo "========================================="
+          echo "Cluster: ${{ parameters.clusterName }}"
+          echo "Build ID: $(Build.BuildId)"
+          echo "✓ Deployment finished for: ${{ parameters.clusterName }}"
+          echo "========================================="
+        displayName: 'Deployment Completion'
+        condition: always()
diff --git a/.pipelines/e2e-test/capture-container-start-time.sh b/.pipelines/e2e-test/capture-container-start-time.sh
new file mode 100644
index 000000000..b3c9b4322
--- /dev/null
+++ b/.pipelines/e2e-test/capture-container-start-time.sh
@@ -0,0
+1,102 @@
+#!/bin/bash
+# Capture Container Start Times
+# Captures the LATEST container start time across all ama-logs pods
+# This is used to filter Log Analytics queries to only show logs from the newly deployed containers
+
+set -e
+
+# Parse command line arguments
+LINUX_IMAGE_TAG="${1}"
+WINDOWS_IMAGE_TAG="${2}"
+LINUX_MCR_REPO="${3}"
+WINDOWS_MCR_REPO="${4}"
+
+if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then
+  echo "Error: Missing required parameters"
+  echo "Usage: $0 <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
+  exit 1
+fi
+
+LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG"
+WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG"
+
+# Source shared functions
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/util.sh"
+
+echo "================================"
+echo "Container Start Time Capture"
+echo "================================"
+echo "Capturing LATEST container start time for Log Analytics queries..."
+echo ""
+
+
+# Build pod configurations using shared function
+declare -a pod_configs
+build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE"
+
+if [ ${#pod_configs[@]} -eq 0 ]; then
+  echo "✗ ERROR: No pods found!"
+  exit 1
+fi
+
+latest_start_time=""
+
+for config in "${pod_configs[@]}"; do
+  IFS='|' read -r pod_name expected_image container_name <<< "$config"
+
+  # Get container start time for the specific container
+  start_time=$(kubectl get pod "$pod_name" -n kube-system \
+    -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].state.running.startedAt}" 2>/dev/null || echo "")
+
+  if [ -n "$start_time" ]; then
+    echo "  Pod $pod_name (container: $container_name) started at: $start_time"
+
+    # Track LATEST time (lexicographically later in ISO 8601 format)
+    if [ -z "$latest_start_time" ] || [[ "$start_time" > "$latest_start_time" ]]; then
+      latest_start_time="$start_time"
+    fi
+  else
+    echo "✗ ERROR: Could not determine container start time for pod $pod_name (container: $container_name)"
+    echo "This is required for Log Analytics query filtering"
+    exit 1
+  fi
+done
+
+if [ -n "$latest_start_time" ]; then
+  # Validate that start time is recent (within last 30 minutes)
+  # This ensures we captured the newly deployed containers, not old ones
+  current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+  current_epoch=$(date -u -d "$current_time" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$current_time" +%s 2>/dev/null)
+  start_epoch=$(date -u -d "$latest_start_time" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$latest_start_time" +%s 2>/dev/null)
+  time_diff=$((current_epoch - start_epoch))
+  time_diff_minutes=$((time_diff / 60))
+
+  echo ""
+  echo "Time validation:"
+  echo "  Current UTC time: $current_time"
+  echo "  Latest start time: $latest_start_time"
+  echo "  Time difference: $time_diff_minutes minutes ago"
+
+  if [ $time_diff_minutes -gt 30 ]; then
+    echo ""
+    echo "⚠ WARNING: Container start time is $time_diff_minutes minutes old!"
+    echo "This suggests the containers may not have been restarted with the new images."
+    echo "Expected: Within ~2-5 minutes (time for pods to restart after patching)"
+    echo "Consider investigating if the image patch actually triggered pod restarts."
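+    # Execution continues past this warning: the timestamp is still exported below,
+    # and downstream queries simply get a wider window, e.g. (illustrative KQL):
+    #   ContainerLogV2 | where TimeGenerated > datetime('<latest_start_time>')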
+  else
+    echo "  ✓ Start time is recent (within expected range)"
+  fi
+
+  # Export for use in tests
+  echo "CONTAINER_START_TIME=$latest_start_time" > /tmp/container-deployment-time.env
+  echo ""
+  echo "✓ LATEST container start time: $latest_start_time"
+  echo "✓ Saved to /tmp/container-deployment-time.env"
+  echo "✓ Log Analytics queries should filter: TimeGenerated > datetime('$latest_start_time')"
+  echo ""
+  exit 0
+else
+  echo "✗ ERROR: Could not determine container start times"
+  exit 1
+fi
diff --git a/.pipelines/e2e-test/util.sh b/.pipelines/e2e-test/util.sh
new file mode 100644
index 000000000..b157451c5
--- /dev/null
+++ b/.pipelines/e2e-test/util.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Shared functions for pod verification scripts
+# This file should be sourced by pre-test and post-test verification scripts
+
+# Function to build pod configurations
+# Parameters:
+#   $1 - LINUX_IMAGE (full image path with tag)
+#   $2 - WINDOWS_IMAGE (full image path with tag)
+# Returns:
+#   pod_configs array populated with "pod_name|expected_image|container_name"
+build_pod_configs() {
+  local LINUX_IMAGE="$1"
+  local WINDOWS_IMAGE="$2"
+
+  echo "Getting list of ama-logs pods..."
+  local pod_list=$(kubectl get pods -n kube-system --no-headers | grep ama-logs | awk '{print $1}')
+
+  # Clear the global pod_configs array
+  pod_configs=()
+
+  for pod_name in $pod_list; do
+    local expected_image
+    local container_name
+
+    # Determine expected image and container name based on pod type
+    if [[ "$pod_name" =~ ^ama-logs-windows ]]; then
+      expected_image="$WINDOWS_IMAGE"
+      container_name="ama-logs-windows"
+    elif [[ "$pod_name" =~ ^ama-logs-rs ]] || [[ "$pod_name" =~ ^ama-logs-[a-z0-9]{5}$ ]]; then
+      expected_image="$LINUX_IMAGE"
+      container_name="ama-logs"
+    else
+      echo "✗ ERROR: Unknown pod pattern: $pod_name"
+      echo "Expected pod names to match one of:"
+      echo "  - ama-logs-windows-* (Windows pods)"
+      echo "  - ama-logs-rs-* (Linux ReplicaSet pods)"
+      echo "  - ama-logs-xxxxx (Linux DaemonSet pods, 5 alphanumeric chars)"
+      exit 1
+    fi
+
+    pod_configs+=("$pod_name|$expected_image|$container_name")
+  done
+
+  echo "Found ${#pod_configs[@]} pods to verify"
+  echo ""
+}
diff --git a/.pipelines/e2e-test/verify-ci-images-after-test.sh b/.pipelines/e2e-test/verify-ci-images-after-test.sh
new file mode 100644
index 000000000..d3e5cf04a
--- /dev/null
+++ b/.pipelines/e2e-test/verify-ci-images-after-test.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Post-Test Pod Verification
+# Performs a quick health check to ensure pods maintained correct images and are still healthy
+# This script is used AFTER running E2E tests to detect any pod restarts or issues during testing
+
+set -e
+
+# Parse command line arguments
+LINUX_IMAGE_TAG="${1}"
+WINDOWS_IMAGE_TAG="${2}"
+LINUX_MCR_REPO="${3}"
+WINDOWS_MCR_REPO="${4}"
+
+if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then
+  echo "Error: Missing required parameters"
+  echo "Usage: $0 <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
+  exit 1
+fi
+
+LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG"
+WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG"
+
+# Source shared functions
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/util.sh"
+
+echo "================================"
+echo "Post-Test Pod Verification"
+echo "================================"
+echo "Verifying pods maintained correct images and are still healthy..."
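+# Unlike the pre-test script, this is a single-pass check with no retries: any pod
+# that restarted or changed image while the tests ran is reported immediately.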
+echo "" +echo "Repository Configuration:" +echo " Linux MCR repo: $LINUX_MCR_REPO" +echo " Windows MCR repo: $WINDOWS_MCR_REPO" +echo "" +echo "Expected Images:" +echo " Linux image: $LINUX_IMAGE" +echo " Windows image: $WINDOWS_IMAGE" +echo "" + +# Build pod configurations using shared function +declare -a pod_configs +build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE" + +# Perform instant health check on all pods +echo "Performing instant health check on all pods..." +echo "" + +declare -a issues +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + + # Get pod details + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + echo "Check pod: $pod_name" + echo " Container: $container_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + + # Check for any issues + has_issue=false + + if [[ "$current_image" != "$expected_image" ]]; then + echo " ✗ IMAGE MISMATCH!" + issues+=("$pod_name: expected image '$expected_image' but found '$current_image'") + has_issue=true + fi + + if [[ "$pod_status" != "Running" ]]; then + echo " ✗ POD NOT RUNNING!" + issues+=("$pod_name: pod status is '$pod_status' (expected 'Running')") + has_issue=true + fi + + if [[ "$container_ready" != "true" ]]; then + echo " ✗ CONTAINER NOT READY!" + issues+=("$pod_name: container '$container_name' is not ready") + has_issue=true + fi + + if [[ "$has_issue" = false ]]; then + echo " ✓ Pod: $pod_name passed checks" + fi + echo "" +done + +# Report results +echo "================================" +echo "Post-Test Verification Summary" +echo "================================" + +if [ ${#issues[@]} -eq 0 ]; then + echo "✓ SUCCESS: All pods maintained the correct images and are healthy!" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 0 +else + echo "✗ FAILURE: Some pods have issues after test execution!" + echo "" + echo "Issues detected:" + printf ' - %s\n' "${issues[@]}" + echo "" + echo "This indicates the pods may have been restarted or updated during testing." + echo "This could cause test instability or false results." 
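+  # Exiting non-zero fails this step (which runs with condition: always()), so
+  # instability during the test window is surfaced even if the tests passed.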
+  echo ""
+  echo "Current pod status:"
+  kubectl get pods -n kube-system | grep ama-logs
+  echo ""
+  echo "Detailed pod information:"
+  for issue in "${issues[@]}"; do
+    pod=$(echo "$issue" | cut -d: -f1)
+    echo ""
+    echo "--- Details for $pod ---"
+    kubectl describe pod "$pod" -n kube-system | grep -A 20 "Events:" || kubectl describe pod "$pod" -n kube-system | tail -30
+  done
+  exit 1
+fi
diff --git a/.pipelines/e2e-test/verify-ci-images-before-test.sh b/.pipelines/e2e-test/verify-ci-images-before-test.sh
new file mode 100644
index 000000000..d0f4a4f25
--- /dev/null
+++ b/.pipelines/e2e-test/verify-ci-images-before-test.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+# Pre-Test Pod Verification
+# Waits for all ama-logs pods to be running with the correct images and ready
+# This script is used BEFORE running E2E tests to ensure the new agent version is deployed
+
+set -e
+
+# Parse command line arguments
+LINUX_IMAGE_TAG="${1}"
+WINDOWS_IMAGE_TAG="${2}"
+LINUX_MCR_REPO="${3}"
+WINDOWS_MCR_REPO="${4}"
+
+if [ -z "$LINUX_IMAGE_TAG" ] || [ -z "$WINDOWS_IMAGE_TAG" ] || [ -z "$LINUX_MCR_REPO" ] || [ -z "$WINDOWS_MCR_REPO" ]; then
+  echo "Error: Missing required parameters"
+  echo "Usage: $0 <linux_image_tag> <windows_image_tag> <linux_mcr_repo> <windows_mcr_repo>"
+  exit 1
+fi
+
+LINUX_IMAGE="$LINUX_MCR_REPO:$LINUX_IMAGE_TAG"
+WINDOWS_IMAGE="$WINDOWS_MCR_REPO:$WINDOWS_IMAGE_TAG"
+
+# Configuration
+MAX_RETRIES=15
+CHECK_INTERVAL=60 # seconds
+MAX_WAIT_MINUTES=$((MAX_RETRIES * CHECK_INTERVAL / 60))
+
+# Source shared functions
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/util.sh"
+
+echo "================================"
+echo "Pre-Test Pod Verification"
+echo "================================"
+echo "Waiting for pods to be running with new images and ready..."
+echo ""
+echo "Repository Configuration:"
+echo "  Linux MCR repo: $LINUX_MCR_REPO"
+echo "  Windows MCR repo: $WINDOWS_MCR_REPO"
+echo ""
+echo "Expected Images:"
+echo "  Linux image: $LINUX_IMAGE"
+echo "  Windows image: $WINDOWS_IMAGE"
+echo ""
+
+# Build pod configurations using shared function
+declare -a pod_configs
+build_pod_configs "$LINUX_IMAGE" "$WINDOWS_IMAGE"
+
+# Validate array was populated
+if [ ${#pod_configs[@]} -eq 0 ]; then
+  echo "✗ ERROR: No pods found to verify!"
+  echo "This likely means no ama-logs pods exist in the kube-system namespace."
+  exit 1
+fi
+
+# Wait for all pods to be ready
+echo "================================"
+echo "Waiting for all pods to be ready"
+echo "================================"
+echo "Total pods to check: ${#pod_configs[@]}"
+echo "Maximum retries: $MAX_RETRIES"
+echo "Check interval: ${CHECK_INTERVAL}s"
+echo "Maximum wait time: $MAX_WAIT_MINUTES minutes"
+echo ""
+
+# Track ready status for each pod
+declare -A pod_ready_status
+for config in "${pod_configs[@]}"; do
+  pod_name=$(echo "$config" | cut -d'|' -f1)
+  pod_ready_status["$pod_name"]=false
+done
+
+attempt=1
+while [ $attempt -le $MAX_RETRIES ]; do
+  has_not_ready_pod=false
+  ready_count=0
+  total_count=${#pod_configs[@]}
+
+  # Check each pod
+  for config in "${pod_configs[@]}"; do
+    IFS='|' read -r pod_name expected_image container_name <<< "$config"
+    echo ""
+    echo ""
+    echo "  Start checking pod: $pod_name"
+    echo "  Container: $container_name"
+    echo "  Expected image: $expected_image"
+
+    # Skip if already marked as ready
+    if [ "${pod_ready_status[$pod_name]}" = "true" ]; then
+      echo "  Finished checking pod: $pod_name"
+      echo "  Pod: $pod_name has expected image ready. Skipping check."
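+      # pod_ready_status caches each pass, so only pods still pending are re-queried
+      # on later attempts; already-ready pods just count toward the total below.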
+      echo "  ✓ $pod_name - Ready"
+      ready_count=$((ready_count + 1))
+      continue
+    fi
+
+    # Get pod details
+    current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "")
+    pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown")
+    container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false")
+
+    # Check if pod is ready
+    if [[ "$current_image" == "$expected_image" ]] && [[ "$pod_status" == "Running" ]] && [[ "$container_ready" == "true" ]]; then
+      pod_ready_status["$pod_name"]=true
+      ready_count=$((ready_count + 1))
+      echo "  Finished checking pod: $pod_name"
+      echo "    Image: $current_image"
+      echo "    Expected image: $expected_image"
+      echo "    Status: $pod_status"
+      echo "    Container ready: $container_ready"
+      echo "    ✓ $pod_name - Ready"
+    else
+      has_not_ready_pod=true
+      echo "  Finished checking pod: $pod_name"
+      echo "    ⏳ $pod_name - Waiting (Status: $pod_status, Container ready: $container_ready)"
+      if [[ "$current_image" != "$expected_image" ]]; then
+        echo "    Image mismatch: expected $expected_image, got $current_image"
+      fi
+      echo "    ✗ $pod_name - NOT Ready"
+    fi
+  done
+
+  # Show progress summary
+  elapsed_seconds=$(((attempt - 1) * CHECK_INTERVAL))
+  minutes_elapsed=$((elapsed_seconds / 60))
+  seconds_elapsed=$((elapsed_seconds % 60))
+  remaining_retries=$((MAX_RETRIES - attempt))
+  remaining_seconds=$((remaining_retries * CHECK_INTERVAL))
+  minutes_remaining=$((remaining_seconds / 60))
+  seconds_remaining=$((remaining_seconds % 60))
+
+  echo ""
+  echo "Attempt $attempt/$MAX_RETRIES (${minutes_elapsed}m${seconds_elapsed}s elapsed, ${minutes_remaining}m${seconds_remaining}s remaining)"
+  echo "Progress: $ready_count/$total_count pods ready"
+  echo ""
+
+  # Exit early if all pods are ready
+  if [ "$has_not_ready_pod" = false ]; then
+    echo "================================"
+    echo "✓ SUCCESS: All pods are ready!"
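+    # Early exit: no need to consume the remaining retries once every pod is ready.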
+ echo "================================" + echo "Total attempts: $attempt" + echo "Total wait time: ${minutes_elapsed}m${seconds_elapsed}s" + echo "" + echo "Final pod status:" + kubectl get pods -n kube-system | grep ama-logs + exit 0 + fi + + # Sleep before next retry (except after last attempt) + if [ $attempt -lt $MAX_RETRIES ]; then + sleep $CHECK_INTERVAL + fi + + ((attempt++)) +done + +# Max retries reached - report failed pods +echo "================================" +echo "✗ TIMEOUT: Not all pods became ready after $MAX_RETRIES attempts" +echo "================================" +echo "" +echo "Failed pods:" +for config in "${pod_configs[@]}"; do + IFS='|' read -r pod_name expected_image container_name <<< "$config" + if [ "${pod_ready_status[$pod_name]}" != "true" ]; then + current_image=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.spec.containers[?(@.name=='$container_name')].image}" 2>/dev/null || echo "ERROR") + pod_status=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.phase}" 2>/dev/null || echo "Unknown") + container_ready=$(kubectl get pod "$pod_name" -n kube-system -o jsonpath="{.status.containerStatuses[?(@.name=='$container_name')].ready}" 2>/dev/null || echo "false") + + echo " ✗ $pod_name" + echo " Expected image: $expected_image" + echo " Current image: $current_image" + echo " Pod status: $pod_status" + echo " Container ready: $container_ready" + fi +done +echo "" +echo "Final pod status:" +kubectl get pods -n kube-system | grep ama-logs +exit 1 diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index ed037d598..31883f858 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -78,6 +78,12 @@ echo "$(fluent-bit --version)" >> packages_version.txt # install fluentd fluentd_version="1.16.3" + +# Pre-install cool.io to avoid ARM64 build issues (segfault during native extension compilation) +# if [ "$ARCH" == "arm64" ]; then +# gem install cool.io -v "1.9.0" --no-document +# fi + gem install fluentd -v $fluentd_version --no-document # remove the test directory from fluentd
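+# Note: the disabled cool.io block above assumes an $ARCH variable (for example,
+# derived from "uname -m" or a Docker TARGETARCH build arg and normalized to
+# "arm64") is set earlier in this script; confirm that before re-enabling it.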