From f08ffdc7d48a0cd23986bac21e1567f9c075858e Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 8 Dec 2025 12:51:33 -0500 Subject: [PATCH 1/4] feat: add support bundle analyzer for PDB eviction failures Add analyzer to detect PodDisruptionBudget-related pod eviction failures in k0scontroller logs. This helps identify when PDBs are blocking autopilot upgrades, causing cluster upgrades to hang. The analyzer: - Scans k0scontroller logs for eviction error patterns - Detects "error when evicting pods" and PDB violation messages - Provides comprehensive remediation guidance - Includes kubectl commands and step-by-step resolution steps This makes troubleshooting PDB-blocked upgrades significantly faster for support engineers and customers. --- .../support/host-support-bundle.tmpl.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml index 6de761c283..bba0f30f2d 100644 --- a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml +++ b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml @@ -434,6 +434,41 @@ spec: operationSize: 2300 datasync: true runTime: "0" # let it run to completion + - textAnalyze: + checkName: PodDisruptionBudget blocking upgrade + fileName: host-collectors/run-host/k0scontroller-logs.txt + regex: '(error when evicting pods|Cannot evict pod as it would violate the pod''s disruption budget)' + outcomes: + - fail: + when: "true" + message: |- + PodDisruptionBudgets are preventing pod eviction during cluster upgrade. + + The k0scontroller cannot evict pods, causing the autopilot upgrade process to hang. + This makes the cluster unable to complete upgrades. + + To resolve this issue: + + 1. List all PodDisruptionBudgets in the cluster: + kubectl get pdb -A + + 2. Identify which PDBs are blocking eviction. 
Look for PDBs with: + - minAvailable set too high (not allowing enough pods to be evicted) + - maxUnavailable set too low (too restrictive) + + 3. Temporarily adjust or remove the problematic PDBs: + kubectl delete pdb <pdb-name> -n <namespace> + + Or edit to allow more disruption: + kubectl edit pdb <pdb-name> -n <namespace> + + 4. After the upgrade completes, restore the PDBs if needed. + + WARNING: Modifying PDBs reduces availability guarantees. Coordinate with + application owners and consider scheduling during maintenance windows. + - pass: + when: "false" + message: No PodDisruptionBudget eviction failures detected - run: collectorName: "localhost-ips" command: "sh" From 55462d5d337f5ee5ed4c65328b2ccdd4355d597f Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 8 Dec 2025 12:57:34 -0500 Subject: [PATCH 2/4] move new analyzer to hostAnalyzers section --- .../support/host-support-bundle.tmpl.yaml | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml index bba0f30f2d..de1c05533e 100644 --- a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml +++ b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml @@ -197,7 +197,7 @@ spec: - run: collectorName: curl-replicated-app command: sh - args: + args: - -c - | if [ -n "{{ .HTTPSProxy }}" ]; then @@ -398,7 +398,7 @@ spec: - -c - | pat='(clamav|sophos|esets_daemon|fsav|symantec|mfend|ds_agent|kav|bdagent|s1agent|falcon|illumio|xagt|wdavdaemon|mdatp)' - + if command -v pgrep >/dev/null 2>&1; then pgrep -afi "$pat" else @@ -434,41 +434,6 @@ spec: operationSize: 2300 datasync: true runTime: "0" # let it run to completion - - textAnalyze: - checkName: PodDisruptionBudget blocking upgrade - fileName: host-collectors/run-host/k0scontroller-logs.txt - regex: '(error when evicting pods|Cannot evict pod as it would violate the pod''s disruption budget)' - outcomes: - - fail: - when: "true" - message: |- -
PodDisruptionBudgets are preventing pod eviction during cluster upgrade. - - The k0scontroller cannot evict pods, causing the autopilot upgrade process to hang. - This makes the cluster unable to complete upgrades. - - To resolve this issue: - - 1. List all PodDisruptionBudgets in the cluster: - kubectl get pdb -A - - 2. Identify which PDBs are blocking eviction. Look for PDBs with: - - minAvailable set too high (not allowing enough pods to be evicted) - - maxUnavailable set too low (too restrictive) - - 3. Temporarily adjust or remove the problematic PDBs: - kubectl delete pdb <pdb-name> -n <namespace> - - Or edit to allow more disruption: - kubectl edit pdb <pdb-name> -n <namespace> - - 4. After the upgrade completes, restore the PDBs if needed. - - WARNING: Modifying PDBs reduces availability guarantees. Coordinate with - application owners and consider scheduling during maintenance windows. - - pass: - when: "false" - message: No PodDisruptionBudget eviction failures detected - run: collectorName: "localhost-ips" command: "sh" @@ -768,6 +733,41 @@ spec: - pass: when: "false" message: "No signs of hostname changes found" + - textAnalyze: + checkName: PodDisruptionBudget blocking upgrade + fileName: host-collectors/run-host/k0scontroller-logs.txt + regex: '(error when evicting pods|Cannot evict pod as it would violate the pod''s disruption budget)' + outcomes: + - fail: + when: "true" + message: |- + PodDisruptionBudgets are preventing pod eviction during cluster upgrade. + + The k0scontroller cannot evict pods, causing the autopilot upgrade process to hang. + This makes the cluster unable to complete upgrades. + + To resolve this issue: + + 1. List all PodDisruptionBudgets in the cluster: + kubectl get pdb -A + + 2. Identify which PDBs are blocking eviction. Look for PDBs with: + - minAvailable set too high (not allowing enough pods to be evicted) + - maxUnavailable set too low (too restrictive) + + 3.
Temporarily adjust or remove the problematic PDBs: + kubectl delete pdb <pdb-name> -n <namespace> + + Or edit to allow more disruption: + kubectl edit pdb <pdb-name> -n <namespace> + + 4. After the upgrade completes, restore the PDBs if needed. + + WARNING: Modifying PDBs reduces availability guarantees. Coordinate with + application owners and consider scheduling during maintenance windows. + - pass: + when: "false" + message: No PodDisruptionBudget eviction failures detected - textAnalyze: checkName: Check if localhost resolves to 127.0.0.1 fileName: host-collectors/run-host/localhost-ips.txt From fd383aec770f1d85c5ba07907f3920ea06c1b842 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 8 Dec 2025 12:59:05 -0500 Subject: [PATCH 3/4] remove extra whitespace --- cmd/installer/goods/support/host-support-bundle.tmpl.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml index de1c05533e..9e04df243d 100644 --- a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml +++ b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml @@ -398,7 +398,6 @@ spec: - -c - | pat='(clamav|sophos|esets_daemon|fsav|symantec|mfend|ds_agent|kav|bdagent|s1agent|falcon|illumio|xagt|wdavdaemon|mdatp)' - if command -v pgrep >/dev/null 2>&1; then pgrep -afi "$pat" else From 442d2e23bfcf549a6e90374b5afdefb096b5b60e Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 8 Dec 2025 13:03:50 -0500 Subject: [PATCH 4/4] fix indentation --- .../support/host-support-bundle.tmpl.yaml | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml index 9e04df243d..de0e192ca9 100644 --- a/cmd/installer/goods/support/host-support-bundle.tmpl.yaml +++ b/cmd/installer/goods/support/host-support-bundle.tmpl.yaml @@ -737,36 +737,36 @@ spec: fileName:
host-collectors/run-host/k0scontroller-logs.txt regex: '(error when evicting pods|Cannot evict pod as it would violate the pod''s disruption budget)' outcomes: - - fail: - when: "true" - message: |- - PodDisruptionBudgets are preventing pod eviction during cluster upgrade. + - fail: + when: "true" + message: |- + PodDisruptionBudgets are preventing pod eviction during cluster upgrade. - The k0scontroller cannot evict pods, causing the autopilot upgrade process to hang. - This makes the cluster unable to complete upgrades. + The k0scontroller cannot evict pods, causing the autopilot upgrade process to hang. + This makes the cluster unable to complete upgrades. - To resolve this issue: + To resolve this issue: - 1. List all PodDisruptionBudgets in the cluster: - kubectl get pdb -A + 1. List all PodDisruptionBudgets in the cluster: + kubectl get pdb -A - 2. Identify which PDBs are blocking eviction. Look for PDBs with: - - minAvailable set too high (not allowing enough pods to be evicted) - - maxUnavailable set too low (too restrictive) + 2. Identify which PDBs are blocking eviction. Look for PDBs with: + - minAvailable set too high (not allowing enough pods to be evicted) + - maxUnavailable set too low (too restrictive) - 3. Temporarily adjust or remove the problematic PDBs: - kubectl delete pdb <pdb-name> -n <namespace> + 3. Temporarily adjust or remove the problematic PDBs: + kubectl delete pdb <pdb-name> -n <namespace> - Or edit to allow more disruption: - kubectl edit pdb <pdb-name> -n <namespace> + Or edit to allow more disruption: + kubectl edit pdb <pdb-name> -n <namespace> - 4. After the upgrade completes, restore the PDBs if needed. + 4. After the upgrade completes, restore the PDBs if needed. - WARNING: Modifying PDBs reduces availability guarantees. Coordinate with - application owners and consider scheduling during maintenance windows. 
- - pass: - when: "false" - message: No PodDisruptionBudget eviction failures detected + WARNING: Modifying PDBs reduces availability guarantees.
Coordinate with + application owners and consider scheduling during maintenance windows. + - pass: + when: "false" + message: No PodDisruptionBudget eviction failures detected - textAnalyze: checkName: Check if localhost resolves to 127.0.0.1 fileName: host-collectors/run-host/localhost-ips.txt