From 15d3649ff20b01184702f56598461d6dd3248aab Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Tue, 23 Sep 2025 11:05:21 -0400 Subject: [PATCH 1/3] ClusterOperators should not go Progressing only for a node reboot This is to cover the node rebooting case from the rule [1] that is introduced recently: ``` Operators should not report Progressing only because DaemonSets owned by them are adjusting to a new node from cluster scaleup or a node rebooting from cluster upgrade. ``` The test fails if - `co/machine-config` never became Progressing=True during a cluster upgrade, or - some CO left Progressing=False during the upgrade after `machine-config` became Progressing=True. This should not have taken place as `machine-config` was rebooting the nodes which was the only thing ongoing to the cluster during that time. [1]. https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164 --- .../legacycvomonitortests/monitortest.go | 1 + .../legacycvomonitortests/operators.go | 149 +++++++++++++++++- 2 files changed, 143 insertions(+), 7 deletions(-) diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go index b07977b0a376..1c9135e4955f 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go @@ -45,6 +45,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{}) if isUpgrade { junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...) + junits = append(junits, clusterOperatorIsNotProgressingWhenMachineConfigIs(finalIntervals)...) } else { junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...) } diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index ce019a78976d..182ce36e382a 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -6,21 +6,21 @@ import ( "strings" "time" - "github.com/openshift/origin/pkg/monitortestlibrary/utility" + configv1 "github.com/openshift/api/config/v1" + clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" + "github.com/sirupsen/logrus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" - "github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer" - "github.com/sirupsen/logrus" - - configv1 "github.com/openshift/api/config/v1" - clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" "github.com/openshift/origin/pkg/monitor/monitorapi" "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" platformidentification2 "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" + "github.com/openshift/origin/pkg/monitortestlibrary/utility" + "github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer" "github.com/openshift/origin/pkg/test/ginkgo/junitapi" exutil "github.com/openshift/origin/test/extended/util" - "k8s.io/client-go/rest" ) // exceptionCallback consumes a suspicious condition and returns an @@ -586,6 +586,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] } if len(fatal) > 0 || len(excepted) > 0 { + // add a failure so we + // either flake (or pass) in case len(fatal) == 0 by adding a success to the same test + // or fail in case len(fatal) > 0 by leaving the failure as the only output for the test ret = append(ret, &junitapi.JUnitTestCase{ Name: testName, Duration: duration, @@ -606,6 +609,138 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] return ret } +func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Intervals) []*junitapi.JUnitTestCase { + var ret []*junitapi.JUnitTestCase + upgradeWindows := getUpgradeWindows(events) + + var machineConfigProgressing time.Time + var eventsInUpgradeWindows monitorapi.Intervals + + var start, stop time.Time + for _, event := range events { + if !isInUpgradeWindow(upgradeWindows, event) { + continue + } + eventsInUpgradeWindows = append(eventsInUpgradeWindows, event) + if start.IsZero() || event.From.Before(start) { + start = event.From + } + if stop.IsZero() || event.To.After(stop) { + stop = event.To + } + } + duration := stop.Sub(start).Seconds() + + eventsByOperator := getEventsByOperator(eventsInUpgradeWindows) + for _, mcEvent := range eventsByOperator["machine-config"] { + condition := monitorapi.GetOperatorConditionStatus(mcEvent) + if condition == nil { + continue // ignore non-condition intervals + } + if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue { + machineConfigProgressing = mcEvent.To + break + } + } + + mcTestCase := &junitapi.JUnitTestCase{ + Name: fmt.Sprintf("[bz-Machine Config Operator] clusteroperator/machine-config must go Progressing=True during an upgrade test"), + Duration: duration, + } + if machineConfigProgressing.IsZero() { + mcTestCase.FailureOutput = &junitapi.FailureOutput{ + Output: fmt.Sprintf("machine-config was never Progressing=True during the upgrade window from %s to %s", start.Format(time.RFC3339), stop.Format(time.RFC3339)), + } + return []*junitapi.JUnitTestCase{mcTestCase} + } else { + mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressing.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339)) + } + ret = append(ret, mcTestCase) + + for _, operatorName := range platformidentification.KnownOperators.Difference(sets.NewString("machine-config")).List() { + bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName) + if bzComponent == "Unknown" { + bzComponent = operatorName + } + testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName) + operatorEvents := eventsByOperator[operatorName] + if len(operatorEvents) == 0 { + ret = append(ret, &junitapi.JUnitTestCase{ + Name: testName, + Duration: duration, + }) + continue + } + + except := func(co string, condition *configv1.ClusterOperatorStatusCondition) string { + return "" + } + + var excepted, fatal []string + for _, operatorEvent := range operatorEvents { + if operatorEvent.From.Before(machineConfigProgressing) { + continue + } + condition := monitorapi.GetOperatorConditionStatus(operatorEvent) + if condition == nil { + continue // ignore non-condition intervals + } + if condition.Type == "" { + fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", operatorEvent)) + continue + } + + if condition.Type != configv1.OperatorProgressing || condition.Status == configv1.ConditionFalse { + continue + } + + // if there was any switch, it was wrong/unexpected at some point + failure := fmt.Sprintf("%v", operatorEvent) + + exception := except(operatorName, condition) + if exception == "" { + fatal = append(fatal, failure) + } else { + excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception)) + } + } + + output := fmt.Sprintf("%d (out of %d) unexpected clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", len(fatal), len(operatorEvents), start.Format(time.RFC3339), stop.Format(time.RFC3339)) + if len(fatal) > 0 { + output = fmt.Sprintf("%s. These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n")) + } else { + output = fmt.Sprintf("%s, as desired.", output) + } + output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", output, len(excepted), start.Format(time.RFC3339), stop.Format(time.RFC3339)) + if len(excepted) > 0 { + output = fmt.Sprintf("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n")) + } else { + output = fmt.Sprintf("%s, as desired.", output) + } + + if len(fatal) > 0 || len(excepted) > 0 { + // add a failure so we + // either flake (or pass) in case len(fatal) == 0 by adding a success to the same test + // or fail in case len(fatal) > 0 by leaving the failure as the only output for the test + ret = append(ret, &junitapi.JUnitTestCase{ + Name: testName, + Duration: duration, + SystemOut: output, + FailureOutput: &junitapi.FailureOutput{ + Output: output, + }, + }) + } + + if len(fatal) == 0 { + // add a success so we flake (or pass) and don't fail + ret = append(ret, &junitapi.JUnitTestCase{Name: testName}) + } + } + + return ret +} + type startedStaged struct { // OSUpdateStarted is the event Reason emitted by the machine config operator when a node begins extracting // it's OS content. From 8257b5c6af46bf4af4b34dae7a8c9ab69a02bbc3 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Tue, 30 Sep 2025 09:44:04 -0400 Subject: [PATCH 2/3] Add exceptions for the violating COs --- .../legacycvomonitortests/operators.go | 57 ++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index 182ce36e382a..48b52921b288 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -672,7 +672,60 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv continue } - except := func(co string, condition *configv1.ClusterOperatorStatusCondition) string { + except := func(co string, reason string) string { + switch co { + case "csi-snapshot-controller": + if reason == "CSISnapshotController_Deploying" { + return "https://issues.redhat.com/browse/OCPBUGS-62624" + } + case "dns": + if reason == "DNSReportsProgressingIsTrue" { + return "https://issues.redhat.com/browse/OCPBUGS-62623" + } + case "image-registry": + if reason == "NodeCADaemonUnavailable::Ready" || reason == "DeploymentNotCompleted" { + return "https://issues.redhat.com/browse/OCPBUGS-62626" + } + case "ingress": + if reason == "Reconciling" { + return "https://issues.redhat.com/browse/OCPBUGS-62627" + } + case "kube-storage-version-migrator": + if reason == "KubeStorageVersionMigrator_Deploying" { + return "https://issues.redhat.com/browse/OCPBUGS-62629" + } + case "network": + if reason == "Deploying" { + return "https://issues.redhat.com/browse/OCPBUGS-62630" + } + case "node-tuning": + if reason == "Reconciling" { + return "https://issues.redhat.com/browse/OCPBUGS-62632" + } + case "openshift-controller-manager": + if reason == "_DesiredStateNotYetAchieved" { + return "https://issues.redhat.com/browse/OCPBUGS-63116" + } + case "service-ca": + if reason == "_ManagedDeploymentsAvailable" { + return "https://issues.redhat.com/browse/OCPBUGS-62633" + } + case "storage": + // GCPPDCSIDriverOperatorCR_GCPPDDriverControllerServiceController_Deploying + // GCPPDCSIDriverOperatorCR_GCPPDDriverNodeServiceController_Deploying + // AWSEBSCSIDriverOperatorCR_AWSEBSDriverNodeServiceController_Deploying + // VolumeDataSourceValidatorDeploymentController_Deploying + if strings.HasSuffix(reason, "Controller_Deploying") || + reason == "GCPPD_Deploying" { + return "https://issues.redhat.com/browse/OCPBUGS-62634" + } + case "olm": + // CatalogdDeploymentCatalogdControllerManager_Deploying + // OperatorcontrollerDeploymentOperatorControllerControllerManager_Deploying + if strings.HasSuffix(reason, "ControllerManager_Deploying") { + return "https://issues.redhat.com/browse/OCPBUGS-62635" + } + } return "" } @@ -697,7 +750,7 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv // if there was any switch, it was wrong/unexpected at some point failure := fmt.Sprintf("%v", operatorEvent) - exception := except(operatorName, condition) + exception := except(operatorName, condition.Reason) if exception == "" { fatal = append(fatal, failure) } else { From 7d824bfcb00137de3861cc388329c38a70f417a0 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Thu, 16 Oct 2025 16:04:51 -0400 Subject: [PATCH 3/3] Address comments from review --- .../legacycvomonitortests/operators.go | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index 48b52921b288..fa99ed61b5c7 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -516,9 +516,6 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] for _, conditionType := range conditionTypes { for _, operatorName := range platformidentification.KnownOperators.List() { bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName) - if bzComponent == "Unknown" { - bzComponent = operatorName - } testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType) operatorEvents := eventsByOperator[operatorName] if len(operatorEvents) == 0 { @@ -600,8 +597,12 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] } if len(fatal) == 0 { - // add a success so we flake (or pass) and don't fail - ret = append(ret, &junitapi.JUnitTestCase{Name: testName}) + if len(excepted) > 0 { + // add a success so we flake (or pass) and don't fail + ret = append(ret, &junitapi.JUnitTestCase{Name: testName, SystemOut: "Passing the case to make the overall test case flake as the previous failure is expected"}) + } else { + ret = append(ret, &junitapi.JUnitTestCase{Name: testName}) + } } } } @@ -613,7 +614,7 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv var ret []*junitapi.JUnitTestCase upgradeWindows := getUpgradeWindows(events) - var machineConfigProgressing time.Time + var machineConfigProgressingStart time.Time var eventsInUpgradeWindows monitorapi.Intervals var start, stop time.Time @@ -638,7 +639,7 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv continue // ignore non-condition intervals } if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue { - machineConfigProgressing = mcEvent.To + machineConfigProgressingStart = mcEvent.To break } } @@ -647,21 +648,18 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv Name: fmt.Sprintf("[bz-Machine Config Operator] clusteroperator/machine-config must go Progressing=True during an upgrade test"), Duration: duration, } - if machineConfigProgressing.IsZero() { + if machineConfigProgressingStart.IsZero() { mcTestCase.FailureOutput = &junitapi.FailureOutput{ Output: fmt.Sprintf("machine-config was never Progressing=True during the upgrade window from %s to %s", start.Format(time.RFC3339), stop.Format(time.RFC3339)), } return []*junitapi.JUnitTestCase{mcTestCase} } else { - mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressing.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339)) + mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressingStart.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339)) } ret = append(ret, mcTestCase) for _, operatorName := range platformidentification.KnownOperators.Difference(sets.NewString("machine-config")).List() { bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName) - if bzComponent == "Unknown" { - bzComponent = operatorName - } testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName) operatorEvents := eventsByOperator[operatorName] if len(operatorEvents) == 0 { @@ -731,7 +729,7 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv var excepted, fatal []string for _, operatorEvent := range operatorEvents { - if operatorEvent.From.Before(machineConfigProgressing) { + if operatorEvent.From.Before(machineConfigProgressingStart) { continue } condition := monitorapi.GetOperatorConditionStatus(operatorEvent) @@ -786,8 +784,12 @@ func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Interv } if len(fatal) == 0 { - // add a success so we flake (or pass) and don't fail - ret = append(ret, &junitapi.JUnitTestCase{Name: testName}) + if len(excepted) > 0 { + // add a success so we flake (or pass) and don't fail + ret = append(ret, &junitapi.JUnitTestCase{Name: testName, SystemOut: "Passing the case to make the overall test case flake as the previous failure is expected"}) + } else { + ret = append(ret, &junitapi.JUnitTestCase{Name: testName}) + } } }