diff --git a/pkg/operator/controller/controller.go b/pkg/operator/controller/controller.go index 58ba5eefa..ac6534423 100644 --- a/pkg/operator/controller/controller.go +++ b/pkg/operator/controller/controller.go @@ -12,6 +12,7 @@ import ( "github.com/openshift/cluster-dns-operator/pkg/manifests" operatorconfig "github.com/openshift/cluster-dns-operator/pkg/operator/config" + retryable "github.com/openshift/cluster-dns-operator/pkg/util/retryableerror" "github.com/openshift/cluster-dns-operator/pkg/util/slice" "github.com/sirupsen/logrus" @@ -122,11 +123,11 @@ func New(mgr manager.Manager, config Config) (controller.Controller, error) { DeleteFunc: func(e event.DeleteEvent) bool { return nodePredicate(e.Object) }, UpdateFunc: func(e event.UpdateEvent) bool { old := e.ObjectOld.(*corev1.Node) - new := e.ObjectNew.(*corev1.Node) - if ignoreNodeForTopologyAwareHints(old) != ignoreNodeForTopologyAwareHints(new) { + nu := e.ObjectNew.(*corev1.Node) + if ignoreNodeForTopologyAwareHints(old) != ignoreNodeForTopologyAwareHints(nu) { return true } - if !ignoreNodeForTopologyAwareHints(new) && nodeIsValidForTopologyAwareHints(old) != nodeIsValidForTopologyAwareHints(new) { + if !ignoreNodeForTopologyAwareHints(nu) && nodeIsValidForTopologyAwareHints(old) != nodeIsValidForTopologyAwareHints(nu) { return true } return false @@ -263,6 +264,11 @@ func (r *reconciler) Reconcile(ctx context.Context, request reconcile.Request) ( } else { // Handle everything else. 
if err := r.ensureDNS(dns, &result); err != nil { + switch e := err.(type) { + case retryable.Error: + logrus.Errorf("got retryable error; requeueing after %v: %v", e.After(), e) + return reconcile.Result{RequeueAfter: e.After()}, nil + } errs = append(errs, fmt.Errorf("failed to ensure dns %s: %v", dns.Name, err)) } else if err := r.ensureExternalNameForOpenshiftService(); err != nil { errs = append(errs, fmt.Errorf("failed to ensure external name for openshift service: %v", err)) diff --git a/pkg/operator/controller/dns_status.go b/pkg/operator/controller/dns_status.go index e68560a60..76367511d 100644 --- a/pkg/operator/controller/dns_status.go +++ b/pkg/operator/controller/dns_status.go @@ -2,16 +2,17 @@ package controller import ( "context" + "errors" "fmt" "reflect" "strings" "time" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/sirupsen/logrus" - operatorv1 "github.com/openshift/api/operator/v1" + cond "github.com/openshift/cluster-dns-operator/pkg/util/conditions" + retryable "github.com/openshift/cluster-dns-operator/pkg/util/retryableerror" + + "github.com/sirupsen/logrus" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -26,10 +27,19 @@ import ( // for progressing and degraded then consider oldCondition to be recent // and return oldCondition to prevent frequent updates.
func (r *reconciler) syncDNSStatus(dns *operatorv1.DNS, clusterIP, clusterDomain string, haveDNSDaemonset bool, dnsDaemonset *appsv1.DaemonSet, haveNodeResolverDaemonset bool, nodeResolverDaemonset *appsv1.DaemonSet, transitionUnchangedToleration time.Duration, reconcileResult *reconcile.Result) error { + var errs []error updated := dns.DeepCopy() updated.Status.ClusterIP = clusterIP updated.Status.ClusterDomain = clusterDomain - updated.Status.Conditions = computeDNSStatusConditions(dns, clusterIP, haveDNSDaemonset, dnsDaemonset, haveNodeResolverDaemonset, nodeResolverDaemonset, transitionUnchangedToleration, reconcileResult) + // This can return a retryable error. + statusConds, err := computeDNSStatusConditions(dns, clusterIP, haveDNSDaemonset, dnsDaemonset, haveNodeResolverDaemonset, nodeResolverDaemonset, transitionUnchangedToleration, reconcileResult) + if err != nil { + logrus.Infof("error computing DNS %s status: %v got %v", dns.ObjectMeta.Name, statusConds, err) + errs = append(errs, err) + return retryable.NewMaybeRetryableAggregate(errs) + } + + updated.Status.Conditions = statusConds if !dnsStatusesEqual(updated.Status, dns.Status) { if err := r.client.Status().Update(context.TODO(), updated); err != nil { return fmt.Errorf("failed to update dns status: %v", err) @@ -46,7 +56,7 @@ func (r *reconciler) syncDNSStatus(dns *operatorv1.DNS, clusterIP, clusterDomain // oldCondition.LastTransitionTime is <= transitionUnchangedToleration // for progressing and degraded then consider oldCondition to be recent // and return oldCondition to prevent frequent updates. 
-func computeDNSStatusConditions(dns *operatorv1.DNS, clusterIP string, haveDNSDaemonset bool, dnsDaemonset *appsv1.DaemonSet, haveNodeResolverDaemonset bool, nodeResolverDaemonset *appsv1.DaemonSet, transitionUnchangedToleration time.Duration, reconcileResult *reconcile.Result) []operatorv1.OperatorCondition { +func computeDNSStatusConditions(dns *operatorv1.DNS, clusterIP string, haveDNSDaemonset bool, dnsDaemonset *appsv1.DaemonSet, haveNodeResolverDaemonset bool, nodeResolverDaemonset *appsv1.DaemonSet, transitionUnchangedToleration time.Duration, reconcileResult *reconcile.Result) ([]operatorv1.OperatorCondition, error) { oldConditions := dns.Status.Conditions var oldDegradedCondition, oldProgressingCondition, oldAvailableCondition, oldUpgradeableCondition *operatorv1.OperatorCondition for i := range oldConditions { @@ -63,16 +73,42 @@ func computeDNSStatusConditions(dns *operatorv1.DNS, clusterIP string, haveDNSDa } now := time.Now() - conditions := []operatorv1.OperatorCondition{ - computeDNSDegradedCondition(oldDegradedCondition, clusterIP, haveDNSDaemonset, dnsDaemonset, transitionUnchangedToleration, now), - computeDNSProgressingCondition(oldProgressingCondition, dns, clusterIP, haveDNSDaemonset, dnsDaemonset, haveNodeResolverDaemonset, nodeResolverDaemonset, transitionUnchangedToleration, now, reconcileResult), - computeDNSAvailableCondition(oldAvailableCondition, clusterIP, haveDNSDaemonset, dnsDaemonset), - computeDNSUpgradeableCondition(oldUpgradeableCondition, dns), - } + var conditions []operatorv1.OperatorCondition + // If the operator is currently Progressing=true, we may not want to mark it Degraded=true. 
+ newProgressingCondition := computeDNSProgressingCondition(oldProgressingCondition, dns, clusterIP, haveDNSDaemonset, dnsDaemonset, haveNodeResolverDaemonset, nodeResolverDaemonset, transitionUnchangedToleration, now, reconcileResult) + conditions = append(conditions, newProgressingCondition) + conditions = append(conditions, computeDNSAvailableCondition(oldAvailableCondition, clusterIP, haveDNSDaemonset, dnsDaemonset)) + conditions = append(conditions, computeDNSUpgradeableCondition(oldUpgradeableCondition, dns)) + // Store the error from computeDNSDegradedCondition for use in retries by caller. + degradedCondition, err := computeDNSDegradedCondition(oldDegradedCondition, &newProgressingCondition, clusterIP, haveDNSDaemonset, dnsDaemonset, transitionUnchangedToleration, now) + conditions = append(conditions, degradedCondition) - return conditions + return conditions, err } +const ( + // DNSNoService indicates that no IP address is assigned to the DNS service. + DNSNoService = "NoService" + + // DNSNoDNSDaemonSet indicates that the DNS daemon set doesn't exist. + DNSNoDNSDaemonSet = "NoDNSDaemonSet" + + // DNSNoDNSPodsDesired indicates that no DNS pods are desired; this could mean + // all nodes are tainted or unschedulable. + DNSNoDNSPodsDesired = "NoDNSPodsDesired" + + // DNSNoDNSPodsAvailable indicates that no DNS pods are available. + DNSNoDNSPodsAvailable = "NoDNSPodsAvailable" + + // DNSInvalidDNSMaxUnavailable indicates that the DNS daemonset has an invalid + // MaxUnavailable value configured. + DNSInvalidDNSMaxUnavailable = "InvalidDNSMaxUnavailable" + + // DNSMaxUnavailableDNSPodsExceeded indicates that the number of unavailable DNS + // pods is greater than the configured MaxUnavailable. + DNSMaxUnavailableDNSPodsExceeded = "MaxUnavailableDNSPodsExceeded" +) + // computeDNSDegradedCondition computes the dns Degraded status // condition based on the status of clusterIP and the DNS daemonset. 
// The node-resolver daemonset is not a part of the calculation of @@ -80,22 +116,20 @@ func computeDNSStatusConditions(dns *operatorv1.DNS, clusterIP string, haveDNSDa // oldCondition.LastTransitionTime is <= transitionUnchangedToleration // then consider oldCondition to be recent and return oldCondition to // prevent frequent updates. -func computeDNSDegradedCondition(oldCondition *operatorv1.OperatorCondition, clusterIP string, haveDNSDaemonset bool, dnsDaemonset *appsv1.DaemonSet, transitionUnchangedToleration time.Duration, currentTime time.Time) operatorv1.OperatorCondition { - degradedCondition := &operatorv1.OperatorCondition{ - Type: operatorv1.OperatorStatusTypeDegraded, +func computeDNSDegradedCondition(oldDegradedCondition, newProgressingCondition *operatorv1.OperatorCondition, clusterIP string, haveDNSDaemonset bool, dnsDaemonset *appsv1.DaemonSet, transitionUnchangedToleration time.Duration, currentTime time.Time) (operatorv1.OperatorCondition, error) { + degradedCondition := operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionUnknown, } - status := operatorv1.ConditionUnknown - degradedReasons := []string{} - messages := []string{} + var degradedConditions []*operatorv1.OperatorCondition + + transitionTime := metav1.NewTime(currentTime) + if len(clusterIP) == 0 { - status = operatorv1.ConditionTrue - degradedReasons = append(degradedReasons, "NoService") - messages = append(messages, "No IP address is assigned to the DNS service.") + degradedConditions = append(degradedConditions, generateCondition(DNSNoService, DNSNoService, "No IP address is assigned to the DNS service.", operatorv1.ConditionTrue, transitionTime)) } if !haveDNSDaemonset { - status = operatorv1.ConditionTrue - degradedReasons = append(degradedReasons, "NoDNSDaemonSet") - messages = append(messages, "The DNS daemonset does not exist.") + degradedConditions = append(degradedConditions, generateCondition(DNSNoDNSDaemonSet, 
DNSNoDNSDaemonSet, "The DNS daemonset does not exist.", operatorv1.ConditionTrue, transitionTime)) } else { want := dnsDaemonset.Status.DesiredNumberScheduled have := dnsDaemonset.Status.NumberAvailable @@ -105,40 +139,52 @@ func computeDNSDegradedCondition(oldCondition *operatorv1.OperatorCondition, clu switch { case want == 0: - status = operatorv1.ConditionTrue - degradedReasons = append(degradedReasons, "NoDNSPodsDesired") - messages = append(messages, "No DNS pods are desired; this could mean all nodes are tainted or unschedulable.") + degradedConditions = append(degradedConditions, generateCondition(DNSNoDNSPodsDesired, DNSNoDNSPodsDesired, "No DNS pods are desired; this could mean all nodes are tainted or unschedulable.", operatorv1.ConditionTrue, transitionTime)) case have == 0: - status = operatorv1.ConditionTrue - degradedReasons = append(degradedReasons, "NoDNSPodsAvailable") - messages = append(messages, "No DNS pods are available.") + degradedConditions = append(degradedConditions, generateCondition(DNSNoDNSPodsAvailable, DNSNoDNSPodsAvailable, "No DNS pods are available.", operatorv1.ConditionTrue, transitionTime)) case intstrErr != nil: // This should not happen, but is included just to safeguard against future changes. 
- degradedReasons = append(degradedReasons, "InvalidDNSMaxUnavailable") - messages = append(messages, fmt.Sprintf("The DNS daemonset has an invalid MaxUnavailable value: %v", intstrErr)) + degradedConditions = append(degradedConditions, generateCondition(DNSInvalidDNSMaxUnavailable, DNSInvalidDNSMaxUnavailable, fmt.Sprintf("The DNS daemonset has an invalid MaxUnavailable value: %v", intstrErr), operatorv1.ConditionTrue, transitionTime)) case int(numberUnavailable) > maxUnavailable: - status = operatorv1.ConditionTrue - degradedReasons = append(degradedReasons, "MaxUnavailableDNSPodsExceeded") - messages = append(messages, fmt.Sprintf("Too many DNS pods are unavailable (%d > %d max unavailable).", numberUnavailable, maxUnavailable)) + degradedConditions = append(degradedConditions, generateCondition(DNSMaxUnavailableDNSPodsExceeded, DNSMaxUnavailableDNSPodsExceeded, fmt.Sprintf("Too many DNS pods are unavailable (%d > %d max unavailable).", numberUnavailable, maxUnavailable), operatorv1.ConditionTrue, transitionTime)) } } - if len(degradedReasons) != 0 { + // Record whether the operator is Progressing. + progressing := newProgressingCondition != nil && newProgressingCondition.Status == operatorv1.ConditionTrue + + if len(degradedConditions) != 0 { // if the last status was set to false within the last transitionUnchangedToleration, skip the new update // to prevent frequent status flaps, and try to keep the long-lasting state (i.e. Degraded=False). See https://bugzilla.redhat.com/show_bug.cgi?id=2037190. 
- if oldCondition != nil && oldCondition.Status == operatorv1.ConditionFalse && lastTransitionTimeIsRecent(currentTime, oldCondition.LastTransitionTime.Time, transitionUnchangedToleration) { - return *oldCondition + if oldDegradedCondition != nil && oldDegradedCondition.Status == operatorv1.ConditionFalse && lastTransitionTimeIsRecent(currentTime, oldDegradedCondition.LastTransitionTime.Time, transitionUnchangedToleration) { + return *oldDegradedCondition, nil } - degradedCondition.Status = status - degradedCondition.Reason = strings.Join(degradedReasons, "") - degradedCondition.Message = strings.Join(messages, "\n") + } + + var err error + + if !progressing && len(degradedConditions) != 0 { + degradedMessages := cond.FormatConditions(degradedConditions) + degradedCondition.Status = operatorv1.ConditionTrue + degradedCondition.Reason = "DegradedConditions" + degradedCondition.Message = "One or more other status conditions indicate a degraded state: " + degradedMessages + err = retryable.New(errors.New("DNS operator is degraded"), transitionUnchangedToleration) } else { degradedCondition.Status = operatorv1.ConditionFalse degradedCondition.Reason = "AsExpected" degradedCondition.Message = "Enough DNS pods are available, and the DNS service has a cluster IP address." 
} + return setDNSLastTransitionTime(&degradedCondition, oldDegradedCondition), err +} - return setDNSLastTransitionTime(degradedCondition, oldCondition) +func generateCondition(condtype, reason, message string, status operatorv1.ConditionStatus, transitionTime metav1.Time) *operatorv1.OperatorCondition { + return &operatorv1.OperatorCondition{ + Type: condtype, + Reason: reason, + Status: status, + Message: message, + LastTransitionTime: transitionTime, + } } // computeDNSProgressingCondition computes the dns Progressing status @@ -158,15 +204,10 @@ func computeDNSProgressingCondition(oldCondition *operatorv1.OperatorCondition, if !haveDNSDaemonset { messages = append(messages, "The DNS daemonset does not exist.") } else { - have := dnsDaemonset.Status.NumberAvailable - want := dnsDaemonset.Status.DesiredNumberScheduled - if have != want { - messages = append(messages, fmt.Sprintf("Have %d available DNS pods, want %d.", have, want)) - } - - have = dnsDaemonset.Status.UpdatedNumberScheduled - want = dnsDaemonset.Status.DesiredNumberScheduled - if have != want { + want := dnsDaemonset.Status.DesiredNumberScheduled // num of nodes that should be running the pod. + have := dnsDaemonset.Status.UpdatedNumberScheduled // num of nodes running the updated pod. + // It's progressing when have < want. If have >= want, that's okay. + if have < want { messages = append(messages, fmt.Sprintf("Have %d up-to-date DNS pods, want %d.", have, want)) } @@ -185,9 +226,11 @@ func computeDNSProgressingCondition(oldCondition *operatorv1.OperatorCondition, if !haveNodeResolverDaemonset { messages = append(messages, "The node-resolver daemonset does not exist.") } else { - have := nodeResolverDaemonset.Status.NumberAvailable - want := nodeResolverDaemonset.Status.DesiredNumberScheduled - if have != want { + want := nodeResolverDaemonset.Status.DesiredNumberScheduled // num of nodes that should be running the pod.
+ have := nodeResolverDaemonset.Status.UpdatedNumberScheduled // num of nodes running the updated pod. + + // It's progressing when have < want. If have >= want, that's okay. + if have < want { messages = append(messages, fmt.Sprintf("Have %d available node-resolver pods, want %d.", have, want)) } } @@ -205,7 +248,7 @@ func computeDNSProgressingCondition(oldCondition *operatorv1.OperatorCondition, } else { progressingCondition.Status = operatorv1.ConditionFalse progressingCondition.Reason = "AsExpected" - progressingCondition.Message = "All DNS and node-resolver pods are available, and the DNS service has a cluster IP address." + progressingCondition.Message = "All DNS and node-resolver pods are updated, and the DNS service has a cluster IP address." } return setDNSLastTransitionTime(progressingCondition, oldCondition) @@ -232,7 +275,7 @@ func computeDNSAvailableCondition(oldCondition *operatorv1.OperatorCondition, cl } if len(unavailableReasons) != 0 { availableCondition.Status = operatorv1.ConditionFalse - availableCondition.Reason = strings.Join(unavailableReasons, "") + availableCondition.Reason = strings.Join(unavailableReasons, " ") availableCondition.Message = strings.Join(messages, "\n") } else { availableCondition.Status = operatorv1.ConditionTrue @@ -277,11 +320,7 @@ func setDNSLastTransitionTime(condition, oldCondition *operatorv1.OperatorCondit // if the provided values should be considered equal for the purpose of determining // whether an update is necessary, false otherwise. func dnsStatusesEqual(a, b operatorv1.DNSStatus) bool { - conditionCmpOpts := []cmp.Option{ - cmpopts.EquateEmpty(), - cmpopts.SortSlices(func(a, b operatorv1.OperatorCondition) bool { return a.Type < b.Type }), - } - if !cmp.Equal(a.Conditions, b.Conditions, conditionCmpOpts...) 
{ + if !cond.ConditionsEqual(a.Conditions, b.Conditions) { return false } if a.ClusterIP != b.ClusterIP { diff --git a/pkg/operator/controller/dns_status_test.go b/pkg/operator/controller/dns_status_test.go index 8311518f6..5a524c303 100644 --- a/pkg/operator/controller/dns_status_test.go +++ b/pkg/operator/controller/dns_status_test.go @@ -6,16 +6,21 @@ import ( "time" operatorv1 "github.com/openshift/api/operator/v1" + retryable "github.com/openshift/cluster-dns-operator/pkg/util/retryableerror" + "sigs.k8s.io/controller-runtime/pkg/reconcile" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" + utilclock "k8s.io/utils/clock" + utilclocktesting "k8s.io/utils/clock/testing" ) var ( - maxUnavailable = intstr.FromInt(1) + maxUnavailable = intstr.FromInt(1) + clock utilclock.Clock = utilclock.RealClock{} ) func TestDNSStatusConditions(t *testing.T) { @@ -24,7 +29,7 @@ func TestDNSStatusConditions(t *testing.T) { haveDNS bool availDNS, desireDNS, updatedDNS int32 haveNR bool - availNR, desireNR int32 + availNR, desireNR, updatedNR int32 managementState operatorv1.ManagementState } type testOut struct { @@ -34,40 +39,51 @@ func TestDNSStatusConditions(t *testing.T) { inputs testIn outputs testOut }{ - {testIn{false, false, 0, 0, 0, false, 0, 0, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 0, 0, 0, true, 0, 0, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 0, 0, 0, true, 0, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 0, 2, 0, true, 0, 0, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 0, 2, 0, true, 0, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 1, 2, 1, true, 0, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 0, 2, 0, true, 1, 2, operatorv1.Managed}, testOut{true, true, false, 
true}}, - {testIn{false, true, 1, 2, 1, true, 1, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 1, 2, 1, true, 2, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 2, 2, 2, true, 1, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{false, true, 2, 2, 2, true, 2, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{true, true, 0, 0, 0, true, 0, 0, operatorv1.Managed}, testOut{true, false, false, true}}, - {testIn{true, true, 0, 0, 0, true, 0, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{true, true, 0, 2, 0, true, 0, 0, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{true, true, 0, 2, 0, true, 0, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{true, true, 0, 2, 0, true, 1, 2, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{true, true, 1, 2, 1, true, 0, 2, operatorv1.Managed}, testOut{false, true, true, true}}, - {testIn{true, true, 1, 2, 1, true, 1, 2, operatorv1.Managed}, testOut{false, true, true, true}}, - {testIn{true, true, 1, 2, 1, true, 2, 2, operatorv1.Managed}, testOut{false, true, true, true}}, - {testIn{true, true, 2, 2, 2, true, 0, 2, operatorv1.Managed}, testOut{false, true, true, true}}, - {testIn{true, true, 2, 2, 2, true, 2, 2, operatorv1.Managed}, testOut{false, false, true, true}}, - {testIn{true, true, 1, 3, 1, true, 3, 3, operatorv1.Managed}, testOut{true, true, true, true}}, - {testIn{true, true, 3, 3, 3, true, 0, 3, operatorv1.Managed}, testOut{false, true, true, true}}, - {testIn{true, true, 2, 3, 2, true, 3, 3, operatorv1.Managed}, testOut{false, true, true, true}}, - {testIn{true, true, 0, 1, 0, true, 0, 1, operatorv1.Managed}, testOut{true, true, false, true}}, - {testIn{true, true, 0, 0, 0, true, 0, 2, operatorv1.Unmanaged}, testOut{true, true, false, false}}, - {testIn{true, true, 1, 3, 1, true, 3, 3, operatorv1.Unmanaged}, testOut{true, true, true, false}}, - 
{testIn{true, true, 2, 2, 2, true, 0, 2, operatorv1.Unmanaged}, testOut{false, true, true, false}}, - {testIn{true, true, 2, 2, 2, true, 2, 2, operatorv1.Unmanaged}, testOut{false, false, true, false}}, - {testIn{true, true, 0, 0, 0, true, 0, 2, operatorv1.ManagementState("")}, testOut{true, true, false, true}}, - {testIn{true, true, 1, 3, 1, true, 3, 3, operatorv1.ManagementState("")}, testOut{true, true, true, true}}, - {testIn{true, true, 2, 2, 2, true, 0, 2, operatorv1.ManagementState("")}, testOut{false, true, true, true}}, - {testIn{true, true, 2, 2, 1, true, 2, 2, operatorv1.ManagementState("")}, testOut{false, true, true, true}}, - {testIn{true, true, 2, 2, 2, true, 2, 2, operatorv1.ManagementState("")}, testOut{false, false, true, true}}, + // It is always Progressing=true and Degraded=false when cluster ip is missing. + {testIn{false, false, 0, 0, 0, false, 0, 0, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 0, 0, 0, true, 0, 0, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 0, 0, 0, true, 0, 2, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 0, 2, 0, true, 0, 0, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 0, 2, 0, true, 0, 2, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 1, 2, 1, true, 0, 2, 1, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 0, 2, 0, true, 1, 2, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 1, 2, 1, true, 1, 2, 1, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 1, 2, 1, true, 2, 2, 1, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 2, 2, 2, true, 1, 2, 2, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{false, true, 2, 2, 2, true, 2, 2, 2, operatorv1.Managed}, testOut{false, true, false, true}}, + // It is 
Progressing=false and Degraded=true when desireDNS and/or desireNR are 0, and there are no availDNS. No checks involving time in this suite. + {testIn{true, true, 0, 0, 0, true, 0, 0, 0, operatorv1.Managed}, testOut{true, false, false, true}}, + {testIn{true, true, 0, 0, 0, true, 2, 2, 2, operatorv1.Managed}, testOut{true, false, false, true}}, + {testIn{true, true, 0, 2, 2, true, 0, 0, 0, operatorv1.Managed}, testOut{true, false, false, true}}, + {testIn{true, true, 0, 2, 2, true, 0, 0, 0, operatorv1.Managed}, testOut{true, false, false, true}}, + // It is Progressing=true and Degraded=false when updatedNR < desireNR or updatedDNS < desireDNS. + {testIn{true, true, 0, 0, 0, true, 0, 2, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{true, true, 0, 2, 0, true, 0, 0, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{true, true, 0, 2, 0, true, 0, 2, 3, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{true, true, 0, 2, 3, true, 0, 2, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + {testIn{true, true, 1, 2, 1, true, 0, 2, 0, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 1, 2, 1, true, 1, 2, 1, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 1, 2, 1, true, 2, 2, 2, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 2, 2, 2, true, 0, 2, 0, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 1, 3, 1, true, 3, 3, 3, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 3, 3, 3, true, 0, 3, 0, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 2, 3, 2, true, 3, 3, 3, operatorv1.Managed}, testOut{false, true, true, true}}, + {testIn{true, true, 0, 1, 0, true, 0, 1, 0, operatorv1.Managed}, testOut{false, true, false, true}}, + // It is Upgradeable=false whenever managementState=Unmanaged + {testIn{true, true, 0, 0, 0, true, 0, 2, 0, 
operatorv1.Unmanaged}, testOut{false, true, false, false}}, + {testIn{true, true, 1, 3, 1, true, 3, 3, 3, operatorv1.Unmanaged}, testOut{false, true, true, false}}, + {testIn{true, true, 2, 2, 2, true, 0, 2, 0, operatorv1.Unmanaged}, testOut{false, true, true, false}}, + // It is Available=false whenever availDNS=0 + {testIn{true, true, 0, 0, 0, true, 0, 2, 0, operatorv1.ManagementState("")}, testOut{false, true, false, true}}, + {testIn{true, true, 1, 1, 0, true, 0, 2, 0, operatorv1.ManagementState("")}, testOut{false, true, true, true}}, + {testIn{true, true, 0, 5, 1, true, 3, 3, 3, operatorv1.ManagementState("")}, testOut{false, true, false, true}}, + {testIn{true, true, 2, 2, 2, true, 0, 2, 0, operatorv1.ManagementState("")}, testOut{false, true, true, true}}, + {testIn{true, true, 2, 2, 1, true, 2, 2, 2, operatorv1.ManagementState("")}, testOut{false, true, true, true}}, + // It is Degraded=false, Progressing=false whenever avail=desired=updated. + {testIn{true, true, 2, 2, 2, true, 2, 2, 2, operatorv1.Managed}, testOut{false, false, true, true}}, + {testIn{true, true, 2, 2, 2, true, 2, 2, 2, operatorv1.Unmanaged}, testOut{false, false, true, false}}, + {testIn{true, true, 2, 2, 2, true, 2, 2, 2, operatorv1.ManagementState("")}, testOut{false, false, true, true}}, + // We should never have a situation where Degraded=true and Progressing=true } for i, tc := range testCases { @@ -101,7 +117,7 @@ func TestDNSStatusConditions(t *testing.T) { dnsDaemonset.Spec.Template.Spec.NodeSelector = nodeSelectorForDNS(&operatorv1.DNS{}) dnsDaemonset.Spec.Template.Spec.Tolerations = tolerationsForDNS(&operatorv1.DNS{}) } - if tc.inputs.haveDNS { + if tc.inputs.haveNR { nodeResolverDaemonset = &appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("node-resolver-%d", i+1), @@ -116,6 +132,7 @@ func TestDNSStatusConditions(t *testing.T) { Status: appsv1.DaemonSetStatus{ DesiredNumberScheduled: tc.inputs.desireNR, NumberAvailable: tc.inputs.availNR, + 
UpdatedNumberScheduled: tc.inputs.updatedNR, }, } } @@ -159,7 +176,7 @@ func TestDNSStatusConditions(t *testing.T) { Status: upgradeable, }, } - actual := computeDNSStatusConditions(&dns, clusterIP, tc.inputs.haveDNS, dnsDaemonset, tc.inputs.haveNR, nodeResolverDaemonset, 0, &reconcile.Result{}) + actual, _ := computeDNSStatusConditions(&dns, clusterIP, tc.inputs.haveDNS, dnsDaemonset, tc.inputs.haveNR, nodeResolverDaemonset, 0, &reconcile.Result{}) gotExpected := true if len(actual) != len(expected) { gotExpected = false @@ -194,7 +211,7 @@ func TestDNSStatusConditions(t *testing.T) { case operatorv1.Unmanaged: managementState = "Unmanaged" } - description := fmt.Sprintf("%s, %d/%d DNS pods available, %d/%d node-resolver pods available, managementState is %s", haveClusterIP, tc.inputs.availDNS, tc.inputs.desireDNS, tc.inputs.availNR, tc.inputs.desireNR, managementState) + description := fmt.Sprintf("%s, %d/%d/%d DNS pods available/updated/desired, %d/%d node-resolver pods available/desired, managementState is %s", haveClusterIP, tc.inputs.availDNS, tc.inputs.updatedDNS, tc.inputs.desireDNS, tc.inputs.availNR, tc.inputs.desireNR, managementState) if !gotExpected { t.Fatalf("%q:\nexpected %#v\ngot %#v", description, expected, actual) } @@ -204,6 +221,14 @@ func TestDNSStatusConditions(t *testing.T) { // TestComputeDNSDegradedCondition verifies the computeDNSDegradedCondition has // the expected behavior. func TestComputeDNSDegradedCondition(t *testing.T) { + // Inject a fake clock. + fakeClock := utilclocktesting.NewFakeClock(time.Time{}) + clock = fakeClock + defer func() { + // Don't forget to reset it. 
+ clock = utilclock.RealClock{} + }() + makeDaemonSet := func(desired, available int) *appsv1.DaemonSet { return &appsv1.DaemonSet{ Spec: appsv1.DaemonSetSpec{ @@ -219,80 +244,207 @@ func TestComputeDNSDegradedCondition(t *testing.T) { }, } } + + degradedFalse := &operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(clock.Now()), + } + degradedTrue := &operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(clock.Now()), + } + + progressingTrue := &operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeProgressing, + Status: operatorv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(clock.Now()), + } + progressingFalse := &operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeProgressing, + Status: operatorv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(clock.Now()), + } + testCases := []struct { - name string - clusterIP string - dnsDaemonset *appsv1.DaemonSet - expected operatorv1.ConditionStatus + name string + oldDegradedCondition *operatorv1.OperatorCondition + newProgressingCondition *operatorv1.OperatorCondition + clusterIP string + dnsDaemonset *appsv1.DaemonSet + nrDaemonset *appsv1.DaemonSet + expected operatorv1.ConditionStatus + // Expect a requeue when it's Progressing=false, Degraded=true, + // and has been degraded past the grace period or may become degraded soon; OR + // it's previously and still Degraded after the grace period. + expectRequeue bool + // A degraded condition will give a retry duration based on its grace period. 
+ expectAfter time.Duration }{ { - name: "0 available", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(6, 0), - expected: operatorv1.ConditionTrue, + name: "0 available, previously Degraded=false and Progressing=false condition", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(6, 0), + nrDaemonset: makeDaemonSet(6, 6), + expected: operatorv1.ConditionFalse, }, { - name: "no clusterIP, 0 available", - clusterIP: "", - dnsDaemonset: makeDaemonSet(6, 0), - expected: operatorv1.ConditionTrue, + name: "0 available, previously Degraded=false and Progressing=true condition", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingTrue, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(6, 0), + nrDaemonset: makeDaemonSet(6, 6), + expected: operatorv1.ConditionFalse, }, { - name: "no clusterIP", - clusterIP: "", - dnsDaemonset: makeDaemonSet(6, 6), - expected: operatorv1.ConditionTrue, + name: "no cluster ip, previously Degraded=false and Progressing=true condition", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingTrue, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expected: operatorv1.ConditionFalse, }, { - name: "0 desired", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(0, 0), - expected: operatorv1.ConditionTrue, + name: "no cluster ip, previously Degraded=false and Progressing=false condition", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expected: operatorv1.ConditionFalse, }, { - name: "0 available", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(6, 0), - expected: operatorv1.ConditionTrue, + name: "no cluster ip, previously Degraded=true and Progressing=false condition", + oldDegradedCondition: degradedTrue, + 
newProgressingCondition: progressingFalse, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expectRequeue: true, + expected: operatorv1.ConditionTrue, }, { - name: "too few pods DNS pods available (percentage)", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(100, 89), - expected: operatorv1.ConditionTrue, + name: "no cluster ip, previously Degraded=true and Progressing=true condition", + oldDegradedCondition: degradedTrue, + newProgressingCondition: progressingTrue, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expected: operatorv1.ConditionFalse, }, { - name: "node-resolver pods unavailable is ok (percentage)", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(100, 100), - expected: operatorv1.ConditionFalse, + name: "0 available, previously Degraded=true condition", + oldDegradedCondition: degradedTrue, + newProgressingCondition: progressingFalse, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(6, 0), + nrDaemonset: makeDaemonSet(6, 6), + expectRequeue: true, + expected: operatorv1.ConditionTrue, }, { - name: "too few DNS pods available (integer)", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(6, 4), - expected: operatorv1.ConditionTrue, + name: "node-resolver 0 available is ok", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(10, 10), + nrDaemonset: makeDaemonSet(6, 0), + expected: operatorv1.ConditionFalse, }, { - name: "enough available (integer)", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(6, 5), - expected: operatorv1.ConditionFalse, + name: "both ok", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expected: operatorv1.ConditionFalse, }, { - name: "all available", - clusterIP: 
"172.30.0.10", - dnsDaemonset: makeDaemonSet(6, 6), - expected: operatorv1.ConditionFalse, + name: "no clusterIP, 0 available, previously Degraded=false", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 0), + nrDaemonset: makeDaemonSet(6, 0), + expectRequeue: false, // Don't requeue if lastTransitionTime is recent. + expected: operatorv1.ConditionFalse, + }, + { + name: "no clusterIP, 0 available, previously Degraded=true", + oldDegradedCondition: degradedTrue, + newProgressingCondition: progressingFalse, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 0), + nrDaemonset: makeDaemonSet(6, 0), + expectRequeue: true, // Requeue while it's still degraded. + expected: operatorv1.ConditionTrue, + }, + { + name: "no clusterIP, previously Degraded=false", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expectRequeue: false, // Don't requeue if lastTransitionTime is recent. + expected: operatorv1.ConditionFalse, + }, + { + name: "no clusterIP, previously Degraded=true", + oldDegradedCondition: degradedTrue, + newProgressingCondition: progressingFalse, + clusterIP: "", + dnsDaemonset: makeDaemonSet(6, 6), + nrDaemonset: makeDaemonSet(6, 6), + expectRequeue: true, // Requeue while it's still degraded. + expected: operatorv1.ConditionTrue, + }, + { + name: "0 desired, previously Degraded=False", + oldDegradedCondition: degradedFalse, + newProgressingCondition: progressingFalse, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(0, 0), + nrDaemonset: makeDaemonSet(0, 0), + expectRequeue: false, // Don't requeue if lastTransitionTime is recent. 
+ expected: operatorv1.ConditionFalse, + }, + { + name: "0 desired, previously Degraded=true", + oldDegradedCondition: degradedTrue, + newProgressingCondition: progressingFalse, + clusterIP: "172.30.0.10", + dnsDaemonset: makeDaemonSet(0, 0), + nrDaemonset: makeDaemonSet(0, 0), + expectRequeue: true, // Requeue while it's still degraded. + expected: operatorv1.ConditionTrue, }, } for _, tc := range testCases { - oldCondition := &operatorv1.OperatorCondition{ - Type: operatorv1.OperatorStatusTypeDegraded, - Status: operatorv1.ConditionUnknown, + actual, retryErr := computeDNSDegradedCondition(tc.oldDegradedCondition, tc.newProgressingCondition, tc.clusterIP, true, tc.dnsDaemonset, 0, time.Time{}) + switch e := retryErr.(type) { + case retryable.Error: + if !tc.expectRequeue { + t.Errorf("%q: expected not to be told to requeue", tc.name) + } + if tc.expectAfter.Seconds() != e.After().Seconds() { + t.Errorf("%q: expected requeue after %s, got %s", tc.name, tc.expectAfter.String(), e.After().String()) + } + case nil: + if tc.expectRequeue { + t.Errorf("%q: expected to be told to requeue", tc.name) + } + default: + t.Errorf("%q: unexpected error: %v", tc.name, retryErr) + continue } - actual := computeDNSDegradedCondition(oldCondition, tc.clusterIP, true, tc.dnsDaemonset, 0, time.Time{}) if actual.Status != tc.expected { t.Errorf("%q: expected status to be %s, got %s: %#v", tc.name, tc.expected, actual.Status, actual) } @@ -360,7 +512,7 @@ func TestComputeDNSProgressingCondition(t *testing.T) { expected: operatorv1.ConditionFalse, }, { - name: "0/6 available DNS pods with MaxUnavailable 10%", + name: "0/6 available DNS pods", clusterIP: "172.30.0.10", dnsDaemonset: makeDaemonSet(6, 0, 0, defaultSelector, defaultTolerations), nrDaemonset: makeDaemonSet(6, 6, 6, defaultSelector, defaultTolerations), @@ -395,15 +547,6 @@ func TestComputeDNSProgressingCondition(t *testing.T) { tolerations: customTolerations, expected: operatorv1.ConditionFalse, }, - { - name: "5/6 
available", - clusterIP: "172.30.0.10", - dnsDaemonset: makeDaemonSet(6, 5, 5, defaultSelector, defaultTolerations), - nrDaemonset: makeDaemonSet(6, 6, 6, defaultSelector, defaultTolerations), - nodeSelector: defaultSelector, - tolerations: defaultTolerations, - expected: operatorv1.ConditionTrue, - }, { name: "6/6 DNS pods missing default node selector", clusterIP: "172.30.0.10", @@ -452,26 +595,28 @@ func TestComputeDNSProgressingCondition(t *testing.T) { } for _, tc := range testCases { - oldCondition := operatorv1.OperatorCondition{ - Type: operatorv1.OperatorStatusTypeProgressing, - Status: operatorv1.ConditionUnknown, - } - dns := &operatorv1.DNS{ - Spec: operatorv1.DNSSpec{ - NodePlacement: operatorv1.DNSNodePlacement{ - NodeSelector: tc.nodeSelector, - Tolerations: tc.tolerations, + t.Run(tc.name, func(t *testing.T) { + oldCondition := operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeProgressing, + Status: operatorv1.ConditionUnknown, + } + dns := &operatorv1.DNS{ + Spec: operatorv1.DNSSpec{ + NodePlacement: operatorv1.DNSNodePlacement{ + NodeSelector: tc.nodeSelector, + Tolerations: tc.tolerations, + }, }, - }, - Status: operatorv1.DNSStatus{ - Conditions: []operatorv1.OperatorCondition{oldCondition}, - }, - } - var reconcileResult reconcile.Result - actual := computeDNSProgressingCondition(&oldCondition, dns, tc.clusterIP, true, tc.dnsDaemonset, true, tc.nrDaemonset, 0, time.Time{}, &reconcileResult) - if actual.Status != tc.expected { - t.Errorf("%q: expected status to be %s, got %s: %#v", tc.name, tc.expected, actual.Status, actual) - } + Status: operatorv1.DNSStatus{ + Conditions: []operatorv1.OperatorCondition{oldCondition}, + }, + } + var reconcileResult reconcile.Result + actual := computeDNSProgressingCondition(&oldCondition, dns, tc.clusterIP, true, tc.dnsDaemonset, true, tc.nrDaemonset, 0, time.Time{}, &reconcileResult) + if actual.Status != tc.expected { + t.Errorf("%q: expected status to be %s, got %s: %#v", tc.name, 
tc.expected, actual.Status, actual) + } + }) } } @@ -499,39 +644,22 @@ func TestSkippingStatusUpdates(t *testing.T) { } } testCases := []struct { - name string - clusterIP string - oldCondition operatorv1.OperatorCondition - currentTime time.Time - toleration time.Duration - expected operatorv1.ConditionStatus - reconcileResult reconcile.Result + name string + clusterIP string + dnsDaemonset *appsv1.DaemonSet + nrDaemonset *appsv1.DaemonSet + oldCondition operatorv1.OperatorCondition + progressingCondition operatorv1.OperatorCondition + currentTime time.Time + toleration time.Duration + expected operatorv1.ConditionStatus + reconcileResult reconcile.Result }{ { - name: "there is a clusterIP, and time toleration doesn't matter, should return Progressing=ConditionFalse", - clusterIP: "1.2.3.4", - oldCondition: operatorv1.OperatorCondition{ - Type: operatorv1.OperatorStatusTypeProgressing, - Status: operatorv1.ConditionFalse, - }, - expected: operatorv1.ConditionFalse, - }, - { - name: "no clusterIP, should return Progressing=ConditionTrue", - clusterIP: "", - oldCondition: operatorv1.OperatorCondition{ - Type: operatorv1.OperatorStatusTypeProgressing, - Status: operatorv1.ConditionFalse, - LastTransitionTime: metav1.NewTime(time.Date(2022, time.Month(5), 19, 1, 10, 44, 0, time.UTC)), - }, - currentTime: time.Date(2022, time.Month(5), 19, 1, 10, 50, 0, time.UTC), - // last-curr = 6s, tolerate 5s, so shouldn't prevent the flap. 
- toleration: 5 * time.Second, - expected: operatorv1.ConditionTrue, - }, - { - name: "no clusterIP, would return Progressing=ConditionTrue, but Progressing was set to false within tolerated duration, so returns Progressing=ConditionFalse", - clusterIP: "", + name: "would return Progressing=ConditionTrue, but Progressing was set to false within tolerated duration, so returns Progressing=ConditionFalse", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 1, 1), + nrDaemonset: makeDaemonSet(6, 6, 6), oldCondition: operatorv1.OperatorCondition{ Type: operatorv1.OperatorStatusTypeProgressing, Status: operatorv1.ConditionFalse, @@ -547,40 +675,110 @@ func TestSkippingStatusUpdates(t *testing.T) { }, }, { - name: "there is a clusterIP, and time toleration doesn't matter, should return Degraded=ConditionFalse", - clusterIP: "1.2.3.4", + name: "there is a clusterIP, and time toleration doesn't matter, should return Degraded=ConditionFalse", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 5, 6), + nrDaemonset: makeDaemonSet(6, 6, 6), oldCondition: operatorv1.OperatorCondition{ Type: operatorv1.OperatorStatusTypeDegraded, Status: operatorv1.ConditionFalse, }, + progressingCondition: operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeProgressing, + Status: operatorv1.ConditionFalse, + }, expected: operatorv1.ConditionFalse, }, { - name: "no clusterIP, should return Degraded=ConditionTrue", - clusterIP: "", + name: "should return Degraded=ConditionTrue because enough time has elapsed for an update", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 0, 6), + nrDaemonset: makeDaemonSet(6, 6, 6), oldCondition: operatorv1.OperatorCondition{ Type: operatorv1.OperatorStatusTypeDegraded, Status: operatorv1.ConditionFalse, LastTransitionTime: metav1.NewTime(time.Date(2022, time.Month(5), 12, 1, 10, 50, 0, time.UTC)), }, + progressingCondition: operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeProgressing, + Status: 
operatorv1.ConditionFalse, + }, currentTime: time.Date(2022, time.Month(5), 19, 1, 10, 50, 0, time.UTC), - // last-curr = 1w, tolerate 5s, so shouldn't prevent the flap. + // last - current = 1w, and we tolerate 5s, so this should change from False to True. toleration: 5 * time.Second, expected: operatorv1.ConditionTrue, + reconcileResult: reconcile.Result{ + Requeue: true, + RequeueAfter: 5 * time.Second, + }, }, { - name: "no clusterIP, would return Progressing=ConditionTrue, but Degraded was set to false within tolerated duration, so returns Progressing=ConditionFalse", - clusterIP: "", + name: "should return Degraded=ConditionFalse because not enough time has elapsed for an update", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 0, 6), + nrDaemonset: makeDaemonSet(6, 6, 6), + oldCondition: operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Date(2022, time.Month(5), 12, 1, 10, 50, 0, time.UTC)), + }, + currentTime: time.Date(2022, time.Month(5), 12, 1, 10, 52, 0, time.UTC), + // last - current = 2s, and we tolerate 5s, so this should not change from False. 
+ toleration: 5 * time.Second, + expected: operatorv1.ConditionFalse, + }, + { + name: "should return Degraded=ConditionFalse because it is Progressing", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 0, 6), + nrDaemonset: makeDaemonSet(6, 6, 6), + oldCondition: operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Date(2022, time.Month(5), 12, 1, 10, 50, 0, time.UTC)), + }, + progressingCondition: operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeProgressing, + Status: operatorv1.ConditionTrue, + }, + currentTime: time.Date(2022, time.Month(5), 19, 1, 10, 50, 0, time.UTC), + toleration: 5 * time.Second, + expected: operatorv1.ConditionFalse, + }, + { + name: "would return Degraded=ConditionTrue, but Degraded was set to false within tolerated duration, so returns Degraded=ConditionFalse", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 1, 6), + nrDaemonset: makeDaemonSet(6, 6, 6), oldCondition: operatorv1.OperatorCondition{ Type: operatorv1.OperatorStatusTypeDegraded, Status: operatorv1.ConditionFalse, LastTransitionTime: metav1.NewTime(time.Date(2022, time.Month(5), 19, 1, 9, 50, 0, time.UTC)), }, currentTime: time.Date(2022, time.Month(5), 19, 1, 10, 50, 0, time.UTC), - // last-curr = 1m, tolerate 1m, so should prevent the flap. - toleration: 1 * time.Minute, + // last-curr = 1m, tolerate 2m, so should prevent the flap. 
+ toleration: 2 * time.Minute, expected: operatorv1.ConditionFalse, }, + { + name: "should return Degraded=ConditionTrue, because Degraded was set to false before the tolerated interval", + clusterIP: "1.2.3.4", + dnsDaemonset: makeDaemonSet(6, 1, 6), + nrDaemonset: makeDaemonSet(6, 6, 6), + oldCondition: operatorv1.OperatorCondition{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: operatorv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Date(2022, time.Month(5), 19, 1, 9, 50, 0, time.UTC)), + }, + currentTime: time.Date(2022, time.Month(5), 19, 1, 11, 50, 0, time.UTC), + // last-curr = 2m, so change to Degraded=ConditionTrue is correct. + toleration: 1 * time.Minute, + expected: operatorv1.ConditionTrue, + reconcileResult: reconcile.Result{ + Requeue: true, + RequeueAfter: 1 * time.Minute, + }, + }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { @@ -595,23 +793,36 @@ func TestSkippingStatusUpdates(t *testing.T) { Conditions: []operatorv1.OperatorCondition{tc.oldCondition}, }, } - dnsDaemonset := makeDaemonSet(6, 6, 6) - nrDaemonset := makeDaemonSet(6, 6, 6) + var actual operatorv1.OperatorCondition var actualReconcileResult reconcile.Result + var retryErr error if tc.oldCondition.Type == operatorv1.OperatorStatusTypeProgressing { - actual = computeDNSProgressingCondition(&tc.oldCondition, dns, tc.clusterIP, true, dnsDaemonset, true, nrDaemonset, tc.toleration, tc.currentTime, &actualReconcileResult) - } else if tc.oldCondition.Type == operatorv1.OperatorStatusTypeDegraded { - actual = computeDNSDegradedCondition(&tc.oldCondition, tc.clusterIP, true, dnsDaemonset, tc.toleration, tc.currentTime) + actual = computeDNSProgressingCondition(&tc.oldCondition, dns, tc.clusterIP, true, tc.dnsDaemonset, true, tc.nrDaemonset, tc.toleration, tc.currentTime, &actualReconcileResult) + if actualReconcileResult != tc.reconcileResult { + t.Errorf("%q: expected requeue to be %+v, got %+v", tc.name, tc.reconcileResult, 
actualReconcileResult) + } } else { - t.Fatalf("Unknown condition type: %s", tc.oldCondition.Type) + actual, retryErr = computeDNSDegradedCondition(&tc.oldCondition, &tc.progressingCondition, tc.clusterIP, true, tc.dnsDaemonset, tc.toleration, tc.currentTime) + switch e := retryErr.(type) { + case retryable.Error: + if !tc.reconcileResult.Requeue { + t.Errorf("%q: expected not to be told to requeue", tc.name) + } + if tc.reconcileResult.RequeueAfter.Seconds() != e.After().Seconds() { + t.Errorf("%q: expected requeue after %s, got %s", tc.name, tc.reconcileResult.RequeueAfter.String(), e.After().String()) + } + case nil: + if tc.reconcileResult.Requeue { + t.Errorf("%q: expected to be told to requeue", tc.name) + } + default: + t.Errorf("%q: unexpected error: %v", tc.name, retryErr) + } } if actual.Status != tc.expected { t.Errorf("%q: expected status to be %s, got %s: %#v", tc.name, tc.expected, actual.Status, actual) } - if actualReconcileResult != tc.reconcileResult { - t.Errorf("%q: expected requeue to be %+v, got %+v", tc.name, tc.reconcileResult, actualReconcileResult) - } }) } } diff --git a/pkg/operator/controller/status/controller.go b/pkg/operator/controller/status/controller.go index 66c381053..d9de75249 100644 --- a/pkg/operator/controller/status/controller.go +++ b/pkg/operator/controller/status/controller.go @@ -181,7 +181,7 @@ func (r *reconciler) Reconcile(ctx context.Context, request reconcile.Request) ( co.Status.Conditions = mergeConditions(co.Status.Conditions, computeOperatorAvailableCondition(state.haveDNS, &state.dns), operatorProgressingCondition, - computeOperatorDegradedCondition(state.haveDNS, &state.dns), + computeOperatorDegradedCondition(state.haveDNS, &state.dns, oldVersions, newVersions, curVersions), ) co.Status.Versions = computeOperatorStatusVersions(curVersions) co.Status.Conditions = mergeConditions(co.Status.Conditions, computeOperatorUpgradeableCondition(&state.dns)) @@ -401,7 +401,9 @@ func 
computeOperatorUpgradeableCondition(dns *operatorv1.DNS) configv1.ClusterOp } // computeOperatorDegradedCondition computes the operator's current Degraded status state. -func computeOperatorDegradedCondition(haveDNS bool, dns *operatorv1.DNS) configv1.ClusterOperatorStatusCondition { +func computeOperatorDegradedCondition(haveDNS bool, dns *operatorv1.DNS, oldVersions, newVersions, curVersions map[string]string) configv1.ClusterOperatorStatusCondition { + var messages []string + if !haveDNS { return configv1.ClusterOperatorStatusCondition{ Type: configv1.OperatorDegraded, @@ -411,18 +413,23 @@ func computeOperatorDegradedCondition(haveDNS bool, dns *operatorv1.DNS) configv } } - var degraded bool - for _, cond := range dns.Status.Conditions { - if cond.Type == operatorv1.OperatorStatusTypeDegraded && cond.Status == operatorv1.ConditionTrue { - degraded = true + // See OCPBUGS-14346. If the operator is upgrading, we can't consider it as degraded. + upgrading, _ := isUpgrading(curVersions, oldVersions, newVersions) + if !upgrading { + var degraded bool + for _, cond := range dns.Status.Conditions { + if cond.Type == operatorv1.OperatorStatusTypeDegraded && cond.Status == operatorv1.ConditionTrue { + degraded = true + messages = append(messages, cond.Message) + } } - } - if degraded { - return configv1.ClusterOperatorStatusCondition{ - Type: configv1.OperatorDegraded, - Status: configv1.ConditionTrue, - Reason: "DNSDegraded", - Message: fmt.Sprintf("DNS %s is degraded", dns.Name), + if degraded { + return configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorDegraded, + Status: configv1.ConditionTrue, + Reason: "DNSDegraded", + Message: fmt.Sprintf("DNS %s is degraded: %s", dns.Name, strings.Join(messages, "\n")), + } } } return configv1.ClusterOperatorStatusCondition{ @@ -468,24 +475,17 @@ func computeOperatorProgressingCondition(haveDNS bool, dns *operatorv1.DNS, oldV } } - upgrading := false - for name, curVersion := range curVersions { - if oldVersion, 
ok := oldVersions[name]; ok && oldVersion != curVersion { - messages = append(messages, fmt.Sprintf("Upgraded %s to %q.", name, curVersion)) - } - if newVersion, ok := newVersions[name]; ok && curVersion != newVersion { - upgrading = true - messages = append(messages, fmt.Sprintf("Upgrading %s to %q.", name, newVersion)) - } - } + // If the operator is upgrading, note it as a Progressing reason and add the upgrading messages + upgrading, upgradingMessages := isUpgrading(curVersions, oldVersions, newVersions) if upgrading { status = configv1.ConditionTrue + messages = append(messages, strings.Join(upgradingMessages, "And ")) progressingReasons = append(progressingReasons, "Upgrading") } if len(progressingReasons) != 0 { progressingCondition.Status = status - progressingCondition.Reason = strings.Join(progressingReasons, "And") + progressingCondition.Reason = strings.Join(progressingReasons, "And ") progressingCondition.Message = strings.Join(messages, "\n") } else { progressingCondition.Status = configv1.ConditionFalse @@ -496,6 +496,22 @@ func computeOperatorProgressingCondition(haveDNS bool, dns *operatorv1.DNS, oldV return progressingCondition } +func isUpgrading(curVersions, oldVersions, newVersions map[string]string) (bool, []string) { + var messages []string + upgrading := false + + for name, curVersion := range curVersions { + if oldVersion, ok := oldVersions[name]; ok && oldVersion != curVersion { + messages = append(messages, fmt.Sprintf("Upgraded %s to %q.", name, curVersion)) + } + if newVersion, ok := newVersions[name]; ok && curVersion != newVersion { + upgrading = true + messages = append(messages, fmt.Sprintf("Upgrading %s to %q.", name, newVersion)) + } + } + return upgrading, messages +} + // computeOldVersions returns a map of operand name to version computed from the // given clusteroperator status. 
func computeOldVersions(oldVersions []configv1.OperandVersion) map[string]string { diff --git a/pkg/operator/controller/status/controller_test.go b/pkg/operator/controller/status/controller_test.go index cb762a583..7f116d85c 100644 --- a/pkg/operator/controller/status/controller_test.go +++ b/pkg/operator/controller/status/controller_test.go @@ -631,3 +631,157 @@ func TestComputeCurrentVersions(t *testing.T) { } } } + +func TestComputeOperatorDegradedCondition(t *testing.T) { + type versions struct { + operator, operand string + } + + testCases := []struct { + description string + dnsMissing bool + oldVersions versions + newVersions versions + curVersions versions + expectDegraded configv1.ConditionStatus + }{ + { + description: "dns does not exist", + dnsMissing: true, + expectDegraded: configv1.ConditionTrue, + }, + { + description: "versions match", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v1", "dns-v1"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operator upgrade in progress", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v1"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operand upgrade in progress", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v1", "dns-v2"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operator and operand upgrade in progress", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v2"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operator upgrade done", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v1"}, + curVersions: versions{"v2", "dns-v1"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operand upgrade done", + oldVersions: 
versions{"v1", "dns-v1"}, + newVersions: versions{"v1", "dns-v2"}, + curVersions: versions{"v1", "dns-v2"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operator and operand upgrade done", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v2"}, + curVersions: versions{"v2", "dns-v2"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operator upgrade in progress, operand upgrade done", + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v2"}, + curVersions: versions{"v1", "dns-v2"}, + expectDegraded: configv1.ConditionFalse, + }, + { + description: "operator upgrade in progress but no dns", + dnsMissing: true, + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v1"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionTrue, + }, + { + description: "operand upgrade in progress, but no dns", + dnsMissing: true, + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v1", "dns-v2"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionTrue, + }, + { + description: "operator and operand upgrade in progress, but no dns", + dnsMissing: true, + oldVersions: versions{"v1", "dns-v1"}, + newVersions: versions{"v2", "dns-v2"}, + curVersions: versions{"v1", "dns-v1"}, + expectDegraded: configv1.ConditionTrue, + }, + } + + for _, tc := range testCases { + var ( + haveDNS bool + dns *operatorv1.DNS + ) + if !tc.dnsMissing { + haveDNS = true + degradedStatus := operatorv1.ConditionFalse + if tc.dnsMissing { + degradedStatus = operatorv1.ConditionTrue + } + dns = &operatorv1.DNS{ + Status: operatorv1.DNSStatus{ + Conditions: []operatorv1.OperatorCondition{{ + Type: operatorv1.OperatorStatusTypeDegraded, + Status: degradedStatus, + }}, + }, + } + } + oldVersions := map[string]string{ + OperatorVersionName: tc.oldVersions.operator, + CoreDNSVersionName: tc.oldVersions.operand, + OpenshiftCLIVersionName: 
tc.oldVersions.operand, + KubeRBACProxyName: tc.oldVersions.operand, + } + newVersions := map[string]string{ + OperatorVersionName: tc.newVersions.operator, + CoreDNSVersionName: tc.newVersions.operand, + OpenshiftCLIVersionName: tc.newVersions.operand, + KubeRBACProxyName: tc.newVersions.operand, + } + curVersions := map[string]string{ + OperatorVersionName: tc.curVersions.operator, + CoreDNSVersionName: tc.curVersions.operand, + OpenshiftCLIVersionName: tc.curVersions.operand, + KubeRBACProxyName: tc.curVersions.operand, + } + + expected := configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorDegraded, + Status: tc.expectDegraded, + } + + actual := computeOperatorDegradedCondition(haveDNS, dns, oldVersions, newVersions, curVersions) + conditionsCmpOpts := []cmp.Option{ + cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime", "Reason", "Message"), + } + if !cmp.Equal(actual, expected, conditionsCmpOpts...) { + t.Errorf("%q: expected %#v, got %#v", tc.description, expected, actual) + } + } +} diff --git a/pkg/util/conditions/conditions.go b/pkg/util/conditions/conditions.go new file mode 100644 index 000000000..fa94328ed --- /dev/null +++ b/pkg/util/conditions/conditions.go @@ -0,0 +1,32 @@ +// Package conditions is copied in from openshift/cluster-ingress-operator and should be moved to library-go to be shared. 
+package conditions + +import ( + "fmt" + + operatorv1 "github.com/openshift/api/operator/v1" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" +) + +func FormatConditions(conditions []*operatorv1.OperatorCondition) string { + var formatted string + if len(conditions) == 0 { + return "" + } + for _, cond := range conditions { + formatted = formatted + fmt.Sprintf(", %s=%s (%s: %s)", cond.Type, cond.Status, cond.Reason, cond.Message) + } + formatted = formatted[2:] + return formatted +} + +func ConditionsEqual(a, b []operatorv1.OperatorCondition) bool { + conditionCmpOpts := []cmp.Option{ + cmpopts.EquateEmpty(), + cmpopts.SortSlices(func(a, b operatorv1.OperatorCondition) bool { return a.Type < b.Type }), + } + + return cmp.Equal(a, b, conditionCmpOpts...) +} diff --git a/pkg/util/retryableerror/retryableerror.go b/pkg/util/retryableerror/retryableerror.go new file mode 100644 index 000000000..c2ac65146 --- /dev/null +++ b/pkg/util/retryableerror/retryableerror.go @@ -0,0 +1,60 @@ +// Package retryableerror is copied in from openshift/cluster-ingress-operator and should be moved to library-go to be shared. +package retryableerror + +import ( + "time" + + utilerrors "k8s.io/apimachinery/pkg/util/errors" +) + +// Error represents an error for an operation that should be retried after the +// specified duration. +type Error interface { + error + // After is the time period after which the operation that caused the + // error should be retried. + After() time.Duration +} + +// New returns a new RetryableError with the given error and time period. +func New(err error, after time.Duration) Error { + return retryableError{err, after} +} + +type retryableError struct { + error + after time.Duration +} + +// After returns the time period after which the operation that caused the error +// should be retried. 
+func (r retryableError) After() time.Duration { + return r.after +} + +// NewMaybeRetryableAggregate converts a slice of errors into a single error +// value. Nil values will be filtered from the slice. If the filtered slice is +// empty, the return value will be nil. Else, if any values are non-retryable +// errors, the result will be an Aggregate interface. Else, if all errors are +// retryable, the result will be a retryable Error interface, with After() equal +// to the minimum of all the errors' After() values. +func NewMaybeRetryableAggregate(errs []error) error { + aggregate := utilerrors.NewAggregate(errs) + if aggregate == nil { + return nil + } + afterHasInitialValue := false + var after time.Duration + for _, err := range aggregate.Errors() { + switch e := err.(type) { + case Error: + if !afterHasInitialValue || e.After() < after { + after = e.After() + } + afterHasInitialValue = true + default: + return aggregate + } + } + return New(aggregate, after) +} diff --git a/pkg/util/retryableerror/retryableerror_test.go b/pkg/util/retryableerror/retryableerror_test.go new file mode 100644 index 000000000..c053416e7 --- /dev/null +++ b/pkg/util/retryableerror/retryableerror_test.go @@ -0,0 +1,65 @@ +package retryableerror + +import ( + "errors" + "testing" + "time" + + utilerrors "k8s.io/apimachinery/pkg/util/errors" +) + +func TestRetryableError(t *testing.T) { + tests := []struct { + name string + errors []error + expectRetryable bool + expectAggregate bool + expectAfter time.Duration + }{ + { + name: "empty list", + }, + { + name: "nil error", + errors: []error{nil}, + }, + { + name: "non-retryable errors", + errors: []error{errors.New("foo"), errors.New("bar")}, + expectAggregate: true, + }, + { + name: "mix of retryable and non-retryable errors", + errors: []error{ + errors.New("foo"), + errors.New("bar"), + New(errors.New("baz"), time.Second*15), + New(errors.New("quux"), time.Minute), + }, + expectAggregate: true, + }, + { + name: "only retryable 
errors", + errors: []error{ + New(errors.New("baz"), time.Second*15), + New(errors.New("quux"), time.Minute), + nil, + }, + expectRetryable: true, + expectAfter: time.Second * 15, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + err := NewMaybeRetryableAggregate(test.errors) + if retryable, gotRetryable := err.(Error); gotRetryable != test.expectRetryable { + t.Errorf("expected retryable %T, got %T: %v", test.expectRetryable, gotRetryable, err) + } else if gotRetryable && retryable.After() != test.expectAfter { + t.Errorf("expected after %v, got %v: %v", test.expectAfter, retryable.After(), err) + } + if _, gotAggregate := err.(utilerrors.Aggregate); gotAggregate != test.expectAggregate { + t.Errorf("expected aggregate %T, got %T: %v", test.expectAggregate, gotAggregate, err) + } + }) + } +}