From df74afebc966bf54b1f2c840bc3cfce4081e54dd Mon Sep 17 00:00:00 2001 From: Jamo Luhrsen Date: Fri, 19 Dec 2025 15:30:59 -0800 Subject: [PATCH] add retry logic for transient api server errors CNO goes Degraded on the first connection issue with the API server, but such connection issues can occur briefly during a new rollout. This is seen periodically in test cases that intentionally trigger a new rollout, such as this one [0]. Even the test case itself does a retry [1] because of this. [0] https://github.com/openshift/origin/blob/3854d32174b5e9ddaded1dfcc8a865bb28ca04ad/test/extended/networking/services.go#L26 [1] https://github.com/openshift/origin/blob/3854d32174b5e9ddaded1dfcc8a865bb28ca04ad/test/extended/networking/services.go#L57-L63 Signed-off-by: Jamo Luhrsen --- pkg/apply/apply.go | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/pkg/apply/apply.go b/pkg/apply/apply.go index a98a06fe61..2b93dc06a6 100644 --- a/pkg/apply/apply.go +++ b/pkg/apply/apply.go @@ -5,6 +5,7 @@ import ( "fmt" "log" "strings" + "time" cnoclient "github.com/openshift/cluster-network-operator/pkg/client" "github.com/openshift/cluster-network-operator/pkg/names" @@ -16,6 +17,8 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" utilpointer "k8s.io/utils/ptr" ) @@ -123,8 +126,29 @@ func ApplyObject(ctx context.Context, client cnoclient.Client, obj Object, subco return fmt.Errorf("could not encode for patching: %w", err) } - _, err = clusterClient.Dynamic().Resource(rm.Resource).Namespace(namespace).Patch(ctx, name, types.ApplyPatchType, data, patchOptions, subresources...) + // Retry with backoff to handle transient API server issues.
+ var backoff = wait.Backoff{ + Steps: 6, + Duration: 5 * time.Second, + Factor: 1.0, + Jitter: 0.1, + } + + var attempt int + err = retry.OnError(backoff, func(err error) bool { + // Don't retry on context cancellation (graceful shutdown) + return !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) + }, func() error { + attempt++ + _, err := clusterClient.Dynamic().Resource(rm.Resource).Namespace(namespace).Patch(ctx, name, types.ApplyPatchType, data, patchOptions, subresources...) + if err != nil { + log.Printf("Error applying %s (attempt %d/%d): %v", objDesc, attempt, backoff.Steps, err) + } + return err + }) + if err != nil { + log.Printf("Failed to apply %s after %d attempts", objDesc, attempt) return fmt.Errorf("failed to apply / update %s: %w", objDesc, err) }