From df74afebc966bf54b1f2c840bc3cfce4081e54dd Mon Sep 17 00:00:00 2001
From: Jamo Luhrsen <jluhrsen@gmail.com>
Date: Fri, 19 Dec 2025 15:30:59 -0800
Subject: [PATCH] add retry logic for transient API server errors

CNO goes Degraded on the first connection issue with the API
server, even though such failures can occur briefly during a new
rollout. This is seen periodically in test cases that deliberately
trigger a new rollout, such as this one [0]. Even that test case
retries [1] for this reason.

[0] https://github.com/openshift/origin/blob/3854d32174b5e9ddaded1dfcc8a865bb28ca04ad/test/extended/networking/services.go#L26
[1] https://github.com/openshift/origin/blob/3854d32174b5e9ddaded1dfcc8a865bb28ca04ad/test/extended/networking/services.go#L57-L63

Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com>
---
 pkg/apply/apply.go | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/pkg/apply/apply.go b/pkg/apply/apply.go
index a98a06fe61..2b93dc06a6 100644
--- a/pkg/apply/apply.go
+++ b/pkg/apply/apply.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"log"
 	"strings"
+	"time"
 
 	cnoclient "github.com/openshift/cluster-network-operator/pkg/client"
 	"github.com/openshift/cluster-network-operator/pkg/names"
@@ -16,6 +17,8 @@ import (
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/util/retry"
 	utilpointer "k8s.io/utils/ptr"
 )
 
@@ -123,8 +126,29 @@ func ApplyObject(ctx context.Context, client cnoclient.Client, obj Object, subco
 		return fmt.Errorf("could not encode for patching: %w", err)
 	}
 
-	_, err = clusterClient.Dynamic().Resource(rm.Resource).Namespace(namespace).Patch(ctx, name, types.ApplyPatchType, data, patchOptions, subresources...)
+	// Retry with backoff to handle transient API server issues.
+	var backoff = wait.Backoff{
+		Steps:    6,
+		Duration: 5 * time.Second,
+		Factor:   1.0,
+		Jitter:   0.1,
+	}
+
+	var attempt int
+	err = retry.OnError(backoff, func(err error) bool {
+		// Stop retrying when the error is a context cancellation or deadline expiry (e.g. graceful shutdown)
+		return !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded)
+	}, func() error {
+		attempt++
+		_, err := clusterClient.Dynamic().Resource(rm.Resource).Namespace(namespace).Patch(ctx, name, types.ApplyPatchType, data, patchOptions, subresources...)
+		if err != nil {
+			log.Printf("Error applying %s (attempt %d/%d): %v", objDesc, attempt, backoff.Steps, err)
+		}
+		return err
+	})
+
 	if err != nil {
+		log.Printf("Failed to apply %s after %d attempts", objDesc, attempt)
 		return fmt.Errorf("failed to apply / update %s: %w", objDesc, err)
 	}