Skip to content

Commit dd0058d

Browse files
committed
Add retry logic for transient API server errors
CNO goes Degraded on the first connection issue with the API server, but such connection issues can occur briefly during a new rollout. This is seen periodically in test cases that trigger a new rollout on purpose, like this one [0]. Even the test case itself retries [1] because of this.
[0] https://github.com/openshift/origin/blob/3854d32174b5e9ddaded1dfcc8a865bb28ca04ad/test/extended/networking/services.go#L26
[1] https://github.com/openshift/origin/blob/3854d32174b5e9ddaded1dfcc8a865bb28ca04ad/test/extended/networking/services.go#L57-L63
Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com>
1 parent 9d8ab48 commit dd0058d

File tree

1 file changed

+22
-1
lines changed

1 file changed

+22
-1
lines changed

pkg/apply/apply.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"log"
77
"strings"
8+
"time"
89

910
cnoclient "github.com/openshift/cluster-network-operator/pkg/client"
1011
"github.com/openshift/cluster-network-operator/pkg/names"
@@ -16,6 +17,8 @@ import (
1617
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
1718
"k8s.io/apimachinery/pkg/runtime"
1819
"k8s.io/apimachinery/pkg/types"
20+
"k8s.io/apimachinery/pkg/util/wait"
21+
"k8s.io/client-go/util/retry"
1922
utilpointer "k8s.io/utils/ptr"
2023
)
2124

@@ -123,8 +126,26 @@ func ApplyObject(ctx context.Context, client cnoclient.Client, obj Object, subco
123126
return fmt.Errorf("could not encode for patching: %w", err)
124127
}
125128

126-
_, err = clusterClient.Dynamic().Resource(rm.Resource).Namespace(namespace).Patch(ctx, name, types.ApplyPatchType, data, patchOptions, subresources...)
129+
// Retry on any error, up to 6 attempts at a fixed ~5s interval (Factor 1.0, so the delay does not grow), to ride out transient API server issues.
130+
var backoff = wait.Backoff{
131+
Steps: 6,
132+
Duration: 5 * time.Second,
133+
Factor: 1.0,
134+
Jitter: 0.1,
135+
}
136+
137+
var attempt int
138+
err = retry.OnError(backoff, func(error) bool { return true }, func() error {
139+
attempt++
140+
_, err := clusterClient.Dynamic().Resource(rm.Resource).Namespace(namespace).Patch(ctx, name, types.ApplyPatchType, data, patchOptions, subresources...)
141+
if err != nil {
142+
log.Printf("Error applying %s (attempt %d/%d): %v", objDesc, attempt, backoff.Steps, err)
143+
}
144+
return err
145+
})
146+
127147
if err != nil {
148+
log.Printf("Failed to apply %s after %d attempts", objDesc, attempt)
128149
return fmt.Errorf("failed to apply / update %s: %w", objDesc, err)
129150
}
130151

0 commit comments

Comments
 (0)