---
# An example ClusterConfig that demonstrates node repair configuration
# for EKS managed nodegroups with various configuration options.

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  # NOTE(review): the cluster `name` field was outside the visible diff hunk —
  # restore it from the original file before using this config.
  region: us-west-2

managedNodeGroups:
  # Example 1: Basic node repair — enable with all defaults.
  - name: basic-repair-ng
    instanceType: m5.large
    desiredCapacity: 3
    nodeRepairConfig:
      enabled: true

  # Example 2: Node repair with percentage-based thresholds.
  - name: percentage-repair-ng
    instanceType: m5.large
    desiredCapacity: 3
    minSize: 1
    maxSize: 5
    nodeRepairConfig:
      enabled: true
      # Stop repair actions when 20% of nodes are unhealthy
      maxUnhealthyNodeThresholdPercentage: 20
      # Repair at most 15% of unhealthy nodes in parallel
      maxParallelNodesRepairedPercentage: 15

  # Example 3: Node repair with count-based thresholds.
  - name: count-repair-ng
    instanceType: m5.xlarge
    desiredCapacity: 10
    minSize: 5
    maxSize: 20
    nodeRepairConfig:
      enabled: true
      # Stop repair actions when 3 nodes are unhealthy
      maxUnhealthyNodeThresholdCount: 3
      # Repair at most 2 unhealthy nodes in parallel
      maxParallelNodesRepairedCount: 2

  # Example 4: GPU workload with custom repair overrides.
  - name: gpu-repair-ng
    instanceType: g4dn.xlarge
    desiredCapacity: 4
    minSize: 2
    maxSize: 8
    nodeRepairConfig:
      enabled: true
      maxUnhealthyNodeThresholdPercentage: 25
      maxParallelNodesRepairedCount: 1
      # Custom repair behavior for specific failure scenarios
      nodeRepairConfigOverrides:
        # Handle GPU-related failures with immediate termination
        - nodeMonitoringCondition: "AcceleratedHardwareNotReady"
          nodeUnhealthyReason: "NvidiaXID13Error"
          minRepairWaitTimeMins: 5
          repairAction: "Terminate"
        # Handle network issues with restart first
        - nodeMonitoringCondition: "NetworkingNotReady"
          nodeUnhealthyReason: "InterfaceNotUp"
          minRepairWaitTimeMins: 15
          repairAction: "Restart"

  # Example 5: Conservative repair for critical workloads.
  - name: critical-repair-ng
    instanceType: c5.2xlarge
    desiredCapacity: 6
    minSize: 3
    maxSize: 12
    nodeRepairConfig:
      enabled: true
      # Conservative settings - stop repair when only 10% of nodes are unhealthy
      maxUnhealthyNodeThresholdPercentage: 10
      # Repair only 1 node at a time
      maxParallelNodesRepairedCount: 1
      nodeRepairConfigOverrides:
        # Wait longer before taking action on critical workloads
        - nodeMonitoringCondition: "NetworkingNotReady"
          nodeUnhealthyReason: "InterfaceNotUp"
          minRepairWaitTimeMins: 45
          repairAction: "Restart"