Skip to content

Commit db5ca15

Browse files
committed
update user docs
1 parent 236fe6f commit db5ca15

File tree

5 files changed

+92
-595
lines changed

5 files changed

+92
-595
lines changed

examples/44-enhanced-node-repair.yaml

Lines changed: 0 additions & 103 deletions
This file was deleted.

examples/44-node-repair.yaml

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# An example ClusterConfig that uses a managed node group with auto repair.
1+
# An example ClusterConfig that demonstrates node repair configuration
2+
# for EKS managed nodegroups with various configuration options.
23

34
apiVersion: eksctl.io/v1alpha5
45
kind: ClusterConfig
@@ -8,6 +9,77 @@ metadata:
89
region: us-west-2
910

1011
managedNodeGroups:
11-
- name: ng-1
12-
nodeRepairConfig:
13-
enabled: true
12+
# Example 1: Basic node repair
13+
- name: basic-repair-ng
14+
instanceType: m5.large
15+
desiredCapacity: 3
16+
nodeRepairConfig:
17+
enabled: true
18+
19+
# Example 2: Node repair with percentage-based thresholds
20+
- name: percentage-repair-ng
21+
instanceType: m5.large
22+
desiredCapacity: 3
23+
minSize: 1
24+
maxSize: 5
25+
nodeRepairConfig:
26+
enabled: true
27+
# Stop repair actions when 20% of nodes are unhealthy
28+
maxUnhealthyNodeThresholdPercentage: 20
29+
# Repair at most 15% of unhealthy nodes in parallel
30+
maxParallelNodesRepairedPercentage: 15
31+
32+
# Example 3: Node repair with count-based thresholds
33+
- name: count-repair-ng
34+
instanceType: m5.xlarge
35+
desiredCapacity: 10
36+
minSize: 5
37+
maxSize: 20
38+
nodeRepairConfig:
39+
enabled: true
40+
# Stop repair actions when 3 nodes are unhealthy
41+
maxUnhealthyNodeThresholdCount: 3
42+
# Repair at most 2 unhealthy nodes in parallel
43+
maxParallelNodesRepairedCount: 2
44+
45+
# Example 4: GPU workload with custom repair overrides
46+
- name: gpu-repair-ng
47+
instanceType: g4dn.xlarge
48+
desiredCapacity: 4
49+
minSize: 2
50+
maxSize: 8
51+
nodeRepairConfig:
52+
enabled: true
53+
maxUnhealthyNodeThresholdPercentage: 25
54+
maxParallelNodesRepairedCount: 1
55+
# Custom repair behavior for specific failure scenarios
56+
nodeRepairConfigOverrides:
57+
# Handle GPU-related failures by terminating the node after a short 5-minute wait
58+
- nodeMonitoringCondition: "AcceleratedInstanceNotReady"
59+
nodeUnhealthyReason: "NvidiaXID13Error"
60+
minRepairWaitTimeMins: 5
61+
repairAction: "Terminate"
62+
# Handle network issues by restarting the node after a 15-minute wait
63+
- nodeMonitoringCondition: "NetworkNotReady"
64+
nodeUnhealthyReason: "InterfaceNotUp"
65+
minRepairWaitTimeMins: 15
66+
repairAction: "Restart"
67+
68+
# Example 5: Conservative repair for critical workloads
69+
- name: critical-repair-ng
70+
instanceType: c5.2xlarge
71+
desiredCapacity: 6
72+
minSize: 3
73+
maxSize: 12
74+
nodeRepairConfig:
75+
enabled: true
76+
# Conservative settings - stop repair when only 10% of nodes are unhealthy
77+
maxUnhealthyNodeThresholdPercentage: 10
78+
# Repair only 1 node at a time
79+
maxParallelNodesRepairedCount: 1
80+
nodeRepairConfigOverrides:
81+
# Wait longer before taking action on critical workloads
82+
- nodeMonitoringCondition: "NetworkNotReady"
83+
nodeUnhealthyReason: "InterfaceNotUp"
84+
minRepairWaitTimeMins: 45
85+
repairAction: "Restart"

0 commit comments

Comments
 (0)