Skip to content

Commit 3d28712

Browse files
committed
update user docs
1 parent 236fe6f commit 3d28712

File tree

4 files changed

+92
-121
lines changed

4 files changed

+92
-121
lines changed

examples/44-enhanced-node-repair.yaml

Lines changed: 0 additions & 103 deletions
This file was deleted.

examples/44-node-repair.yaml

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# An example ClusterConfig that uses a managed node group with auto repair.
1+
# An example ClusterConfig that demonstrates node repair configuration
2+
# for EKS managed nodegroups with various configuration options.
23

34
apiVersion: eksctl.io/v1alpha5
45
kind: ClusterConfig
@@ -8,6 +9,77 @@ metadata:
89
region: us-west-2
910

1011
managedNodeGroups:
11-
- name: ng-1
12-
nodeRepairConfig:
13-
enabled: true
12+
# Example 1: Basic node repair
13+
- name: basic-repair-ng
14+
instanceType: m5.large
15+
desiredCapacity: 3
16+
nodeRepairConfig:
17+
enabled: true
18+
19+
# Example 2: Node repair with percentage-based thresholds
20+
- name: percentage-repair-ng
21+
instanceType: m5.large
22+
desiredCapacity: 3
23+
minSize: 1
24+
maxSize: 5
25+
nodeRepairConfig:
26+
enabled: true
27+
# Stop repair actions when more than 20% of nodes are unhealthy
28+
maxUnhealthyNodeThresholdPercentage: 20
29+
# Repair at most 15% of unhealthy nodes in parallel
30+
maxParallelNodesRepairedPercentage: 15
31+
32+
# Example 3: Node repair with count-based thresholds
33+
- name: count-repair-ng
34+
instanceType: m5.xlarge
35+
desiredCapacity: 10
36+
minSize: 5
37+
maxSize: 20
38+
nodeRepairConfig:
39+
enabled: true
40+
# Stop repair actions when more than 3 nodes are unhealthy
41+
maxUnhealthyNodeThresholdCount: 3
42+
# Repair at most 2 unhealthy nodes in parallel
43+
maxParallelNodesRepairedCount: 2
44+
45+
# Example 4: GPU workload with custom repair overrides
46+
- name: gpu-repair-ng
47+
instanceType: g4dn.xlarge
48+
desiredCapacity: 4
49+
minSize: 2
50+
maxSize: 8
51+
nodeRepairConfig:
52+
enabled: true
53+
maxUnhealthyNodeThresholdPercentage: 25
54+
maxParallelNodesRepairedCount: 1
55+
# Custom repair behavior for specific failure scenarios
56+
nodeRepairConfigOverrides:
57+
# Handle GPU-related failures by terminating the node after a short wait
58+
- nodeMonitoringCondition: "AcceleratedInstanceNotReady"
59+
nodeUnhealthyReason: "NvidiaXID13Error"
60+
minRepairWaitTimeMins: 5
61+
repairAction: "Terminate"
62+
# Handle network issues by restarting the node after a longer wait
63+
- nodeMonitoringCondition: "NetworkNotReady"
64+
nodeUnhealthyReason: "InterfaceNotUp"
65+
minRepairWaitTimeMins: 15
66+
repairAction: "Restart"
67+
68+
# Example 5: Conservative repair for critical workloads
69+
- name: critical-repair-ng
70+
instanceType: c5.2xlarge
71+
desiredCapacity: 6
72+
minSize: 3
73+
maxSize: 12
74+
nodeRepairConfig:
75+
enabled: true
76+
# Conservative settings - stop repair once more than 10% of nodes are unhealthy
77+
maxUnhealthyNodeThresholdPercentage: 10
78+
# Repair only 1 node at a time
79+
maxParallelNodesRepairedCount: 1
80+
nodeRepairConfigOverrides:
81+
# Wait longer before taking action on critical workloads
82+
- nodeMonitoringCondition: "NetworkNotReady"
83+
nodeUnhealthyReason: "InterfaceNotUp"
84+
minRepairWaitTimeMins: 45
85+
repairAction: "Restart"

pkg/apis/eksctl.io/v1alpha5/assets/schema.json

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2304,31 +2304,31 @@
23042304
},
23052305
"maxParallelNodesRepairedCount": {
23062306
"type": "integer",
2307-
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedPercentage at the same time",
2308-
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedPercentage at the same time"
2307+
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.",
2308+
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time."
23092309
},
23102310
"maxParallelNodesRepairedPercentage": {
23112311
"type": "integer",
2312-
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedCount at the same time",
2313-
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set maxParallelNodesRepairedCount at the same time"
2312+
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.",
2313+
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time."
23142314
},
23152315
"maxUnhealthyNodeThresholdCount": {
23162316
"type": "integer",
2317-
"description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdPercentage at the same time",
2318-
"x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdPercentage at the same time"
2317+
"description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.",
2318+
"x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time."
23192319
},
23202320
"maxUnhealthyNodeThresholdPercentage": {
23212321
"type": "integer",
2322-
"description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdCount at the same time",
2323-
"x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set maxUnhealthyNodeThresholdCount at the same time"
2322+
"description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.",
2323+
"x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time."
23242324
},
23252325
"nodeRepairConfigOverrides": {
23262326
"items": {
23272327
"$ref": "#/definitions/NodeRepairConfigOverride"
23282328
},
23292329
"type": "array",
2330-
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values",
2331-
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values"
2330+
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.",
2331+
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values."
23322332
}
23332333
},
23342334
"preferredOrder": [
@@ -2467,8 +2467,8 @@
24672467
"properties": {
24682468
"minRepairWaitTimeMins": {
24692469
"type": "integer",
2470-
"description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific nodeMonitoringCondition and nodeUnhealthyReason",
2471-
"x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific nodeMonitoringCondition and nodeUnhealthyReason"
2470+
"description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason",
2471+
"x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason"
24722472
},
24732473
"nodeMonitoringCondition": {
24742474
"type": "string",
@@ -2493,8 +2493,8 @@
24932493
"repairAction"
24942494
],
24952495
"additionalProperties": false,
2496-
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values",
2497-
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values"
2496+
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.",
2497+
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values."
24982498
},
24992499
"OIDCIdentityProvider": {
25002500
"required": [

userdocs/src/usage/nodegroup-node-repair-config.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ managedNodeGroups:
130130
131131
## Complete Configuration Examples
132132
133+
For a comprehensive example with all configuration options, see [examples/44-node-repair.yaml](https://github.com/eksctl-io/eksctl/blob/main/examples/44-node-repair.yaml).
134+
133135
### Example 1: Basic repair with percentage thresholds
134136
135137
```yaml

0 commit comments

Comments
 (0)