Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 76 additions & 4 deletions examples/44-node-repair.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# An example ClusterConfig that uses a managed node group with auto repair.
# An example ClusterConfig that demonstrates node repair configuration
# for EKS managed nodegroups with various configuration options.

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
Expand All @@ -8,6 +9,77 @@ metadata:
region: us-west-2

managedNodeGroups:
- name: ng-1
nodeRepairConfig:
enabled: true
# Example 1: Basic node repair
- name: basic-repair-ng
instanceType: m5.large
desiredCapacity: 3
nodeRepairConfig:
enabled: true

# Example 2: Node repair with percentage-based thresholds
- name: percentage-repair-ng
instanceType: m5.large
desiredCapacity: 3
minSize: 1
maxSize: 5
nodeRepairConfig:
enabled: true
# Stop repair actions once more than 20% of nodes are unhealthy
maxUnhealthyNodeThresholdPercentage: 20
# Repair at most 15% of unhealthy nodes in parallel
maxParallelNodesRepairedPercentage: 15

# Example 3: Node repair with count-based thresholds
- name: count-repair-ng
instanceType: m5.xlarge
desiredCapacity: 10
minSize: 5
maxSize: 20
nodeRepairConfig:
enabled: true
# Stop repair actions once more than 3 nodes are unhealthy
maxUnhealthyNodeThresholdCount: 3
# Repair at most 2 unhealthy nodes in parallel
maxParallelNodesRepairedCount: 2

# Example 4: GPU workload with custom repair overrides
- name: gpu-repair-ng
instanceType: g4dn.xlarge
desiredCapacity: 4
minSize: 2
maxSize: 8
nodeRepairConfig:
enabled: true
maxUnhealthyNodeThresholdPercentage: 25
maxParallelNodesRepairedCount: 1
# Custom repair behavior for specific failure scenarios
nodeRepairConfigOverrides:
# Handle GPU-related failures by replacing the node after a short (5 minute) wait
- nodeMonitoringCondition: "AcceleratedInstanceNotReady"
nodeUnhealthyReason: "NvidiaXID13Error"
minRepairWaitTimeMins: 5
repairAction: "Replace"
# Handle network issues by rebooting the node after a 15 minute wait
- nodeMonitoringCondition: "NetworkNotReady"
nodeUnhealthyReason: "InterfaceNotUp"
minRepairWaitTimeMins: 15
repairAction: "Reboot"

# Example 5: Conservative repair for critical workloads
- name: critical-repair-ng
instanceType: c5.2xlarge
desiredCapacity: 6
minSize: 3
maxSize: 12
nodeRepairConfig:
enabled: true
# Conservative settings - stop repair once more than 10% of nodes are unhealthy
maxUnhealthyNodeThresholdPercentage: 10
# Repair only 1 node at a time
maxParallelNodesRepairedCount: 1
nodeRepairConfigOverrides:
# Wait longer before taking action on critical workloads
- nodeMonitoringCondition: "NetworkNotReady"
nodeUnhealthyReason: "InterfaceNotUp"
minRepairWaitTimeMins: 45
repairAction: "Reboot"
149 changes: 149 additions & 0 deletions integration/tests/enhanced_node_repair/enhanced_node_repair_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
//go:build integration
// +build integration

package enhancednoderepair

import (
"fmt"
"os"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

. "github.com/weaveworks/eksctl/integration/runner"
"github.com/weaveworks/eksctl/integration/tests"
"github.com/weaveworks/eksctl/pkg/testutils"
)

// params holds the integration-test parameters shared by every spec in this
// package; it is assigned once in init.
var params *tests.Params

// init prepares the package-level params before any spec runs.
// The two string arguments are passed straight to
// tests.NewParamsWithGivenClusterName; presumably they are a cluster-name
// prefix and the cluster name itself — TODO confirm against that helper.
func init() {
	// Call testing.Init() prior to tests.NewParams(), as otherwise -test.* will not be recognised. See also: https://golang.org/doc/go1.13#testing
	testing.Init()
	params = tests.NewParamsWithGivenClusterName("enhanced-node-repair", "test-enhanced-node-repair")
}

// TestEnhancedNodeRepair is the `go test` entry point for this suite. It
// delegates to testutils.RegisterAndRun, which presumably wires up Gomega and
// runs the Ginkgo specs declared in this package — verify against testutils.
func TestEnhancedNodeRepair(t *testing.T) {
	testutils.RegisterAndRun(t)
}

// Integration specs for the enhanced node repair options. Every spec runs
// `eksctl create cluster` with --dry-run and asserts only on whether the
// command succeeds or fails.
var _ = Describe("(Integration) Enhanced Node Repair Configuration", func() {

	Context("CloudFormation template generation", func() {
		It("should generate correct CloudFormation template with CLI flags", func() {
			By("testing CLI flags generate correct CloudFormation")
			cmd := params.EksctlCreateCmd.WithArgs(
				"cluster",
				"--name", "test-cli-template",
				"--region", params.Region,
				"--managed",
				"--enable-node-repair",
				"--node-repair-max-unhealthy-percentage=25",
				"--node-repair-max-parallel-count=2",
				"--dry-run",
			)
			Expect(cmd).To(RunSuccessfully())
		})

		It("should generate correct CloudFormation template with YAML config", func() {
			By("creating temporary config file")
			// The override uses "Reboot": the EKS API documents Replace,
			// Reboot and NoAction as the repair actions it accepts.
			yamlConfig := fmt.Sprintf(`
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  name: test-yaml-template
  region: %s

managedNodeGroups:
- name: enhanced-ng
  instanceType: t3.medium
  desiredCapacity: 2
  nodeRepairConfig:
    enabled: true
    maxUnhealthyNodeThresholdPercentage: 20
    maxParallelNodesRepairedPercentage: 15
    nodeRepairConfigOverrides:
    - nodeMonitoringCondition: "NetworkNotReady"
      nodeUnhealthyReason: "InterfaceNotUp"
      minRepairWaitTimeMins: 15
      repairAction: "Reboot"
`, params.Region)

			// os.CreateTemp places the file in the platform's temp directory
			// and guarantees a unique name, unlike a hand-built
			// "/tmp/<name>-<seconds>" path. The timestamp stays in the name so
			// leftover files can be dated.
			f, err := os.CreateTemp("", fmt.Sprintf("test-enhanced-node-repair-%d-*.yaml", time.Now().Unix()))
			Expect(err).NotTo(HaveOccurred())
			configFile := f.Name()
			// Best-effort cleanup; a failed removal must not fail the spec.
			defer os.Remove(configFile)

			_, writeErr := f.WriteString(yamlConfig)
			// Close before asserting so the handle is released even if the write failed.
			closeErr := f.Close()
			Expect(writeErr).NotTo(HaveOccurred())
			Expect(closeErr).NotTo(HaveOccurred())

			By("testing YAML config generates correct CloudFormation")
			cmd := params.EksctlCreateCmd.WithArgs(
				"cluster",
				"--config-file", configFile,
				"--dry-run",
			).WithoutArg("--region", params.Region)
			Expect(cmd).To(RunSuccessfully())
		})

		It("should validate backward compatibility with existing config", func() {
			By("testing existing node repair config still works")
			cmd := params.EksctlCreateCmd.WithArgs(
				"cluster",
				"--name", "test-backward-compat",
				"--region", params.Region,
				"--managed",
				"--enable-node-repair",
				"--dry-run",
			)
			Expect(cmd).To(RunSuccessfully())
		})
	})

	Context("error handling", func() {
		It("should handle invalid CLI flag combinations gracefully", func() {
			By("testing with unmanaged nodegroup (should fail)")
			cmd := params.EksctlCreateCmd.WithArgs(
				"cluster",
				"--name", "test-error-handling",
				"--region", params.Region,
				"--managed=false",
				"--enable-node-repair",
				"--dry-run",
			)
			Expect(cmd).NotTo(RunSuccessfully())
		})

		It("should handle invalid YAML configuration gracefully", func() {
			By("creating config file with invalid node repair config")
			invalidConfig := fmt.Sprintf(`
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  name: test-invalid
  region: %s

nodeGroups:
- name: unmanaged-ng
  instanceType: t3.medium
  nodeRepairConfig:
    enabled: true
`, params.Region)

			// Same temp-file handling as the valid-config spec above: unique
			// name in the platform temp directory, removed when the spec ends.
			f, err := os.CreateTemp("", fmt.Sprintf("test-invalid-config-%d-*.yaml", time.Now().Unix()))
			Expect(err).NotTo(HaveOccurred())
			configFile := f.Name()
			// Best-effort cleanup; a failed removal must not fail the spec.
			defer os.Remove(configFile)

			_, writeErr := f.WriteString(invalidConfig)
			// Close before asserting so the handle is released even if the write failed.
			closeErr := f.Close()
			Expect(writeErr).NotTo(HaveOccurred())
			Expect(closeErr).NotTo(HaveOccurred())

			By("testing invalid config is rejected")
			cmd := params.EksctlCreateCmd.WithArgs(
				"cluster",
				"--config-file", configFile,
				"--dry-run",
			).WithoutArg("--region", params.Region)
			// This should fail because nodeRepairConfig is not supported for unmanaged nodegroups
			Expect(cmd).NotTo(RunSuccessfully())
		})
	})
})
68 changes: 67 additions & 1 deletion pkg/apis/eksctl.io/v1alpha5/assets/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2307,10 +2307,43 @@
"type": "boolean",
"description": "Enables the auto repair feature for the nodegroup",
"x-intellij-html-description": "Enables the auto repair feature for the nodegroup"
},
"maxParallelNodesRepairedCount": {
"type": "integer",
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.",
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time."
},
"maxParallelNodesRepairedPercentage": {
"type": "integer",
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.",
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time."
},
"maxUnhealthyNodeThresholdCount": {
"type": "integer",
"description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.",
"x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time."
},
"maxUnhealthyNodeThresholdPercentage": {
"type": "integer",
"description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.",
"x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time."
},
"nodeRepairConfigOverrides": {
"items": {
"$ref": "#/definitions/NodeRepairConfigOverride"
},
"type": "array",
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.",
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values."
}
},
"preferredOrder": [
"enabled"
"enabled",
"maxUnhealthyNodeThresholdPercentage",
"maxUnhealthyNodeThresholdCount",
"maxParallelNodesRepairedPercentage",
"maxParallelNodesRepairedCount",
"nodeRepairConfigOverrides"
],
"additionalProperties": false,
"description": "contains the auto repair configuration for the nodegroup",
Expand Down Expand Up @@ -2436,6 +2469,39 @@
"description": "contains the configuration for updating NodeGroups.",
"x-intellij-html-description": "contains the configuration for updating NodeGroups."
},
"NodeRepairConfigOverride": {
"properties": {
"minRepairWaitTimeMins": {
"type": "integer",
"description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason",
"x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason"
},
"nodeMonitoringCondition": {
"type": "string",
"description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to",
"x-intellij-html-description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to"
},
"nodeUnhealthyReason": {
"type": "string",
"description": "specifies a reason reported by the node monitoring agent that this override would apply to",
"x-intellij-html-description": "specifies a reason reported by the node monitoring agent that this override would apply to"
},
"repairAction": {
"type": "string",
"description": "specifies the repair action to take for nodes when all of the specified conditions are met",
"x-intellij-html-description": "specifies the repair action to take for nodes when all of the specified conditions are met"
}
},
"preferredOrder": [
"nodeMonitoringCondition",
"nodeUnhealthyReason",
"minRepairWaitTimeMins",
"repairAction"
],
"additionalProperties": false,
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.",
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values."
},
"OIDCIdentityProvider": {
"required": [
"name",
Expand Down
42 changes: 42 additions & 0 deletions pkg/apis/eksctl.io/v1alpha5/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1609,6 +1609,48 @@ type (
// Enables the auto repair feature for the nodegroup
// +optional
Enabled *bool `json:"enabled,omitempty"`

// MaxUnhealthyNodeThresholdPercentage specifies a percentage threshold of unhealthy nodes, above which node auto
// repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.
// +optional
MaxUnhealthyNodeThresholdPercentage *int `json:"maxUnhealthyNodeThresholdPercentage,omitempty"`

// MaxUnhealthyNodeThresholdCount specifies a count threshold of unhealthy nodes, above which node auto
// repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.
// +optional
MaxUnhealthyNodeThresholdCount *int `json:"maxUnhealthyNodeThresholdCount,omitempty"`

// MaxParallelNodesRepairedPercentage specifies the maximum number of nodes that can be repaired concurrently or in parallel,
// expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.
// +optional
MaxParallelNodesRepairedPercentage *int `json:"maxParallelNodesRepairedPercentage,omitempty"`

// MaxParallelNodesRepairedCount specifies the maximum number of nodes that can be repaired concurrently or in parallel,
// expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.
// +optional
MaxParallelNodesRepairedCount *int `json:"maxParallelNodesRepairedCount,omitempty"`

// NodeRepairConfigOverrides specifies granular overrides for specific repair actions. These overrides control the
// repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.
// +optional
NodeRepairConfigOverrides []NodeRepairConfigOverride `json:"nodeRepairConfigOverrides,omitempty"`
}

// NodeRepairConfigOverride specifies granular overrides for specific repair actions. These overrides control the
// repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.
//
// None of the JSON tags below carry omitempty, so every field is always written when the struct is serialised.
NodeRepairConfigOverride struct {
	// NodeMonitoringCondition specifies an unhealthy condition reported by the node monitoring agent that this override would apply to
	NodeMonitoringCondition string `json:"nodeMonitoringCondition"`

	// NodeUnhealthyReason specifies a reason reported by the node monitoring agent that this override would apply to
	NodeUnhealthyReason string `json:"nodeUnhealthyReason"`

	// MinRepairWaitTimeMins specifies the minimum time in minutes to wait before attempting to repair a node
	// with this specific NodeMonitoringCondition and NodeUnhealthyReason
	MinRepairWaitTimeMins int `json:"minRepairWaitTimeMins"`

	// RepairAction specifies the repair action to take for nodes when all of the specified conditions are met
	// NOTE(review): the EKS API appears to accept only Replace, Reboot and NoAction here — confirm whether
	// the value is validated elsewhere before it is sent.
	RepairAction string `json:"repairAction"`
}
)

Expand Down
Loading
Loading