Skip to content

Commit fc44df7

Browse files
CKS: create HA cluster with 3 control VMs instead of 2 (#8297)
This PR fixes the test failures seen when upgrading a CKS HA cluster. In production, a CKS HA cluster should also have at least 3 control VMs. An etcd cluster requires at least 3 members to achieve reliable HA: the etcd daemon on each control VM uses the Raft protocol to determine the roles of the nodes, and Raft needs a majority of members to be available. With only 2 control VMs, etcd becomes unreliable during an upgrade of a CKS HA cluster, because taking one control VM out of service leaves the remaining member without a majority.
1 parent 231a9ea commit fc44df7

File tree

4 files changed

+8
-8
lines changed

4 files changed

+8
-8
lines changed

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ private void upgradeKubernetesClusterNodes() {
9191
}
9292
try {
9393
result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
94-
String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-local-data", hostName),
94+
String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
9595
10000, 10000, 60000);
9696
} catch (Exception e) {
9797
logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);

plugins/integrations/kubernetes-service/src/main/resources/script/upgrade-kubernetes.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ if [ -d "$BINARIES_DIR" ]; then
137137

138138
systemctl stop kubelet
139139
cp -a ${BINARIES_DIR}/k8s/{kubelet,kubectl} /opt/bin
140-
chmod +x {kubelet,kubectl}
140+
chmod +x /opt/bin/{kubelet,kubectl}
141141

142142
systemctl daemon-reload
143143
systemctl restart containerd

test/integration/smoke/test_kubernetes_clusters.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,7 @@ def test_06_delete_kubernetes_cluster(self):
526526
@attr(tags=["advanced", "smoke"], required_hardware="true")
527527
@skipTestIf("hypervisorNotSupported")
528528
def test_07_deploy_kubernetes_ha_cluster(self):
529-
"""Test to deploy a new Kubernetes cluster
529+
"""Test to deploy a new HA Kubernetes cluster
530530
531531
# Validate the following:
532532
# 1. createKubernetesCluster should return valid info for new cluster
@@ -537,14 +537,14 @@ def test_07_deploy_kubernetes_ha_cluster(self):
537537
if self.default_network:
538538
self.skipTest("HA cluster on shared network requires external ip address, skipping it")
539539
global k8s_cluster
540-
k8s_cluster = self.getValidKubernetesCluster(1, 2)
540+
k8s_cluster = self.getValidKubernetesCluster(1, 3)
541541
self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id)
542542
return
543543

544544
@attr(tags=["advanced", "smoke"], required_hardware="true")
545545
@skipTestIf("hypervisorNotSupported")
546546
def test_08_upgrade_kubernetes_ha_cluster(self):
547-
"""Test to upgrade a Kubernetes cluster to newer version
547+
"""Test to upgrade a HA Kubernetes cluster to newer version
548548
549549
# Validate the following:
550550
# 1. upgradeKubernetesCluster should return valid info for the cluster
@@ -554,7 +554,7 @@ def test_08_upgrade_kubernetes_ha_cluster(self):
554554
if self.default_network:
555555
self.skipTest("HA cluster on shared network requires external ip address, skipping it")
556556
global k8s_cluster
557-
k8s_cluster = self.getValidKubernetesCluster(1, 2, version=self.kubernetes_version_v1)
557+
k8s_cluster = self.getValidKubernetesCluster(1, 3, version=self.kubernetes_version_v1)
558558
time.sleep(self.services["sleep"])
559559

560560
self.debug("Upgrading HA Kubernetes cluster with ID: %s" % k8s_cluster.id)
@@ -581,7 +581,7 @@ def test_09_delete_kubernetes_ha_cluster(self):
581581
if self.default_network:
582582
self.skipTest("HA cluster on shared network requires external ip address, skipping it")
583583
global k8s_cluster
584-
k8s_cluster = self.getValidKubernetesCluster(1, 2)
584+
k8s_cluster = self.getValidKubernetesCluster(1, 3)
585585

586586
self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id)
587587
return

ui/src/views/compute/CreateKubernetesCluster.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ export default {
278278
initForm () {
279279
this.formRef = ref()
280280
this.form = reactive({
281-
controlnodes: 2,
281+
controlnodes: 3,
282282
size: 1,
283283
noderootdisksize: 8
284284
})

0 commit comments

Comments
 (0)