From 99e501ea9ea6fcbfee62a42bdf567ad180d1b5c7 Mon Sep 17 00:00:00 2001 From: shreyas Date: Tue, 27 Jan 2026 21:42:20 +0000 Subject: [PATCH 1/7] Adding unique node subnet for nodepools --- swiftv2kubeobjects/createclusterforping.sh | 66 ++++++++++++++++++---- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/swiftv2kubeobjects/createclusterforping.sh b/swiftv2kubeobjects/createclusterforping.sh index 38616bcd9a..88c9efeab8 100755 --- a/swiftv2kubeobjects/createclusterforping.sh +++ b/swiftv2kubeobjects/createclusterforping.sh @@ -96,6 +96,9 @@ create_and_verify_nodepool() { nodepool_cmd+=" --node-taints \"${taints}\"" fi + # Set max-surge to 1 node instead of percentage + nodepool_cmd+=" --max-surge 1" + # Add any extra arguments if [[ -n "$extra_args" ]]; then nodepool_cmd+=" ${extra_args}" @@ -320,16 +323,42 @@ vnetName="net" vnetAddressSpaceCIDR="10.0.0.0/9" az network vnet create -n ${vnetName} -g ${RG} --address-prefixes ${vnetAddressSpaceCIDR} -l ${LOCATION} -echo "create vnet subnet" -vnetSubnetNameNodes="nodes" +echo "create vnet subnets" + +# Pod subnet (shared across all pools) vnetSubnetNamePods="pods" -vnetSubnetNodesCIDR="10.0.0.0/16" vnetSubnetPodsCIDR="10.1.0.0/16" podCIDR="10.128.0.0/9" natGatewayID=$(az network nat gateway list -g ${RG} | jq -r '.[].id') -az network vnet subnet create -n ${vnetSubnetNameNodes} --vnet-name ${vnetName} --address-prefixes ${vnetSubnetNodesCIDR} --nat-gateway ${natGatewayID} --default-outbound-access false -g ${RG} + +# Create shared pod subnet az network vnet subnet create -n ${vnetSubnetNamePods} --vnet-name ${vnetName} --address-prefixes ${vnetSubnetPodsCIDR} --nat-gateway $NAT_GW_NAME --default-outbound-access false -g ${RG} +# ============================================================================= +# NODE SUBNETS: Each nodepool gets its own /20 subnet (4096 addresses; 4091 usable, since Azure reserves 5 per subnet) +# Layout within 10.0.0.0/16: +# - 10.0.0.0/20 : system pool (nodes-system) +# - 10.0.16.0/20 : userpool1 (nodes-userpool1) +# - 
10.0.32.0/20 : userpool2 (nodes-userpool2) +# - ... up to 10.0.128.0/20 : userpool8 +# - 10.0.144.0/20 : buffer pool (nodes-buffer) when USER_NODEPOOL_COUNT=8 (buffer index = USER_NODEPOOL_COUNT + 1) +# ============================================================================= + +# Helper function to create node subnet with calculated CIDR: $1 = subnet name, $2 = subnet index (0-15; third octet = index * 16) +create_node_subnet() { + local subnet_name=$1 + local subnet_index=$2 # 0=system, 1-8=userpools, 9=buffer + local third_octet=$((subnet_index * 16)) + local subnet_cidr="10.0.${third_octet}.0/20" + + echo "Creating node subnet: ${subnet_name} with CIDR: ${subnet_cidr}" + az network vnet subnet create -n ${subnet_name} --vnet-name ${vnetName} --address-prefixes ${subnet_cidr} --nat-gateway ${natGatewayID} --default-outbound-access false -g ${RG} +} + +# Create system pool node subnet (index 0) +vnetSubnetNameNodesSystem="nodes-system" +create_node_subnet "${vnetSubnetNameNodesSystem}" 0 + # az role assignment create --assignee d0fdeb79-ee9b-464c-ae0f-ba72d307208d --role "Network Contributor" --scope /subscriptions/${SUBSCRIPTION}/resourceGroups/$RG/providers/Microsoft.Network/virtualNetworks/$vnetName # set az account to subnetDelegator az account set --subscription $SD_SUB @@ -344,11 +373,11 @@ az account set --subscription $SUBSCRIPTION # create cluster echo "create cluster" vnetID=$(az network vnet list -g ${RG} | jq -r '.[].id') -nodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${vnetSubnetNameNodes}']" | jq -r '.[].id') +systemNodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${vnetSubnetNameNodesSystem}']" | jq -r '.[].id') podSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${vnetSubnetNamePods}']" | jq -r '.[].id') # Call the function to create the cluster (ACR permissions will be automatically configured) -create_aks_cluster "${CLUSTER}" "${RG}" "${LOCATION}" "${nodeSubnetID}" "${podSubnetID}" +create_aks_cluster "${CLUSTER}" "${RG}" "${LOCATION}" 
"${systemNodeSubnetID}" "${podSubnetID}" # Wait for cluster to be ready with retry logic and timeout echo "Waiting for cluster to be ready..." @@ -447,10 +476,16 @@ for i in $(seq 1 $USER_NODEPOOL_COUNT); do fi pool_name="userpool${i}" + + # Create dedicated node subnet for this user pool (index i corresponds to userpool number) + userPoolSubnetName="nodes-${pool_name}" + create_node_subnet "${userPoolSubnetName}" ${i} + userPoolNodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${userPoolSubnetName}']" | jq -r '.[].id') + labels="slo=true testscenario=swiftv2 agentpool=${pool_name}" taints="slo=true:NoSchedule" - echo "Creating user nodepool $pool_name (1/${USER_NODEPOOL_COUNT} initial node)" - if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${nodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then + echo "Creating user nodepool $pool_name (1/${USER_NODEPOOL_COUNT} initial node) with subnet ${userPoolSubnetName}" + if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${userPoolNodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then echo "ERROR: Failed to create user nodepool ${pool_name}" exit 1 fi @@ -464,10 +499,17 @@ else # The scale-cluster.sh script will scale userpoolBuffer to handle any shortfall echo "Creating buffer nodepool with $INITIAL_USER_NODES node (will be scaled later if needed)..." - pool_name="userpoolBuffer" - labels="slo=true testscenario=swiftv2 agentpool=${pool_name}" - taints="slo=true:NoSchedule" - if ! 
create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${nodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then + pool_name="userpoolBuffer" + + # Create dedicated node subnet for buffer pool (index = USER_NODEPOOL_COUNT + 1 to follow user pools) + bufferSubnetIndex=$((USER_NODEPOOL_COUNT + 1)) + bufferPoolSubnetName="nodes-buffer" + create_node_subnet "${bufferPoolSubnetName}" ${bufferSubnetIndex} + bufferPoolNodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${bufferPoolSubnetName}']" | jq -r '.[].id') + + labels="slo=true testscenario=swiftv2 agentpool=${pool_name}" + taints="slo=true:NoSchedule" + if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${bufferPoolNodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then echo "ERROR: Failed to create buffer nodepool" exit 1 fi From 324ed5ede454cf7f8682f344df60c7d354365756 Mon Sep 17 00:00:00 2001 From: shreyas Date: Tue, 27 Jan 2026 22:58:18 +0000 Subject: [PATCH 2/7] Adding settings for 4000 nodes for static reservation --- .../swiftv2-staticres-gradual-matrix.yml | 42 ------------------- pipelines/system/new-pipeline-test.yml | 2 +- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml b/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml index 3a8347decf..4cf0f367cf 100644 --- a/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml +++ b/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml @@ -22,27 +22,6 @@ parameters: - name: matrix type: object default: - pps=35_ppp=1_pods=70_nodes=10: - node_count: 10 - pods_per_step: 35 - pods_per_pni: 1 - test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=35_PPP=1_Pods=70_Nodes=10 - cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml - pps=50_ppp=1_pods=3500_nodes=500: - node_count: 500 - pods_per_step: 50 - 
pods_per_pni: 1 - test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=3500_Nodes=500 - cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml - pps=50_ppp=1_pods=7000_nodes=1000: - node_count: 1000 - pods_per_step: 50 - pods_per_pni: 1 - test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=7000_Nodes=1000 - cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml pps=50_ppp=1_pods=14000_nodes=2000: node_count: 2000 pods_per_step: 50 @@ -50,27 +29,6 @@ parameters: test_timeout: "2m" test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=14000_Nodes=2000 cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml - pps=50_ppp=1_pods=17500_nodes=2500: - node_count: 2500 - pods_per_step: 50 - pods_per_pni: 1 - test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=17500_Nodes=2500 - cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml - pps=50_ppp=1_pods=21000_nodes=3000: - node_count: 3000 - pods_per_step: 50 - pods_per_pni: 1 - test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=21000_Nodes=3000 - cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml - pps=50_ppp=1_pods=35000_nodes=5000: - node_count: 5000 - pods_per_step: 50 - pods_per_pni: 1 - test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=35000_Nodes=5000 - cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml jobs: - template: /jobs/competitive-test.yml diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 432045663e..02a472ba86 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -101,7 +101,7 @@ stages: base_run_id: $(BASE_RUN_ID) - stage: dynamicres_gradual - condition: succeededOrFailed() + condition: false displayName: 'Dynamic Gradual' dependsOn: - generate_base_run_id From 7aaf30b895f5fb08431ab61554a3d4d2ef43dd1b Mon Sep 17 00:00:00 2001 From: shreyas Date: Thu, 29 Jan 2026 21:58:23 +0000 
Subject: [PATCH 3/7] Adding canada central --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 02a472ba86..ea17590c0d 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -7,7 +7,7 @@ variables: AZURE_SERVICE_CONNECTION: "Azure-for-Telescope-internal" AZURE_STORAGE_ACCOUNT_NAME: "akstelescope" AZURE_TELESCOPE_STORAGE_ACCOUNT_NAME: "telescopedata" - LOCATION: "uksouth" + LOCATION: "canadacentral" CREATESWIFTV2PING: "true" CLEANUP_RESOURCES: "true" K8S_VERSION: "1.33" From 1afb281e6e76b6d502a346685231a3718bac9eed Mon Sep 17 00:00:00 2001 From: shreyas Date: Wed, 4 Feb 2026 18:07:17 +0000 Subject: [PATCH 4/7] Adding 25 pods per step --- .../system/matrices/swiftv2-staticres-gradual-matrix.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml b/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml index 4cf0f367cf..81106ce916 100644 --- a/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml +++ b/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml @@ -22,12 +22,12 @@ parameters: - name: matrix type: object default: - pps=50_ppp=1_pods=14000_nodes=2000: + pps=25_ppp=1_pods=14000_nodes=2000: node_count: 2000 - pods_per_step: 50 + pods_per_step: 25 pods_per_pni: 1 test_timeout: "2m" - test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=14000_Nodes=2000 + test_type: Gradual_StaticRes_PPS=25_PPP=1_Pods=14000_Nodes=2000 cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml jobs: From 29f96a4cd3a3f6e2998c62ebd51b97bbe2662172 Mon Sep 17 00:00:00 2001 From: shreyas Date: Wed, 4 Feb 2026 22:57:51 +0000 Subject: [PATCH 5/7] increasing max pods --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index ea17590c0d..cd25d6c963 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -37,7 +37,7 @@ variables: cpu_per_node: 16 # max_pods = PODS_PER_NODE + default k8s system pods (5) + log analytics pod (1 if ENABLE_LOG_ANALYTICS=true) # Current: 7 + 5 = 12 (increase to 13 if enabling Log Analytics) - max_pods: 12 + max_pods: 13 # Common matrix parameters repeats: 1 node_label: "swiftv2slo=true" From 0260153d1444ab00fb711e2f1829a52a65740d2b Mon Sep 17 00:00:00 2001 From: shreyas Date: Mon, 9 Feb 2026 18:12:43 +0000 Subject: [PATCH 6/7] Clean up flag as false --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index cd25d6c963..45e7f33c81 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -9,7 +9,7 @@ variables: AZURE_TELESCOPE_STORAGE_ACCOUNT_NAME: "telescopedata" LOCATION: "canadacentral" CREATESWIFTV2PING: "true" - CLEANUP_RESOURCES: "true" + CLEANUP_RESOURCES: "false" K8S_VERSION: "1.33" PROVISION_BUFFER_NODES: "false" # Log Analytics are expensive - disable and delete after use From bc503a20fb8604ff868945911ca52b42ca8307a9 Mon Sep 17 00:00:00 2001 From: shreyas Date: Mon, 9 Feb 2026 19:13:24 +0000 Subject: [PATCH 7/7] Scaling to 3k nodes --- pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml b/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml index 81106ce916..dfc0950047 100644 --- a/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml +++ b/pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml @@ -23,7 +23,7 @@ parameters: type: object default: pps=25_ppp=1_pods=14000_nodes=2000: - 
node_count: 2000 + node_count: 3000 pods_per_step: 25 pods_per_pni: 1 test_timeout: "2m"