Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 3 additions & 45 deletions pipelines/system/matrices/swiftv2-staticres-gradual-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,54 +22,12 @@ parameters:
- name: matrix
type: object
default:
pps=35_ppp=1_pods=70_nodes=10:
node_count: 10
pods_per_step: 35
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=35_PPP=1_Pods=70_Nodes=10
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml
pps=50_ppp=1_pods=3500_nodes=500:
node_count: 500
pods_per_step: 50
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=3500_Nodes=500
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml
pps=50_ppp=1_pods=7000_nodes=1000:
node_count: 1000
pods_per_step: 50
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=7000_Nodes=1000
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml
pps=50_ppp=1_pods=14000_nodes=2000:
node_count: 2000
pods_per_step: 50
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=14000_Nodes=2000
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml
pps=50_ppp=1_pods=17500_nodes=2500:
node_count: 2500
pods_per_step: 50
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=17500_Nodes=2500
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml
pps=50_ppp=1_pods=21000_nodes=3000:
pps=25_ppp=1_pods=14000_nodes=2000:
node_count: 3000
pods_per_step: 50
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=21000_Nodes=3000
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml
pps=50_ppp=1_pods=35000_nodes=5000:
node_count: 5000
pods_per_step: 50
pods_per_step: 25
pods_per_pni: 1
test_timeout: "2m"
test_type: Gradual_StaticRes_PPS=50_PPP=1_Pods=35000_Nodes=5000
test_type: Gradual_StaticRes_PPS=25_PPP=1_Pods=14000_Nodes=2000
cl2_config_file: swiftv2_deployment_staticres_scale_config.yaml

jobs:
Expand Down
8 changes: 4 additions & 4 deletions pipelines/system/new-pipeline-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ variables:
AZURE_SERVICE_CONNECTION: "Azure-for-Telescope-internal"
AZURE_STORAGE_ACCOUNT_NAME: "akstelescope"
AZURE_TELESCOPE_STORAGE_ACCOUNT_NAME: "telescopedata"
LOCATION: "uksouth"
LOCATION: "canadacentral"
CREATESWIFTV2PING: "true"
CLEANUP_RESOURCES: "true"
CLEANUP_RESOURCES: "false"
K8S_VERSION: "1.33"
PROVISION_BUFFER_NODES: "false"
# Log Analytics are expensive - disable and delete after use
Expand Down Expand Up @@ -37,7 +37,7 @@ variables:
cpu_per_node: 16
# max_pods = PODS_PER_NODE + default k8s system pods (5) + log analytics pod (1 if ENABLE_LOG_ANALYTICS=true)
# Current: 7 + 5 + 1 = 13 (Log Analytics pod included; use 12 if Log Analytics is disabled)
max_pods: 12
max_pods: 13
# Common matrix parameters
repeats: 1
node_label: "swiftv2slo=true"
Expand Down Expand Up @@ -101,7 +101,7 @@ stages:
base_run_id: $(BASE_RUN_ID)

- stage: dynamicres_gradual
condition: succeededOrFailed()
condition: false
displayName: 'Dynamic Gradual'
dependsOn:
- generate_base_run_id
Expand Down
66 changes: 54 additions & 12 deletions swiftv2kubeobjects/createclusterforping.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ create_and_verify_nodepool() {
nodepool_cmd+=" --node-taints \"${taints}\""
fi

# Set max-surge to 1 node instead of percentage
nodepool_cmd+=" --max-surge 1"

# Add any extra arguments
if [[ -n "$extra_args" ]]; then
nodepool_cmd+=" ${extra_args}"
Expand Down Expand Up @@ -320,16 +323,42 @@ vnetName="net"
vnetAddressSpaceCIDR="10.0.0.0/9"

az network vnet create -n ${vnetName} -g ${RG} --address-prefixes ${vnetAddressSpaceCIDR} -l ${LOCATION}
echo "create vnet subnet"
vnetSubnetNameNodes="nodes"
echo "create vnet subnets"

# Pod subnet (shared across all pools)
vnetSubnetNamePods="pods"
vnetSubnetNodesCIDR="10.0.0.0/16"
vnetSubnetPodsCIDR="10.1.0.0/16"
podCIDR="10.128.0.0/9"
natGatewayID=$(az network nat gateway list -g ${RG} | jq -r '.[].id')
az network vnet subnet create -n ${vnetSubnetNameNodes} --vnet-name ${vnetName} --address-prefixes ${vnetSubnetNodesCIDR} --nat-gateway ${natGatewayID} --default-outbound-access false -g ${RG}

# Create shared pod subnet
az network vnet subnet create -n ${vnetSubnetNamePods} --vnet-name ${vnetName} --address-prefixes ${vnetSubnetPodsCIDR} --nat-gateway $NAT_GW_NAME --default-outbound-access false -g ${RG}

# =============================================================================
# NODE SUBNETS: Each nodepool gets its own /20 subnet (4096 addresses; ~4091 usable after Azure's 5 reserved IPs per subnet)
# Layout within 10.0.0.0/16:
# - 10.0.0.0/20 : system pool (nodes-system)
# - 10.0.16.0/20 : userpool1 (nodes-userpool1)
# - 10.0.32.0/20 : userpool2 (nodes-userpool2)
# - ... up to 10.0.128.0/20 : userpool8
# - 10.0.144.0/20 : buffer pool (nodes-buffer) — index USER_NODEPOOL_COUNT+1; 144 assumes 8 user pools
# =============================================================================

# Helper function to create node subnet with calculated CIDR
#######################################
# Create a node subnet with a CIDR calculated from its index.
# Each subnet is a /20 carved out of the 10.0.0.0/16 node range:
# third octet = subnet_index * 16 (0 -> 10.0.0.0/20, 1 -> 10.0.16.0/20, ...).
# Globals:   vnetName, natGatewayID, RG (read)
# Arguments: $1 - subnet name
#            $2 - subnet index: 0=system, 1..N=userpools, N+1=buffer
# Returns:   non-zero if the index is out of range or the az call fails
#######################################
create_node_subnet() {
  local subnet_name=$1
  local subnet_index=$2   # 0=system, 1..N=userpools, N+1=buffer

  # NOTE(review): are we assuming at most 8 user pools? With 500 nodes per
  # user pool, 5000 nodes needs 10 pools — confirm there is enough subnet
  # space in the third octet. (Indices 0-15 fit inside 10.0.0.0/16; index 16
  # would yield the invalid CIDR 10.0.256.0/20, hence the guard below.)
  if (( subnet_index < 0 || subnet_index > 15 )); then
    echo "ERROR: subnet index ${subnet_index} out of range 0-15; 10.0.0.0/16 only holds 16 /20 subnets" >&2
    return 1
  fi

  local third_octet=$((subnet_index * 16))
  local subnet_cidr="10.0.${third_octet}.0/20"

  echo "Creating node subnet: ${subnet_name} with CIDR: ${subnet_cidr}"
  az network vnet subnet create -n "${subnet_name}" --vnet-name "${vnetName}" --address-prefixes "${subnet_cidr}" --nat-gateway "${natGatewayID}" --default-outbound-access false -g "${RG}"
}

# Create system pool node subnet (index 0)
vnetSubnetNameNodesSystem="nodes-system"
create_node_subnet "${vnetSubnetNameNodesSystem}" 0

# az role assignment create --assignee d0fdeb79-ee9b-464c-ae0f-ba72d307208d --role "Network Contributor" --scope /subscriptions/${SUBSCRIPTION}/resourceGroups/$RG/providers/Microsoft.Network/virtualNetworks/$vnetName
# set az account to subnetDelegator
az account set --subscription $SD_SUB
Expand All @@ -344,11 +373,11 @@ az account set --subscription $SUBSCRIPTION
# create cluster
echo "create cluster"
vnetID=$(az network vnet list -g ${RG} | jq -r '.[].id')
nodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${vnetSubnetNameNodes}']" | jq -r '.[].id')
systemNodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${vnetSubnetNameNodesSystem}']" | jq -r '.[].id')
podSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${vnetSubnetNamePods}']" | jq -r '.[].id')

# Call the function to create the cluster (ACR permissions will be automatically configured)
create_aks_cluster "${CLUSTER}" "${RG}" "${LOCATION}" "${nodeSubnetID}" "${podSubnetID}"
create_aks_cluster "${CLUSTER}" "${RG}" "${LOCATION}" "${systemNodeSubnetID}" "${podSubnetID}"

# Wait for cluster to be ready with retry logic and timeout
echo "Waiting for cluster to be ready..."
Expand Down Expand Up @@ -447,10 +476,16 @@ for i in $(seq 1 $USER_NODEPOOL_COUNT); do
fi

pool_name="userpool${i}"

# Create dedicated node subnet for this user pool (index i corresponds to userpool number)
userPoolSubnetName="nodes-${pool_name}"
create_node_subnet "${userPoolSubnetName}" ${i}
userPoolNodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${userPoolSubnetName}']" | jq -r '.[].id')

labels="slo=true testscenario=swiftv2 agentpool=${pool_name}"
taints="slo=true:NoSchedule"
echo "Creating user nodepool $pool_name (1/${USER_NODEPOOL_COUNT} initial node)"
if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${nodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then
echo "Creating user nodepool $pool_name (1/${USER_NODEPOOL_COUNT} initial node) with subnet ${userPoolSubnetName}"
if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${userPoolNodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then
echo "ERROR: Failed to create user nodepool ${pool_name}"
exit 1
fi
Expand All @@ -464,10 +499,17 @@ else
# The scale-cluster.sh script will scale userpoolBuffer to handle any shortfall

echo "Creating buffer nodepool with $INITIAL_USER_NODES node (will be scaled later if needed)..."
pool_name="userpoolBuffer"
labels="slo=true testscenario=swiftv2 agentpool=${pool_name}"
taints="slo=true:NoSchedule"
if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${nodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then
pool_name="userpoolBuffer"

# Create dedicated node subnet for buffer pool (index = USER_NODEPOOL_COUNT + 1 to follow user pools)
bufferSubnetIndex=$((USER_NODEPOOL_COUNT + 1))
bufferPoolSubnetName="nodes-buffer"
create_node_subnet "${bufferPoolSubnetName}" ${bufferSubnetIndex}
bufferPoolNodeSubnetID=$(az network vnet subnet list -g ${RG} --vnet-name ${vnetName} --query "[?name=='${bufferPoolSubnetName}']" | jq -r '.[].id')

labels="slo=true testscenario=swiftv2 agentpool=${pool_name}"
taints="slo=true:NoSchedule"
if ! create_and_verify_nodepool "${CLUSTER}" "${pool_name}" "${RG}" "${INITIAL_USER_NODES}" "${VM_SKU}" "${bufferPoolNodeSubnetID}" "${podSubnetID}" "${labels}" "${taints}"; then
echo "ERROR: Failed to create buffer nodepool"
exit 1
fi
Expand Down