From 837481be542edd883ea62bdb81c30157f5909678 Mon Sep 17 00:00:00 2001 From: Sumanth Reddy Chinna Pullaiah Date: Tue, 21 Jan 2025 20:07:18 +0000 Subject: [PATCH 1/3] cas 200 nodes --- ...autoscaler-benchmark-nodes200-pods2000.yml | 46 +++++++++++++++++++ .../terraform-inputs/azure.tfvars | 44 ++++++++++++++++++ .../terraform-test-inputs/azure.json | 4 ++ 3 files changed, 94 insertions(+) create mode 100644 pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml create mode 100644 scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/cas-c4n200p2000 /terraform-test-inputs/azure.json diff --git a/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml b/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml new file mode 100644 index 0000000000..58322d5288 --- /dev/null +++ b/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml @@ -0,0 +1,46 @@ +trigger: none +schedules: + - cron: "0 */12 * * *" + displayName: "Every 12 Hour" + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: cas-c4n200p2k + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: cluster-autoscaler + matrix: + azure_cni: + cpu_per_node: 4 + node_count: 200 + node_per_step: 20 + pod_count: 2000 + max_pods: 110 + scale_up_timeout: "15m" + scale_down_timeout: "15m" + node_label_selector: "cas = dedicated" + node_selector: "{cas: dedicated}" + loop_count: 5 + cilium_enabled: False + cl2_config_file: cluster-scale-config.yaml + service_test: False + max_parallel: 1 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars b/scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..696000c394 --- /dev/null +++ b/scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars @@ -0,0 +1,44 @@ +scenario_type = "perf-eval" +scenario_name = "cas-c4n200p2000" +deletion_delay = "2h" +owner = "aks" + +aks_config_list = [ + { + role = "cas" + aks_name = "cas" + dns_prefix = "cas" + subnet_name = "aks-network" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + } + default_node_pool = { + name = "default" + node_count = 5 + auto_scaling_enabled = false + vm_size = "Standard_D8_v3" + os_disk_type = "Managed" + only_critical_addons_enabled = false + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "userpool" + node_count = 0 + min_count = 0 + max_count = 200 + auto_scaling_enabled = true + vm_size = "Standard_D4_v3" + max_pods = 110 + node_labels = { "cas" = "dedicated" } + } + ] + kubernetes_version = "1.31" + auto_scaler_profile = { + scale_down_delay_after_add = "0m" + scale_down_unneeded = "0m" + } + } +] diff --git a/scenarios/perf-eval/cas-c4n200p2000 /terraform-test-inputs/azure.json b/scenarios/perf-eval/cas-c4n200p2000 /terraform-test-inputs/azure.json new file mode 100644 index 0000000000..ea27a572c6 --- /dev/null +++ b/scenarios/perf-eval/cas-c4n200p2000 /terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} From 9f1a258359ede6fdc23a8a5c4f56f3ceba8e2898 Mon Sep 17 00:00:00 2001 From: Sumanth Reddy Chinna Pullaiah Date: Wed, 22 Jan 2025 19:45:23 +0000 Subject: [PATCH 2/3] update pods --- .../clusterloader2/cri/config/config.yaml | 2 + .../cri/config/deployment_template.yaml | 5 ++ modules/python/clusterloader2/cri/cri.py | 18 +++-- ...autoscaler-benchmark-nodes200-pods200.yml} | 21 +++--- .../azurelinux-resource-consume.yml | 68 +++++++++++++++++++ .../CRI Benchmark/cri-resource-consume.yml | 58 +++++++++++++--- .../terraform-inputs/azure.tfvars | 67 ++++++++++++++++++ .../terraform-test-inputs/azure.json | 0 .../terraform-inputs/azure.tfvars | 8 +-- .../terraform-test-inputs/azure.json | 4 ++ steps/engine/clusterloader2/cri/collect.yml | 2 +- steps/engine/clusterloader2/cri/execute.yml | 3 +- steps/terraform/set-input-variables-azure.yml | 4 +- 13 files changed, 227 insertions(+), 33 deletions(-) rename pipelines/perf-eval/Autoscale Benchmark/{cluster-autoscaler-benchmark-nodes200-pods2000.yml => cluster-autoscaler-benchmark-nodes200-pods200.yml} (63%) create mode 100644 pipelines/perf-eval/CRI Benchmark/azurelinux-resource-consume.yml create mode 100644 scenarios/perf-eval/azurelinux-resource-consume/terraform-inputs/azure.tfvars rename scenarios/perf-eval/{cas-c4n200p2000 => azurelinux-resource-consume}/terraform-test-inputs/azure.json (100%) rename scenarios/perf-eval/{cas-c4n200p2000 => cas-c4n200p200}/terraform-inputs/azure.tfvars (86%) create mode 100644 scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json diff --git a/modules/python/clusterloader2/cri/config/config.yaml b/modules/python/clusterloader2/cri/config/config.yaml index 5f0ddf9110..464cd5ddd5 100644 --- a/modules/python/clusterloader2/cri/config/config.yaml +++ b/modules/python/clusterloader2/cri/config/config.yaml @@ -10,6 +10,7 @@ name: resource-consumer {{$agentPoolPrefix := DefaultParam .CL2_AGENTPOOL_PREFIX "userpool"}} {{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}} {{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}} +{{$loadType := DefaultParam .CL2_LOAD_TYPE "memory"}} namespace: number: 1 @@ -67,6 +68,7 @@ steps: MemoryRequest: {{$memoryKi}} CPURequest: {{$cpu}}m AgentPool: {{$agentPoolPrefix}}{{$i}} + LoadType: {{$loadType}} - name: Waiting for latency pods to be running measurements: diff --git a/modules/python/clusterloader2/cri/config/deployment_template.yaml b/modules/python/clusterloader2/cri/config/deployment_template.yaml index 0764e9a568..8a26b19109 100644 --- a/modules/python/clusterloader2/cri/config/deployment_template.yaml +++ b/modules/python/clusterloader2/cri/config/deployment_template.yaml @@ -3,6 +3,7 @@ {{$MemoryRequest := DefaultParam .MemoryRequest "1000Ki"}} {{$CPURequest := DefaultParam .CPURequest "100m"}} {{$AgentPool := DefaultParam .AgentPool "userpool1"}} +{{$LoadType := DefaultParam .LoadType "memory"}} apiVersion: apps/v1 kind: Deployment @@ -24,6 +25,7 @@ spec: nodeSelector: agentpool: {{$AgentPool}} containers: +{{if eq $LoadType "memory"}} - name: resource-consumer-memory image: registry.k8s.io/e2e-test-images/resource-consumer:1.9 command: @@ -40,6 +42,8 @@ spec: resources: requests: memory: {{$MemoryRequest}} +{{end}} +{{if eq $LoadType "cpu"}} - name: resource-consumer-cpu image: registry.k8s.io/e2e-test-images/resource-consumer:1.9 command: @@ -50,6 +54,7 @@ spec: resources: requests: cpu: {{$CPURequest}} +{{end}} tolerations: - key: "cri-resource-consume" operator: "Equal" diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index faba58f93d..af8ed22cc4 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -11,6 +11,7 @@ "aws": 2, "aks": 6 } +MEMORY_SCALE_FACTOR = 0.95 # 95% of the total allocatable memory to account for error margin def _get_daemonsets_pods_allocated_resources(client, node_name): pods = client.get_pods_by_namespace("kube-system", field_selector=f"spec.nodeName={node_name}") @@ -23,7 +24,7 @@ def _get_daemonsets_pods_allocated_resources(client, node_name): memory_request += int(container.resources.requests.get("memory", "0Mi").replace("Mi", "")) return cpu_request, memory_request * 1024 -def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, provider, override_file): +def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, load_type, provider, override_file): client = KubernetesClient(os.path.expanduser("~/.kube/config")) nodes = client.get_nodes(label_selector="cri-resource-consume=true") if len(nodes) == 0: @@ -48,7 +49,7 @@ def override_config_clusterloader2(node_count, max_pods, repeats, operation_time pod_count = max_pods - DAEMONSETS_PER_NODE_MAP[provider] replica = pod_count * node_count cpu_request = cpu_value // pod_count - memory_request_in_Ki = math.ceil(memory_value // pod_count - 20) + memory_request_in_Ki = math.ceil(memory_value * MEMORY_SCALE_FACTOR // pod_count) memory_request_in_K = int(memory_request_in_Ki // 1.024) print(f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request_in_K}K, total replica: {replica}") @@ -65,6 +66,7 @@ def override_config_clusterloader2(node_count, max_pods, repeats, operation_time file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n") file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n") file.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n") + file.write(f"CL2_LOAD_TYPE: {load_type}\n") file.close() @@ -75,6 +77,7 @@ def collect_clusterloader2( node_count, max_pods, repeats, + load_type, cl2_report_dir, cloud_info, run_id, @@ -95,6 +98,7 @@ def collect_clusterloader2( "node_count": node_count, "max_pods": max_pods, "churn_rate": repeats, + "load_type": load_type, "status": status, "group": None, "measurement": None, @@ -137,6 +141,8 @@ def main(): parser_override.add_argument("max_pods", type=int, help="Number of maximum pods per node") parser_override.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment") parser_override.add_argument("operation_timeout", type=str, default="2m", help="Operation timeout") + parser_override.add_argument("load_type", type=str, choices=["memory", "cpu"], + default="memory", help="Type of load to generate") parser_override.add_argument("provider", type=str, help="Cloud provider name") parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file") @@ -153,6 +159,8 @@ def main(): parser_collect.add_argument("node_count", type=int, help="Number of nodes") parser_collect.add_argument("max_pods", type=int, help="Number of maximum pods per node") parser_collect.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment") + parser_collect.add_argument("load_type", type=str, choices=["memory", "cpu"], + default="memory", help="Type of load to generate") parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") parser_collect.add_argument("cloud_info", type=str, help="Cloud information") parser_collect.add_argument("run_id", type=str, help="Run ID") @@ -162,11 +170,13 @@ def main(): args = parser.parse_args() if args.command == "override": - override_config_clusterloader2(args.node_count, args.max_pods, args.repeats, args.operation_timeout, args.provider, args.cl2_override_file) + override_config_clusterloader2(args.node_count, args.max_pods, args.repeats, args.operation_timeout, args.load_type, + args.provider, args.cl2_override_file) elif args.command == "execute": execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider) elif args.command == "collect": - collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file) + collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.load_type, + args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file) if __name__ == "__main__": main() \ No newline at end of file diff --git a/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml b/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml similarity index 63% rename from pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml rename to pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml index 58322d5288..d9b51955e6 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods2000.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml @@ -1,7 +1,7 @@ trigger: none schedules: - - cron: "0 */12 * * *" - displayName: "Every 12 Hour" + - cron: "0 */4 * * *" + displayName: "Every 4 Hour" branches: include: - main @@ -9,7 +9,7 @@ schedules: variables: SCENARIO_TYPE: perf-eval - SCENARIO_NAME: cas-c4n200p2k + SCENARIO_NAME: cas-c4n200p200 SCENARIO_VERSION: main stages: @@ -23,24 +23,19 @@ stages: - eastus2 engine: clusterloader2 engine_input: - image: "ghcr.io/azure/clusterloader2:v20241022" + image: "ghcr.io/azure/clusterloader2:v20241002" topology: cluster-autoscaler matrix: - azure_cni: + c4-n200-p200: cpu_per_node: 4 node_count: 200 - node_per_step: 20 - pod_count: 2000 - max_pods: 110 + pod_count: 200 scale_up_timeout: "15m" scale_down_timeout: "15m" node_label_selector: "cas = dedicated" node_selector: "{cas: dedicated}" - loop_count: 5 - cilium_enabled: False - cl2_config_file: cluster-scale-config.yaml - service_test: False + loop_count: 3 max_parallel: 1 - timeout_in_minutes: 720 + timeout_in_minutes: 360 credential_type: service_connection ssh_key_enabled: false diff --git a/pipelines/perf-eval/CRI Benchmark/azurelinux-resource-consume.yml b/pipelines/perf-eval/CRI Benchmark/azurelinux-resource-consume.yml new file mode 100644 index 0000000000..09667bad5a --- /dev/null +++ b/pipelines/perf-eval/CRI Benchmark/azurelinux-resource-consume.yml @@ -0,0 +1,68 @@ +trigger: none +schedules: + - cron: "0 */4 * * *" + displayName: "Every 4 Hour" + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: azurelinux-resource-consume + SCENARIO_VERSION: main + +stages: + - stage: azurelinux_v3_westeurope + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - westeurope + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241016" + topology: cri-resource-consume + matrix: + n10-p300-memory: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory + n10-p700-memory: + node_count: 10 + max_pods: 70 + repeats: 1 + operation_timeout: 7m + load_type: memory + n10-p1100-memory: + node_count: 10 + max_pods: 110 + repeats: 1 + operation_timeout: 11m + load_type: memory + n10-p300-cpu: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: cpu + n10-p700-cpu: + node_count: 10 + max_pods: 70 + repeats: 1 + operation_timeout: 7m + load_type: cpu + n10-p1100-cpu: + node_count: 10 + max_pods: 110 + repeats: 1 + operation_timeout: 11m + load_type: cpu + max_parallel: 3 + timeout_in_minutes: 120 + credential_type: service_connection + ssh_key_enabled: false diff --git a/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml b/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml index 86794bf4e7..cd4c2129a6 100644 --- a/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml +++ b/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml @@ -1,7 +1,7 @@ trigger: none schedules: - - cron: "0 */12 * * *" - displayName: "Every 12 Hour" + - cron: "0 */4 * * *" + displayName: "Every 4 Hour" branches: include: - main @@ -26,21 +26,42 @@ stages: image: "ghcr.io/azure/clusterloader2:v20241016" topology: cri-resource-consume matrix: - n10-p300: + n10-p300-memory: node_count: 10 max_pods: 30 repeats: 1 operation_timeout: 3m - n10-p700: + load_type: memory + n10-p700-memory: node_count: 10 max_pods: 70 repeats: 1 operation_timeout: 7m - n10-p1100: + load_type: memory + n10-p1100-memory: node_count: 10 max_pods: 110 repeats: 1 operation_timeout: 11m + load_type: memory + n10-p300-cpu: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: cpu + n10-p700-cpu: + node_count: 10 + max_pods: 70 + repeats: 1 + operation_timeout: 7m + load_type: cpu + n10-p1100-cpu: + node_count: 10 + max_pods: 110 + repeats: 1 + operation_timeout: 11m + load_type: cpu max_parallel: 3 timeout_in_minutes: 120 credential_type: service_connection @@ -58,21 +79,42 @@ stages: image: "ghcr.io/azure/clusterloader2:v20241016" topology: cri-resource-consume matrix: - n10-p300: + n10-p300-memory: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory + n10-p700-memory: + node_count: 10 + max_pods: 70 + repeats: 1 + operation_timeout: 7m + load_type: memory + n10-p1100-memory: + node_count: 10 + max_pods: 110 + repeats: 1 + operation_timeout: 11m + load_type: memory + n10-p300-cpu: node_count: 10 max_pods: 30 repeats: 1 operation_timeout: 3m - n10-p700: + load_type: cpu + n10-p700-cpu: node_count: 10 max_pods: 70 repeats: 1 operation_timeout: 7m - n10-p1100: + load_type: cpu + n10-p1100-cpu: node_count: 10 max_pods: 110 repeats: 1 operation_timeout: 11m + load_type: cpu max_parallel: 3 timeout_in_minutes: 120 credential_type: service_connection diff --git a/scenarios/perf-eval/azurelinux-resource-consume/terraform-inputs/azure.tfvars b/scenarios/perf-eval/azurelinux-resource-consume/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..10b75e342a --- /dev/null +++ b/scenarios/perf-eval/azurelinux-resource-consume/terraform-inputs/azure.tfvars @@ -0,0 +1,67 @@ +scenario_type = "perf-eval" +scenario_name = "azurelinux-resource-consume" +deletion_delay = "2h" +owner = "aks" + +network_config_list = [ + { + role = "client" + vnet_name = "cri-vnet" + vnet_address_space = "10.0.0.0/9" + subnet = [ + { + name = "cri-subnet-1" + address_prefix = "10.0.0.0/16" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_config_list = [ + { + role = "client" + aks_name = "cri-resource-consume" + dns_prefix = "cri" + subnet_name = "cri-vnet" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + pod_cidr = "10.0.0.0/9" + service_cidr = "192.168.0.0/16" + dns_service_ip = "192.168.0.10" + } + default_node_pool = { + name = "default" + node_count = 3 + vm_size = "Standard_D16s_v3" + os_disk_type = "Managed" + os_sku = "AzureLinux" + only_critical_addons_enabled = true + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D16_v3" + os_sku = "AzureLinux" + node_labels = { "prometheus" = "true" } + }, + { + name = "userpool0" + node_count = 10 + auto_scaling_enabled = false + vm_size = "Standard_D16s_v3" + os_sku = "AzureLinux" + node_taints = ["cri-resource-consume=true:NoSchedule"] + node_labels = { "cri-resource-consume" = "true" } + } + ] + kubernetes_version = "1.31" + } +] diff --git a/scenarios/perf-eval/cas-c4n200p2000 /terraform-test-inputs/azure.json b/scenarios/perf-eval/azurelinux-resource-consume/terraform-test-inputs/azure.json similarity index 100% rename from scenarios/perf-eval/cas-c4n200p2000 /terraform-test-inputs/azure.json rename to scenarios/perf-eval/azurelinux-resource-consume/terraform-test-inputs/azure.json diff --git a/scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars b/scenarios/perf-eval/cas-c4n200p200/terraform-inputs/azure.tfvars similarity index 86% rename from scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars rename to scenarios/perf-eval/cas-c4n200p200/terraform-inputs/azure.tfvars index 696000c394..4e643468d7 100644 --- a/scenarios/perf-eval/cas-c4n200p2000 /terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/cas-c4n200p200/terraform-inputs/azure.tfvars @@ -1,12 +1,12 @@ scenario_type = "perf-eval" -scenario_name = "cas-c4n200p2000" +scenario_name = "cas-c4n200p200" deletion_delay = "2h" owner = "aks" aks_config_list = [ { role = "cas" - aks_name = "cas" + aks_name = "cas-c4n200p200" dns_prefix = "cas" subnet_name = "aks-network" sku_tier = "Standard" @@ -18,7 +18,7 @@ aks_config_list = [ name = "default" node_count = 5 auto_scaling_enabled = false - vm_size = "Standard_D8_v3" + vm_size = "Standard_D4_v3" os_disk_type = "Managed" only_critical_addons_enabled = false temporary_name_for_rotation = "defaulttmp" @@ -30,7 +30,7 @@ aks_config_list = [ min_count = 0 max_count = 200 auto_scaling_enabled = true - vm_size = "Standard_D4_v3" + vm_size = "Standard_B4ps_v2" max_pods = 110 node_labels = { "cas" = "dedicated" } } diff --git a/scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json b/scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json new file mode 100644 index 0000000000..ea27a572c6 --- /dev/null +++ b/scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} diff --git a/steps/engine/clusterloader2/cri/collect.yml b/steps/engine/clusterloader2/cri/collect.yml index 2f404fea50..663204f80a 100644 --- a/steps/engine/clusterloader2/cri/collect.yml +++ b/steps/engine/clusterloader2/cri/collect.yml @@ -16,7 +16,7 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ - $NODE_COUNT $MAX_PODS $REPEATS \ + $NODE_COUNT $MAX_PODS $REPEATS $LOAD_TYPE \ $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE workingDirectory: modules/python/clusterloader2 env: diff --git a/steps/engine/clusterloader2/cri/execute.yml b/steps/engine/clusterloader2/cri/execute.yml index 54e1ce8c61..a02ba46dcf 100644 --- a/steps/engine/clusterloader2/cri/execute.yml +++ b/steps/engine/clusterloader2/cri/execute.yml @@ -13,7 +13,8 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \ - $NODE_COUNT $MAX_PODS $REPEATS $OPERATION_TIMEOUT $CLOUD ${CL2_CONFIG_DIR}/overrides.yaml + $NODE_COUNT $MAX_PODS $REPEATS $OPERATION_TIMEOUT $LOAD_TYPE \ + $CLOUD ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD workingDirectory: modules/python/clusterloader2 diff --git a/steps/terraform/set-input-variables-azure.yml b/steps/terraform/set-input-variables-azure.yml index 6f5fb3f82c..cd3c8573c3 100644 --- a/steps/terraform/set-input-variables-azure.yml +++ b/steps/terraform/set-input-variables-azure.yml @@ -19,10 +19,10 @@ steps: SYSTEM_NODE_POOL=${SYSTEM_NODE_POOL:-null} USER_NODE_POOL=${USER_NODE_POOL:-null} - if [ -z "$(AKS_CLI_CUSTOM_HEADERS)" ]; then + if [ -z "$AKS_CLI_CUSTOM_HEADERS" ]; then AKS_CUSTOM_HEADERS='[]' else - IFS=', ' read -r -a aks_custom_headers_array <<< "$(AKS_CLI_CUSTOM_HEADERS)" + IFS=', ' read -r -a aks_custom_headers_array <<< "$AKS_CLI_CUSTOM_HEADERS" AKS_CUSTOM_HEADERS=$(printf '%s\n' "${aks_custom_headers_array[@]}" | jq -R . | jq -s .) fi From 9c18f66ee2cb2693c2c05c19e0171020020028fc Mon Sep 17 00:00:00 2001 From: Sumanth Reddy Chinna Pullaiah Date: Wed, 22 Jan 2025 20:22:48 +0000 Subject: [PATCH 3/3] update config --- ...-autoscaler-benchmark-nodes200-pods200.yml | 40 +++++-- .../terraform-inputs/aws.tfvars | 103 ++++++++++++++++++ .../terraform-inputs/azure.tfvars | 6 +- .../terraform-test-inputs/aws.json | 4 + .../terraform-test-inputs/azure.json | 2 +- 5 files changed, 144 insertions(+), 11 deletions(-) create mode 100644 scenarios/perf-eval/cas-c2n200p200/terraform-inputs/aws.tfvars rename scenarios/perf-eval/{cas-c4n200p200 => cas-c2n200p200}/terraform-inputs/azure.tfvars (90%) create mode 100644 scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/aws.json rename scenarios/perf-eval/{cas-c4n200p200 => cas-c2n200p200}/terraform-test-inputs/azure.json (53%) diff --git a/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml b/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml index d9b51955e6..9b83c1c05f 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/cluster-autoscaler-benchmark-nodes200-pods200.yml @@ -1,7 +1,7 @@ trigger: none schedules: - - cron: "0 */4 * * *" - displayName: "Every 4 Hour" + - cron: "0 16 * * *" + displayName: "Every day at 4:00 PM" branches: include: - main @@ -9,7 +9,7 @@ schedules: variables: SCENARIO_TYPE: perf-eval - SCENARIO_NAME: cas-c4n200p200 + SCENARIO_NAME: cas-c2n200p200 SCENARIO_VERSION: main stages: @@ -26,16 +26,42 @@ stages: image: "ghcr.io/azure/clusterloader2:v20241002" topology: cluster-autoscaler matrix: - c4-n200-p200: - cpu_per_node: 4 + c2-n200-p200: + cpu_per_node: 2 node_count: 200 pod_count: 200 scale_up_timeout: "15m" scale_down_timeout: "15m" node_label_selector: "cas = dedicated" node_selector: "{cas: dedicated}" - loop_count: 3 + loop_count: 1 max_parallel: 1 - timeout_in_minutes: 360 + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false + - stage: aws_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: aws + regions: + - us-east-2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241002" + topology: cluster-autoscaler + matrix: + c2-n200-p200: + cpu_per_node: 2 + node_count: 200 + pod_count: 200 + scale_up_timeout: "15m" + scale_down_timeout: "15m" + node_label_selector: "cas = dedicated" + node_selector: "{cas: dedicated}" + loop_count: 1 + max_parallel: 1 + timeout_in_minutes: 180 credential_type: service_connection ssh_key_enabled: false diff --git a/scenarios/perf-eval/cas-c2n200p200/terraform-inputs/aws.tfvars b/scenarios/perf-eval/cas-c2n200p200/terraform-inputs/aws.tfvars new file mode 100644 index 0000000000..808e58c2c5 --- /dev/null +++ b/scenarios/perf-eval/cas-c2n200p200/terraform-inputs/aws.tfvars @@ -0,0 +1,103 @@ +scenario_type = "perf-eval" +scenario_name = "cas-c2n200p200" +deletion_delay = "2h" +owner = "aks" + +network_config_list = [ + { + role = "cas" + vpc_name = "cas-vpc" + vpc_cidr_block = "10.0.0.0/16" + subnet = [ + { + name = "cas-subnet" + cidr_block = "10.0.32.0/19" + zone_suffix = "a" + map_public_ip_on_launch = true + }, + { + name = "cas-subnet-2" + cidr_block = "10.0.64.0/19" + zone_suffix = "b" + map_public_ip_on_launch = true + }, + { + name = "cas-subnet-3" + cidr_block = "10.0.96.0/19" + zone_suffix = "c" + map_public_ip_on_launch = true + } + ] + security_group_name = "cas-sg" + route_tables = [ + { + name = "internet-rt" + cidr_block = "0.0.0.0/0" + } + ], + route_table_associations = [ + { + name = "cas-subnet-rt-assoc" + subnet_name = "cas-subnet" + route_table_name = "internet-rt" + }, + { + name = "cas-subnet-rt-assoc-2" + subnet_name = "cas-subnet-2" + route_table_name = "internet-rt" + }, + { + name = "cas-subnet-rt-assoc-3" + subnet_name = "cas-subnet-3" + route_table_name = "internet-rt" + } + ] + sg_rules = { + ingress = [] + egress = [ + { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_block = "0.0.0.0/0" + } + ] + } + } +] + +eks_config_list = [{ + role = "cas" + eks_name = "cas-c2n200p200" + enable_cluster_autoscaler = true + vpc_name = "cas-vpc" + policy_arns = ["AmazonEKSClusterPolicy", "AmazonEKSVPCResourceController", "AmazonEKSWorkerNodePolicy", "AmazonEKS_CNI_Policy", "AmazonEC2ContainerRegistryReadOnly", "AmazonSSMManagedInstanceCore"] + eks_managed_node_groups = [ + { + name = "default" + ami_type = "AL2_x86_64" + instance_types = ["m4.large"] + min_size = 5 + max_size = 5 + desired_size = 5 + capacity_type = "ON_DEMAND" + }, + { + name = "userpool" + ami_type = "AL2_x86_64" + instance_types = ["m6i.large"] + min_size = 0 + max_size = 200 + desired_size = 0 + capacity_type = "ON_DEMAND" + labels = { "cas" = "dedicated" } + taints = [] + } + ] + eks_addons = [] + kubernetes_version = "1.31" + auto_scaler_profile = { + scale_down_delay_after_add = "0m" + scale_down_unneeded = "0m" + } +}] diff --git a/scenarios/perf-eval/cas-c4n200p200/terraform-inputs/azure.tfvars b/scenarios/perf-eval/cas-c2n200p200/terraform-inputs/azure.tfvars similarity index 90% rename from scenarios/perf-eval/cas-c4n200p200/terraform-inputs/azure.tfvars rename to scenarios/perf-eval/cas-c2n200p200/terraform-inputs/azure.tfvars index 4e643468d7..9db1fe7e78 100644 --- a/scenarios/perf-eval/cas-c4n200p200/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/cas-c2n200p200/terraform-inputs/azure.tfvars @@ -1,12 +1,12 @@ scenario_type = "perf-eval" -scenario_name = "cas-c4n200p200" +scenario_name = "cas-c2n200p200" deletion_delay = "2h" owner = "aks" aks_config_list = [ { role = "cas" - aks_name = "cas-c4n200p200" + aks_name = "cas-c2n200p200" dns_prefix = "cas" subnet_name = "aks-network" sku_tier = "Standard" @@ -30,7 +30,7 @@ aks_config_list = [ min_count = 0 max_count = 200 auto_scaling_enabled = true - vm_size = "Standard_B4ps_v2" + vm_size = "Standard_D2_v5" max_pods = 110 node_labels = { "cas" = "dedicated" } } diff --git a/scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/aws.json b/scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/aws.json new file mode 100644 index 0000000000..cb30052b14 --- /dev/null +++ b/scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/aws.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "us-east-2" +} diff --git a/scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json b/scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/azure.json similarity index 53% rename from scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json rename to scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/azure.json index ea27a572c6..2229b1696b 100644 --- a/scenarios/perf-eval/cas-c4n200p200/terraform-test-inputs/azure.json +++ b/scenarios/perf-eval/cas-c2n200p200/terraform-test-inputs/azure.json @@ -1,4 +1,4 @@ { "run_id" : "123456789", - "region" : "eastus" + "region" : "eastus2" }