From 5854625c7129b5d68493a3333409afa20925905b Mon Sep 17 00:00:00 2001 From: vickylin Date: Thu, 12 Feb 2026 20:40:13 +1100 Subject: [PATCH 1/2] 1000n acr image pull 1000n acr image pull 1000n acr image pull 1000n acr image pull 1000n acr image pull 1000n test 1000n test fix desired node clean up test changes format and update test fix format fix format Revert new-pipeline-test.yml to match main --- modules/python/clusterloader2/cri/cri.py | 36 +++++++++-- modules/python/tests/test_cri.py | 32 +++++++++- .../ACR Benchmark/image-pull-n10.yml | 2 + .../ACR Benchmark/image-pull-n1000.yml | 40 ++++++++++++ .../perf-eval/image-pull-n1000/README.md | 47 ++++++++++++++ .../terraform-inputs/azure.tfvars | 64 +++++++++++++++++++ .../terraform-test-inputs/azure.json | 4 ++ steps/engine/clusterloader2/cri/execute.yml | 1 + .../clusterloader2/large-cluster/validate.yml | 2 +- .../image-pull/validate-resources.yml | 3 +- 10 files changed, 220 insertions(+), 11 deletions(-) create mode 100644 pipelines/perf-eval/ACR Benchmark/image-pull-n1000.yml create mode 100644 scenarios/perf-eval/image-pull-n1000/README.md create mode 100644 scenarios/perf-eval/image-pull-n1000/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/image-pull-n1000/terraform-test-inputs/azure.json diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index fd6d756c86..32ae3b7131 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -19,7 +19,7 @@ def override_config_clusterloader2( node_count, node_per_step, max_pods, repeats, operation_timeout, load_type, scale_enabled, pod_startup_latency_threshold, provider, - registry_endpoint, test_image, os_type, scrape_kubelets, scrape_containerd, containerd_scrape_interval, host_network, override_file): + registry_endpoint, test_image, os_type, scrape_kubelets, scrape_containerd, containerd_scrape_interval, host_network, override_file, 
memory_request_override=None): client = KubernetesClient(os.path.expanduser("~/.kube/config")) nodes = client.get_nodes(label_selector="cri-resource-consume=true") if len(nodes) == 0: @@ -53,12 +53,29 @@ def override_config_clusterloader2( logger.info(f"Node {node.metadata.name} has {daemonset_count} daemonset pods") pod_count = max_pods - daemonset_count cpu_request = cpu_value // pod_count - memory_request_in_ki = math.ceil(memory_value * MEMORY_SCALE_FACTOR // pod_count) - memory_request_in_k = int(memory_request_in_ki // 1.024) - memory_request_in_m = int(memory_request_in_k // 1000) - memory_request = ( - memory_request_in_m if os_type == "windows" else memory_request_in_k - ) + + # Use override if provided, otherwise calculate + if memory_request_override: + if memory_request_override.endswith("Mi"): + memory_request_in_ki = int(memory_request_override.replace("Mi", "")) * 1024 + elif memory_request_override.endswith("Gi"): + memory_request_in_ki = int(memory_request_override.replace("Gi", "")) * 1024 * 1024 + elif memory_request_override.endswith("Ki"): + memory_request_in_ki = int(memory_request_override.replace("Ki", "")) + else: + memory_request_in_ki = int(memory_request_override) + memory_request_in_k = int(memory_request_in_ki // 1.024) + memory_request_in_m = int(memory_request_in_k // 1000) + memory_request = memory_request_in_m if os_type == "windows" else memory_request_in_k + logger.info(f"Using memory request override: {memory_request_override}") + else: + memory_request_in_ki = math.ceil(memory_value * MEMORY_SCALE_FACTOR // pod_count) + memory_request_in_k = int(memory_request_in_ki // 1.024) + memory_request_in_m = int(memory_request_in_k // 1000) + memory_request = ( + memory_request_in_m if os_type == "windows" else memory_request_in_k + ) + logger.info( f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request}, " f"total pod per node: {pod_count}, os_type: {os_type}" @@ -301,10 +318,14 @@ def main(): 
"--registry_endpoint", type=str, help="Container registry endpoint" ) parser_override.add_argument( +<<<<<<< HEAD "--test_image", type=str, default="e2e-test-images/resource-consumer:1.13", help="Test image to pull (relative to registry endpoint)" +======= + "--memory_request_override", type=str, default=None, help="Override memory request per pod (e.g., 100Mi, 1Gi, 500Ki)" +>>>>>>> f0f10720 (1000n acr image pull) ) # Sub-command for execute_clusterloader2 @@ -405,6 +426,7 @@ def main(): args.containerd_scrape_interval, args.host_network, args.cl2_override_file, + args.memory_request_override, ) elif args.command == "execute": execute_clusterloader2( diff --git a/modules/python/tests/test_cri.py b/modules/python/tests/test_cri.py index bc95943f29..aba77c85de 100644 --- a/modules/python/tests/test_cri.py +++ b/modules/python/tests/test_cri.py @@ -249,7 +249,7 @@ def test_override_command(self, mock_override): with patch.object(sys, 'argv', test_args): main() mock_override.assert_called_once_with( - 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "20s", False, "/tmp/override.yaml" + 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "20s", False, "/tmp/override.yaml", None ) @patch("clusterloader2.cri.cri.override_config_clusterloader2") @@ -274,7 +274,35 @@ def test_override_command_default_host_network(self, mock_override): with patch.object(sys, 'argv', test_args): main() mock_override.assert_called_once_with( - 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "15s", True, "/tmp/override.yaml" + 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "15s", True, "/tmp/override.yaml", None + ) + + 
@patch("clusterloader2.cri.cri.override_config_clusterloader2") + def test_override_command_with_memory_request_override(self, mock_override): + test_args = [ + "main.py", "override", + "--node_count", "1000", + "--node_per_step", "1000", + "--max_pods", "7", + "--repeats", "1", + "--operation_timeout", "60m", + "--load_type", "memory", + "--scale_enabled", "False", + "--pod_startup_latency_threshold", "10m", + "--provider", "aks", + "--registry_endpoint", "acrperftestaue.azurecr-test.io", + "--os_type", "linux", + "--scrape_kubelets", "False", + "--scrape_containerd", "True", + "--containerd_scrape_interval", "30s", + "--host_network", "True", + "--cl2_override_file", "/tmp/override.yaml", + "--memory_request_override", "1000Mi" + ] + with patch.object(sys, 'argv', test_args): + main() + mock_override.assert_called_once_with( + 1000, 1000, 7, 1, "60m", "memory", False, "10m", "aks", "acrperftestaue.azurecr-test.io", "e2e-test-images/resource-consumer:1.13", "linux", False, True, "30s", True, "/tmp/override.yaml", "1000Mi" ) @patch("clusterloader2.cri.cri.execute_clusterloader2") diff --git a/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml index 5c5075a18b..8df530c47b 100644 --- a/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml +++ b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml @@ -26,6 +26,7 @@ stages: topology: image-pull matrix: image-pull-10pods-authenticated: + desired_nodes: 14 node_count: 10 max_pods: 26 repeats: 1 @@ -41,6 +42,7 @@ stages: pod_startup_latency_threshold: 600s anonymous_pull: False image-pull-10pods-anonymous: + desired_nodes: 14 node_count: 10 max_pods: 26 repeats: 1 diff --git a/pipelines/perf-eval/ACR Benchmark/image-pull-n1000.yml b/pipelines/perf-eval/ACR Benchmark/image-pull-n1000.yml new file mode 100644 index 0000000000..73c1b7696e --- /dev/null +++ b/pipelines/perf-eval/ACR Benchmark/image-pull-n1000.yml @@ -0,0 +1,40 @@ +trigger: none + +variables: + 
SCENARIO_TYPE: perf-eval + SCENARIO_NAME: image-pull-n1000 + +stages: + - stage: azure_australiaeast_image_pull + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - australiaeast + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + topology: image-pull + matrix: + image-pull-1000nodes-anonymous: + desired_nodes: 1004 + node_count: 1000 + max_pods: 7 + repeats: 1 + operation_timeout: 60m + load_type: memory + memory_request_override: "1000Mi" + scrape_containerd: True + scrape_kubelets: False + scrape_registry: False + containerd_scrape_interval: 30s + registry_endpoint: acrperftestaue.azurecr-test.io + kubernetes_version: "1.34" + pod_startup_latency_threshold: 10m + anonymous_pull: True + max_parallel: 1 + credential_type: service_connection + ssh_key_enabled: false + timeout_in_minutes: 100 diff --git a/scenarios/perf-eval/image-pull-n1000/README.md b/scenarios/perf-eval/image-pull-n1000/README.md new file mode 100644 index 0000000000..f02502aa24 --- /dev/null +++ b/scenarios/perf-eval/image-pull-n1000/README.md @@ -0,0 +1,47 @@ +# image-pull-n1000 + +## Overview + +Measures containerd image pulling throughput (MB/s) and network plugin operation metrics using the CRI module with `scrape_containerd: True`. Uses the `image-pull` topology with user-pool nodes labelled `cri-resource-consume=true`. + +**Note**: This test is set up only in the dogfood environment and supports anonymous pull only. 
+ +## Infrastructure + +| Component | Configuration | +|-----------|---------------| +| Cloud Provider | Azure | +| Region | australiaeast | +| Cluster SKU | Standard | +| Network Plugin | Azure CNI Overlay | +| Default Node Pool | 3 x Standard_D4ds_v5 | +| Prometheus Pool | 1 x Standard_D64_v3 (larger size required for 1000 nodes - needs more memory and CPU for Prometheus) | +| User Pool | 1000 x Standard_D4ds_v5 | + +## Test Workload + +| Component | Value | +|-----------|-------| +| Registry | Azure Container Registry (`acrperftestaue.azurecr-test.io`) | +| Image | `e2e-test-images/resource-consumer:1.13` | +| Image Size | ~5GB to ~30GB | + +## Metrics Collected + +### ContainerdCriImagePullingThroughput + +Image pull throughput (MB/s) with the following aggregations: + +| Metric | Description | +|--------|-------------| +| **Avg** | Weighted average throughput per image pull | +| **AvgPerNode** | Unweighted average - each node contributes equally | +| **Count** | Total number of image pulls | +| **Perc50** | 50th percentile (median) throughput across nodes | +| **Perc90** | 90th percentile throughput across nodes | +| **Perc99** | 99th percentile throughput across nodes | + +## References + +- [Best Practices](../../../docs/best-practices.md) +- [Test Scenario Implementation Guide](../../../docs/test-scenario-implementation-guide.md) diff --git a/scenarios/perf-eval/image-pull-n1000/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-n1000/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..c6d9c3e395 --- /dev/null +++ b/scenarios/perf-eval/image-pull-n1000/terraform-inputs/azure.tfvars @@ -0,0 +1,64 @@ +scenario_type = "perf-eval" +scenario_name = "image-pull-n1000" +deletion_delay = "1h" +owner = "acr" + +network_config_list = [ + { + role = "client" + vnet_name = "imgpull-vnet" + vnet_address_space = "10.0.0.0/9" + subnet = [ + { + name = "imgpull-subnet-1" + address_prefix = "10.0.0.0/16" + } + ] + network_security_group_name = "" 
+ nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_config_list = [ + { + role = "client" + aks_name = "img-pull-1000" + dns_prefix = "imgpull" + subnet_name = "imgpull-vnet" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + pod_cidr = "10.0.0.0/9" + service_cidr = "192.168.0.0/16" + dns_service_ip = "192.168.0.10" + } + default_node_pool = { + name = "default" + node_count = 3 + vm_size = "Standard_D4ds_v5" + os_disk_type = "Managed" + only_critical_addons_enabled = true + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D64_v3" + os_disk_type = "Managed" + node_labels = { "prometheus" = "true" } + }, + { + name = "userpool" + node_count = 1000 + auto_scaling_enabled = false + vm_size = "Standard_D4ds_v5" + os_disk_type = "Managed" + node_labels = { "cri-resource-consume" = "true" } + } + ] + } +] diff --git a/scenarios/perf-eval/image-pull-n1000/terraform-test-inputs/azure.json b/scenarios/perf-eval/image-pull-n1000/terraform-test-inputs/azure.json new file mode 100644 index 0000000000..122694a170 --- /dev/null +++ b/scenarios/perf-eval/image-pull-n1000/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id": "test-run", + "region": "australiaeast" +} diff --git a/steps/engine/clusterloader2/cri/execute.yml b/steps/engine/clusterloader2/cri/execute.yml index 8763c6d5b1..779266928c 100644 --- a/steps/engine/clusterloader2/cri/execute.yml +++ b/steps/engine/clusterloader2/cri/execute.yml @@ -29,6 +29,7 @@ steps: --scrape_containerd ${SCRAPE_CONTAINERD:-False} \ --containerd_scrape_interval ${CONTAINERD_SCRAPE_INTERVAL:-15s} \ --host_network ${HOST_NETWORK:-True} \ + ${MEMORY_REQUEST_OVERRIDE:+--memory_request_override "$MEMORY_REQUEST_OVERRIDE"} \ --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ 
--cl2_image ${CL2_IMAGE} \ diff --git a/steps/engine/clusterloader2/large-cluster/validate.yml b/steps/engine/clusterloader2/large-cluster/validate.yml index 2abd5f8542..0ab577266b 100644 --- a/steps/engine/clusterloader2/large-cluster/validate.yml +++ b/steps/engine/clusterloader2/large-cluster/validate.yml @@ -1,6 +1,6 @@ parameters: - name: desired_nodes - type: number + type: string - name: validation_timeout_in_minutes type: number default: 10 diff --git a/steps/topology/image-pull/validate-resources.yml b/steps/topology/image-pull/validate-resources.yml index ac068363ad..6391baeed7 100644 --- a/steps/topology/image-pull/validate-resources.yml +++ b/steps/topology/image-pull/validate-resources.yml @@ -18,4 +18,5 @@ steps: region: ${{ parameters.regions[0] }} - template: /steps/engine/clusterloader2/large-cluster/validate.yml parameters: - desired_nodes: 14 + desired_nodes: $(desired_nodes) + validation_timeout_in_minutes: 20 From a4041c0dc5fc2a7e54d0dde7c187a769cdb87d88 Mon Sep 17 00:00:00 2001 From: vickylin Date: Wed, 18 Feb 2026 14:45:32 +1100 Subject: [PATCH 2/2] resolve merge conflict --- modules/python/clusterloader2/cri/cri.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index 32ae3b7131..7b366dbd0d 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -318,14 +318,17 @@ def main(): "--registry_endpoint", type=str, help="Container registry endpoint" ) parser_override.add_argument( -<<<<<<< HEAD "--test_image", type=str, default="e2e-test-images/resource-consumer:1.13", help="Test image to pull (relative to registry endpoint)" -======= - "--memory_request_override", type=str, default=None, help="Override memory request per pod (e.g., 100Mi, 1Gi, 500Ki)" ->>>>>>> f0f10720 (1000n acr image pull) + ) + + parser_override.add_argument( + "--memory_request_override", + type=str, + default=None, + 
help="Override memory request per pod (e.g., 100Mi, 1Gi, 500Ki)" ) # Sub-command for execute_clusterloader2