Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions modules/python/clusterloader2/cri/cri.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def override_config_clusterloader2(
node_count, node_per_step, max_pods, repeats, operation_timeout,
load_type, scale_enabled, pod_startup_latency_threshold, provider,
registry_endpoint, test_image, os_type, scrape_kubelets, scrape_containerd, containerd_scrape_interval, host_network, override_file):
registry_endpoint, test_image, os_type, scrape_kubelets, scrape_containerd, containerd_scrape_interval, host_network, override_file, memory_request_override=None):
client = KubernetesClient(os.path.expanduser("~/.kube/config"))
nodes = client.get_nodes(label_selector="cri-resource-consume=true")
if len(nodes) == 0:
Expand Down Expand Up @@ -53,12 +53,29 @@ def override_config_clusterloader2(
logger.info(f"Node {node.metadata.name} has {daemonset_count} daemonset pods")
pod_count = max_pods - daemonset_count
cpu_request = cpu_value // pod_count
memory_request_in_ki = math.ceil(memory_value * MEMORY_SCALE_FACTOR // pod_count)
memory_request_in_k = int(memory_request_in_ki // 1.024)
memory_request_in_m = int(memory_request_in_k // 1000)
memory_request = (
memory_request_in_m if os_type == "windows" else memory_request_in_k
)

# Use override if provided, otherwise calculate
if memory_request_override:
if memory_request_override.endswith("Mi"):
memory_request_in_ki = int(memory_request_override.replace("Mi", "")) * 1024
elif memory_request_override.endswith("Gi"):
memory_request_in_ki = int(memory_request_override.replace("Gi", "")) * 1024 * 1024
elif memory_request_override.endswith("Ki"):
memory_request_in_ki = int(memory_request_override.replace("Ki", ""))
else:
memory_request_in_ki = int(memory_request_override)
memory_request_in_k = int(memory_request_in_ki // 1.024)
memory_request_in_m = int(memory_request_in_k // 1000)
memory_request = memory_request_in_m if os_type == "windows" else memory_request_in_k
logger.info(f"Using memory request override: {memory_request_override}")
else:
memory_request_in_ki = math.ceil(memory_value * MEMORY_SCALE_FACTOR // pod_count)
memory_request_in_k = int(memory_request_in_ki // 1.024)
memory_request_in_m = int(memory_request_in_k // 1000)
memory_request = (
memory_request_in_m if os_type == "windows" else memory_request_in_k
)

logger.info(
f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request}, "
f"total pod per node: {pod_count}, os_type: {os_type}"
Expand Down Expand Up @@ -307,6 +324,13 @@ def main():
help="Test image to pull (relative to registry endpoint)"
)

parser_override.add_argument(
"--memory_request_override",
type=str,
default=None,
help="Override memory request per pod (e.g., 100Mi, 1Gi, 500Ki)"
)

# Sub-command for execute_clusterloader2
parser_execute = subparsers.add_parser(
"execute", help="Execute resource consume operation"
Expand Down Expand Up @@ -405,6 +429,7 @@ def main():
args.containerd_scrape_interval,
args.host_network,
args.cl2_override_file,
args.memory_request_override,
)
elif args.command == "execute":
execute_clusterloader2(
Expand Down
32 changes: 30 additions & 2 deletions modules/python/tests/test_cri.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def test_override_command(self, mock_override):
with patch.object(sys, 'argv', test_args):
main()
mock_override.assert_called_once_with(
5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "20s", False, "/tmp/override.yaml"
5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "20s", False, "/tmp/override.yaml", None
)

@patch("clusterloader2.cri.cri.override_config_clusterloader2")
Expand All @@ -274,7 +274,35 @@ def test_override_command_default_host_network(self, mock_override):
with patch.object(sys, 'argv', test_args):
main()
mock_override.assert_called_once_with(
5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "15s", True, "/tmp/override.yaml"
5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "e2e-test-images/resource-consumer:1.13", "linux", False, False, "15s", True, "/tmp/override.yaml", None
)

@patch("clusterloader2.cri.cri.override_config_clusterloader2")
def test_override_command_with_memory_request_override(self, mock_override):
    """CLI 'override' forwards --memory_request_override through to
    override_config_clusterloader2 as the final positional argument."""
    # Flag/value pairs in the exact order the CLI receives them
    # (dicts preserve insertion order, so argv order is deterministic).
    cli_options = {
        "--node_count": "1000",
        "--node_per_step": "1000",
        "--max_pods": "7",
        "--repeats": "1",
        "--operation_timeout": "60m",
        "--load_type": "memory",
        "--scale_enabled": "False",
        "--pod_startup_latency_threshold": "10m",
        "--provider": "aks",
        "--registry_endpoint": "acrperftestaue.azurecr-test.io",
        "--os_type": "linux",
        "--scrape_kubelets": "False",
        "--scrape_containerd": "True",
        "--containerd_scrape_interval": "30s",
        "--host_network": "True",
        "--cl2_override_file": "/tmp/override.yaml",
        "--memory_request_override": "1000Mi",
    }
    argv = ["main.py", "override"]
    for flag, value in cli_options.items():
        argv.extend((flag, value))
    with patch.object(sys, "argv", argv):
        main()
    # The override value ("1000Mi") must arrive last, after the cl2 override file.
    mock_override.assert_called_once_with(
        1000, 1000, 7, 1, "60m", "memory", False, "10m", "aks",
        "acrperftestaue.azurecr-test.io",
        "e2e-test-images/resource-consumer:1.13",
        "linux", False, True, "30s", True, "/tmp/override.yaml", "1000Mi"
    )

@patch("clusterloader2.cri.cri.execute_clusterloader2")
Expand Down
2 changes: 2 additions & 0 deletions pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ stages:
topology: image-pull
matrix:
image-pull-10pods-authenticated:
desired_nodes: 14
node_count: 10
max_pods: 26
repeats: 1
Expand All @@ -41,6 +42,7 @@ stages:
pod_startup_latency_threshold: 600s
anonymous_pull: False
image-pull-10pods-anonymous:
desired_nodes: 14
node_count: 10
max_pods: 26
repeats: 1
Expand Down
40 changes: 40 additions & 0 deletions pipelines/perf-eval/ACR Benchmark/image-pull-n1000.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
trigger: none

variables:
SCENARIO_TYPE: perf-eval
SCENARIO_NAME: image-pull-n1000

stages:
- stage: azure_australiaeast_image_pull
dependsOn: []
jobs:
- template: /jobs/competitive-test.yml
parameters:
cloud: azure
regions:
- australiaeast
engine: clusterloader2
engine_input:
image: "ghcr.io/azure/clusterloader2:v20250513"
topology: image-pull
matrix:
image-pull-1000nodes-anonymous:
desired_nodes: 1004
node_count: 1000
max_pods: 7
repeats: 1
operation_timeout: 60m
load_type: memory
memory_request_override: "1000Mi"
scrape_containerd: True
scrape_kubelets: False
scrape_registry: False
containerd_scrape_interval: 30s
registry_endpoint: acrperftestaue.azurecr-test.io
kubernetes_version: "1.34"
pod_startup_latency_threshold: 10m
anonymous_pull: True
max_parallel: 1
credential_type: service_connection
ssh_key_enabled: false
timeout_in_minutes: 100
47 changes: 47 additions & 0 deletions scenarios/perf-eval/image-pull-n1000/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# image-pull-n1000

## Overview

Measures containerd image pulling throughput (MB/s) and network plugin operation metrics using the CRI module with `scrape_containerd: True`. Uses the `image-pull` topology; user-pool nodes are labeled `cri-resource-consume` so the CRI workload schedules onto them.

**Note**: This test is only set up in dogfood environment with anonymous pull only.

## Infrastructure

| Component | Configuration |
|-----------|---------------|
| Cloud Provider | Azure |
| Region | australiaeast |
| Cluster SKU | Standard |
| Network Plugin | Azure CNI Overlay |
| Default Node Pool | 3 x Standard_D4ds_v5 |
| Prometheus Pool | 1 x Standard_D64_v3 (larger size required for 1000 nodes - needs more memory and CPU for Prometheus) |
| User Pool | 1000 x Standard_D4ds_v5 |

## Test Workload

| Component | Value |
|-----------|-------|
| Registry | Azure Container Registry (`acrperftestaue.azurecr-test.io`) |
| Image | `e2e-test-images/resource-consumer:1.13` |
| Image Size | ~5GB to ~30GB |

## Metrics Collected

### ContainerdCriImagePullingThroughput

Image pull throughput (MB/s) with the following aggregations:

| Metric | Description |
|--------|-------------|
| **Avg** | Weighted average throughput per image pull |
| **AvgPerNode** | Unweighted average - each node contributes equally |
| **Count** | Total number of image pulls |
| **Perc50** | 50th percentile (median) throughput across nodes |
| **Perc90** | 90th percentile throughput across nodes |
| **Perc99** | 99th percentile throughput across nodes |

## References

- [Best Practices](../../../docs/best-practices.md)
- [Test Scenario Implementation Guide](../../../docs/test-scenario-implementation-guide.md)
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
scenario_type = "perf-eval"
scenario_name = "image-pull-n1000"
deletion_delay = "1h"
owner = "acr"

network_config_list = [
{
role = "client"
vnet_name = "imgpull-vnet"
vnet_address_space = "10.0.0.0/9"
subnet = [
{
name = "imgpull-subnet-1"
address_prefix = "10.0.0.0/16"
}
]
network_security_group_name = ""
nic_public_ip_associations = []
nsr_rules = []
}
]

aks_config_list = [
{
role = "client"
aks_name = "img-pull-1000"
dns_prefix = "imgpull"
subnet_name = "imgpull-vnet"
sku_tier = "Standard"
network_profile = {
network_plugin = "azure"
network_plugin_mode = "overlay"
# NOTE(review): pod_cidr is identical to vnet_address_space (10.0.0.0/9).
# Azure CNI Overlay requires the pod CIDR to NOT overlap the VNet address
# space — confirm and use a disjoint range (e.g. 100.64.0.0/10).
pod_cidr            = "10.0.0.0/9"
service_cidr = "192.168.0.0/16"
dns_service_ip = "192.168.0.10"
}
default_node_pool = {
name = "default"
node_count = 3
vm_size = "Standard_D4ds_v5"
os_disk_type = "Managed"
only_critical_addons_enabled = true
temporary_name_for_rotation = "defaulttmp"
}
extra_node_pool = [
{
name = "prompool"
node_count = 1
auto_scaling_enabled = false
vm_size = "Standard_D64_v3"
os_disk_type = "Managed"
node_labels = { "prometheus" = "true" }
},
{
name = "userpool"
node_count = 1000
auto_scaling_enabled = false
vm_size = "Standard_D4ds_v5"
os_disk_type = "Managed"
node_labels = { "cri-resource-consume" = "true" }
}
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"run_id": "test-run",
"region": "australiaeast"
}
1 change: 1 addition & 0 deletions steps/engine/clusterloader2/cri/execute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ steps:
--scrape_containerd ${SCRAPE_CONTAINERD:-False} \
--containerd_scrape_interval ${CONTAINERD_SCRAPE_INTERVAL:-15s} \
--host_network ${HOST_NETWORK:-True} \
${MEMORY_REQUEST_OVERRIDE:+--memory_request_override "$MEMORY_REQUEST_OVERRIDE"} \
--cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml
PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \
--cl2_image ${CL2_IMAGE} \
Expand Down
2 changes: 1 addition & 1 deletion steps/engine/clusterloader2/large-cluster/validate.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
parameters:
- name: desired_nodes
type: number
type: string
- name: validation_timeout_in_minutes
type: number
default: 10
Expand Down
3 changes: 2 additions & 1 deletion steps/topology/image-pull/validate-resources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ steps:
region: ${{ parameters.regions[0] }}
- template: /steps/engine/clusterloader2/large-cluster/validate.yml
parameters:
desired_nodes: 14
desired_nodes: $(desired_nodes)
validation_timeout_in_minutes: 20