Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions modules/python/clusterloader2/cri/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ name: resource-consumer
{{$agentPoolPrefix := DefaultParam .CL2_AGENTPOOL_PREFIX "userpool"}}
{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}}
{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}}
{{$loadType := DefaultParam .CL2_LOAD_TYPE "memory"}}

namespace:
number: 1
Expand Down Expand Up @@ -67,6 +68,7 @@ steps:
MemoryRequest: {{$memoryKi}}
CPURequest: {{$cpu}}m
AgentPool: {{$agentPoolPrefix}}{{$i}}
LoadType: {{$loadType}}

- name: Waiting for latency pods to be running
measurements:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
{{$MemoryRequest := DefaultParam .MemoryRequest "1000Ki"}}
{{$CPURequest := DefaultParam .CPURequest "100m"}}
{{$AgentPool := DefaultParam .AgentPool "userpool1"}}
{{$LoadType := DefaultParam .LoadType "memory"}}

apiVersion: apps/v1
kind: Deployment
Expand All @@ -24,6 +25,7 @@ spec:
nodeSelector:
agentpool: {{$AgentPool}}
containers:
{{if eq $LoadType "memory"}}
- name: resource-consumer-memory
image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
command:
Expand All @@ -40,6 +42,8 @@ spec:
resources:
requests:
memory: {{$MemoryRequest}}
{{end}}
{{if eq $LoadType "cpu"}}
- name: resource-consumer-cpu
image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
command:
Expand All @@ -50,6 +54,7 @@ spec:
resources:
requests:
cpu: {{$CPURequest}}
{{end}}
tolerations:
- key: "cri-resource-consume"
operator: "Equal"
Expand Down
18 changes: 14 additions & 4 deletions modules/python/clusterloader2/cri/cri.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"aws": 2,
"aks": 6
}
MEMORY_SCALE_FACTOR = 0.95 # Use 95% of the total allocatable memory, leaving 5% as an error margin

def _get_daemonsets_pods_allocated_resources(client, node_name):
pods = client.get_pods_by_namespace("kube-system", field_selector=f"spec.nodeName={node_name}")
Expand All @@ -23,7 +24,7 @@ def _get_daemonsets_pods_allocated_resources(client, node_name):
memory_request += int(container.resources.requests.get("memory", "0Mi").replace("Mi", ""))
return cpu_request, memory_request * 1024

def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, provider, override_file):
def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, load_type, provider, override_file):
client = KubernetesClient(os.path.expanduser("~/.kube/config"))
nodes = client.get_nodes(label_selector="cri-resource-consume=true")
if len(nodes) == 0:
Expand All @@ -48,7 +49,7 @@ def override_config_clusterloader2(node_count, max_pods, repeats, operation_time
pod_count = max_pods - DAEMONSETS_PER_NODE_MAP[provider]
replica = pod_count * node_count
cpu_request = cpu_value // pod_count
memory_request_in_Ki = math.ceil(memory_value // pod_count - 20)
memory_request_in_Ki = math.ceil(memory_value * MEMORY_SCALE_FACTOR // pod_count)
memory_request_in_K = int(memory_request_in_Ki // 1.024)
print(f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request_in_K}K, total replica: {replica}")

Expand All @@ -65,6 +66,7 @@ def override_config_clusterloader2(node_count, max_pods, repeats, operation_time
file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n")
file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n")
file.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n")
file.write(f"CL2_LOAD_TYPE: {load_type}\n")

file.close()

Expand All @@ -75,6 +77,7 @@ def collect_clusterloader2(
node_count,
max_pods,
repeats,
load_type,
cl2_report_dir,
cloud_info,
run_id,
Expand All @@ -95,6 +98,7 @@ def collect_clusterloader2(
"node_count": node_count,
"max_pods": max_pods,
"churn_rate": repeats,
"load_type": load_type,
"status": status,
"group": None,
"measurement": None,
Expand Down Expand Up @@ -137,6 +141,8 @@ def main():
parser_override.add_argument("max_pods", type=int, help="Number of maximum pods per node")
parser_override.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment")
parser_override.add_argument("operation_timeout", type=str, default="2m", help="Operation timeout")
parser_override.add_argument("load_type", type=str, choices=["memory", "cpu"],
default="memory", help="Type of load to generate")
parser_override.add_argument("provider", type=str, help="Cloud provider name")
parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file")

Expand All @@ -153,6 +159,8 @@ def main():
parser_collect.add_argument("node_count", type=int, help="Number of nodes")
parser_collect.add_argument("max_pods", type=int, help="Number of maximum pods per node")
parser_collect.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment")
parser_collect.add_argument("load_type", type=str, choices=["memory", "cpu"],
default="memory", help="Type of load to generate")
parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory")
parser_collect.add_argument("cloud_info", type=str, help="Cloud information")
parser_collect.add_argument("run_id", type=str, help="Run ID")
Expand All @@ -162,11 +170,13 @@ def main():
args = parser.parse_args()

if args.command == "override":
override_config_clusterloader2(args.node_count, args.max_pods, args.repeats, args.operation_timeout, args.provider, args.cl2_override_file)
override_config_clusterloader2(args.node_count, args.max_pods, args.repeats, args.operation_timeout, args.load_type,
args.provider, args.cl2_override_file)
elif args.command == "execute":
execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider)
elif args.command == "collect":
collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file)
collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.load_type,
args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file)

if __name__ == "__main__":
main()
54 changes: 48 additions & 6 deletions pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,42 @@ stages:
image: "ghcr.io/azure/clusterloader2:v20241016"
topology: cri-resource-consume
matrix:
n10-p300:
n10-p300-memory:
node_count: 10
max_pods: 30
repeats: 1
operation_timeout: 3m
n10-p700:
load_type: memory
n10-p700-memory:
node_count: 10
max_pods: 70
repeats: 1
operation_timeout: 7m
n10-p1100:
load_type: memory
n10-p1100-memory:
node_count: 10
max_pods: 110
repeats: 1
operation_timeout: 11m
load_type: memory
n10-p300-cpu:
node_count: 10
max_pods: 30
repeats: 1
operation_timeout: 3m
load_type: cpu
n10-p700-cpu:
node_count: 10
max_pods: 70
repeats: 1
operation_timeout: 7m
load_type: cpu
n10-p1100-cpu:
node_count: 10
max_pods: 110
repeats: 1
operation_timeout: 11m
load_type: cpu
max_parallel: 3
timeout_in_minutes: 120
credential_type: service_connection
Expand All @@ -58,21 +79,42 @@ stages:
image: "ghcr.io/azure/clusterloader2:v20241016"
topology: cri-resource-consume
matrix:
n10-p300:
n10-p300-memory:
node_count: 10
max_pods: 30
repeats: 1
operation_timeout: 3m
load_type: memory
n10-p700-memory:
node_count: 10
max_pods: 70
repeats: 1
operation_timeout: 7m
load_type: memory
n10-p1100-memory:
node_count: 10
max_pods: 110
repeats: 1
operation_timeout: 11m
load_type: memory
n10-p300-cpu:
node_count: 10
max_pods: 30
repeats: 1
operation_timeout: 3m
n10-p700:
load_type: cpu
n10-p700-cpu:
node_count: 10
max_pods: 70
repeats: 1
operation_timeout: 7m
n10-p1100:
load_type: cpu
n10-p1100-cpu:
node_count: 10
max_pods: 110
repeats: 1
operation_timeout: 11m
load_type: cpu
max_parallel: 3
timeout_in_minutes: 120
credential_type: service_connection
Expand Down
2 changes: 1 addition & 1 deletion steps/engine/clusterloader2/cri/collect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ steps:
set -eo pipefail

PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \
$NODE_COUNT $MAX_PODS $REPEATS \
$NODE_COUNT $MAX_PODS $REPEATS $LOAD_TYPE \
$CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE
workingDirectory: modules/python/clusterloader2
env:
Expand Down
3 changes: 2 additions & 1 deletion steps/engine/clusterloader2/cri/execute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ steps:
set -eo pipefail

PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \
$NODE_COUNT $MAX_PODS $REPEATS $OPERATION_TIMEOUT $CLOUD ${CL2_CONFIG_DIR}/overrides.yaml
$NODE_COUNT $MAX_PODS $REPEATS $OPERATION_TIMEOUT $LOAD_TYPE \
$CLOUD ${CL2_CONFIG_DIR}/overrides.yaml
PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \
${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD
workingDirectory: modules/python/clusterloader2
Expand Down