From 1a389c0570a5fa3ba68c12e3c0aa09e137573f6b Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Wed, 11 Feb 2026 17:07:06 +0000 Subject: [PATCH 01/10] Add ACNS observability test with Cilium dataplane --- .../clusterloader2/scale/config/config.yaml | 94 +++++ .../scale/config/modules/ama-logs.yaml | 26 ++ .../config/modules/ama-logs/podmonitor.yaml | 26 ++ .../modules/fortio/client-deployment.yaml | 89 +++++ .../modules/fortio/server-deployment.yaml | 68 ++++ .../scale/config/modules/fortio/service.yaml | 10 + .../scale/config/modules/hubble.yaml | 26 ++ .../config/modules/hubble/podmonitor.yaml | 24 ++ .../config/modules/measurements/ama-logs.yaml | 191 +++++++++ .../config/modules/measurements/cilium.yaml | 213 ++++++++++ .../modules/measurements/control-plane.yaml | 86 ++++ .../modules/measurements/node-disk.yaml | 67 ++++ .../config/modules/measurements/retina.yaml | 202 ++++++++++ .../modules/networkpolicy-template.yaml | 21 + .../scale/config/modules/node-exporter.yaml | 95 +++++ .../modules/node-exporter/clusterrole.yaml | 22 ++ .../node-exporter/clusterrolebinding.yaml | 16 + .../modules/node-exporter/daemonset.yaml | 119 ++++++ .../modules/node-exporter/networkpolicy.yaml | 26 ++ .../config/modules/node-exporter/service.yaml | 18 + .../modules/node-exporter/serviceaccount.yaml | 9 + .../modules/node-exporter/servicemonitor.yaml | 32 ++ .../modules/pfl/retinanetworkflowlog.yaml | 24 ++ .../scale/config/modules/scale-test.yaml | 102 +++++ .../scale/config/modules/test-steps.yaml | 370 ++++++++++++++++++ modules/python/clusterloader2/scale/scale.py | 227 +++++++++++ ...Usage_scale-test_2025-03-04T05:35:56Z.json | 29 ++ .../tests/mock_data/scale/report/junit.xml | 7 + modules/python/tests/test_scale.py | 284 ++++++++++++++ .../CNI Benchmark/cnl-observability.yml | 50 +++ .../terraform-inputs/azure.tfvars | 85 ++++ .../terraform-test-inputs/azure.json | 4 + steps/engine/clusterloader2/scale/collect.yml | 47 +++ steps/engine/clusterloader2/scale/execute.yml | 
107 +++++ .../observability/collect-clusterloader2.yml | 26 ++ .../observability/execute-clusterloader2.yml | 25 ++ .../observability/validate-resources.yml | 17 + 37 files changed, 2884 insertions(+) create mode 100644 modules/python/clusterloader2/scale/config/config.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/ama-logs.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/ama-logs/podmonitor.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/fortio/client-deployment.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/fortio/server-deployment.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/fortio/service.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/hubble.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/hubble/podmonitor.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/measurements/ama-logs.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/measurements/cilium.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/measurements/control-plane.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/measurements/node-disk.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/measurements/retina.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/networkpolicy-template.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml create mode 100644 
modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/pfl/retinanetworkflowlog.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/scale-test.yaml create mode 100644 modules/python/clusterloader2/scale/config/modules/test-steps.yaml create mode 100644 modules/python/clusterloader2/scale/scale.py create mode 100644 modules/python/tests/mock_data/scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json create mode 100644 modules/python/tests/mock_data/scale/report/junit.xml create mode 100644 modules/python/tests/test_scale.py create mode 100644 pipelines/perf-eval/CNI Benchmark/cnl-observability.yml create mode 100644 scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json create mode 100644 steps/engine/clusterloader2/scale/collect.yml create mode 100644 steps/engine/clusterloader2/scale/execute.yml create mode 100644 steps/topology/observability/collect-clusterloader2.yml create mode 100644 steps/topology/observability/execute-clusterloader2.yml create mode 100644 steps/topology/observability/validate-resources.yml diff --git a/modules/python/clusterloader2/scale/config/config.yaml b/modules/python/clusterloader2/scale/config/config.yaml new file mode 100644 index 0000000000..a5ae521044 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/config.yaml @@ -0,0 +1,94 @@ +name: scale-test + +# generic config +{{$groupName := DefaultParam .CL2_GROUP_NAME "scale-test"}} 
+{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 5}} + +# topology config +{{$namespaces := DefaultParam .CL2_NAMESPACES 1}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 10}} +{{$replicasPerNamespace := DefaultParam .CL2_REPLICAS_PER_NAMESPACE 5}} + +# topology config +{{$fortioServerReplicas := DefaultParam .CL2_FORTIO_SERVERS_PER_DEPLOYMENT 10}} +{{$fortioClientReplicas := DefaultParam .CL2_FORTIO_CLIENTS_PER_DEPLOYMENT 10}} +{{$fortioClientQueriesPerSecond := DefaultParam .CL2_FORTIO_CLIENT_QUERIES_PER_SECOND 1000}} +{{$fortioClientConnections := DefaultParam .CL2_FORTIO_CLIENT_CONNECTIONS 10}} +{{$fortioNamespaces := DefaultParam .CL2_FORTIO_NAMESPACES 1}} +{{$fortioDeploymentsPerNamespace := DefaultParam .CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE 1}} + +{{$labelTrafficPods := DefaultParam .CL2_LABEL_TRAFFIC_PODS false}} + +# Retina Network Flow Log config +{{$createRetinaNetworkFlowLogs := DefaultParam .CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS false}} + + +namespace: + number: {{$namespaces}} + prefix: scale-test + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + - name: Log + measurements: + - Identifier: Dummy + Method: Sleep + Params: + action: start + duration: 1ms + + - module: + path: /modules/node-exporter.yaml + params: + actionName: "create" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/ama-logs.yaml + params: + actionName: "create" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/hubble.yaml + params: + actionName: "create" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/scale-test.yaml + params: + action: start + group: {{$groupName}} + createRetinaNetworkFlowLogs: {{$createRetinaNetworkFlowLogs}} + 
labelTrafficPods: {{$labelTrafficPods}} + + - module: + path: /modules/hubble.yaml + params: + actionName: "delete" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/ama-logs.yaml + params: + actionName: "delete" + tuningSet: DeploymentCreateQps + + # TODO: Remove this module once there's a way to deploy node exporter that works in perf-tests repository + - module: + path: /modules/node-exporter.yaml + params: + actionName: "delete" + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/scale/config/modules/ama-logs.yaml b/modules/python/clusterloader2/scale/config/modules/ama-logs.yaml new file mode 100644 index 0000000000..683f653d75 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/ama-logs.yaml @@ -0,0 +1,26 @@ +## AMA Logs module creates AMA Logs pod monitor + +# Tuning set +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} + +# interval +{{$interval := DefaultParam .interval "15s"}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: + - name: {{.actionName}} AMA Logs Pod Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/ama-logs/podmonitor.yaml" + basename: ama-logs-metrics + interval: 15s diff --git a/modules/python/clusterloader2/scale/config/modules/ama-logs/podmonitor.yaml b/modules/python/clusterloader2/scale/config/modules/ama-logs/podmonitor.yaml new file mode 100644 index 0000000000..04f352560b --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/ama-logs/podmonitor.yaml @@ -0,0 +1,26 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: ama-logs-metrics + namespace: monitoring +spec: + jobLabel: ama-logs-metrics + selector: + matchLabels: + component: ama-logs-agent + namespaceSelector: + matchNames: 
+ - kube-system + podMetricsEndpoints: + - interval: 30s + honorLabels: true + path: /metrics + relabelings: + - sourceLabels: [__address__] + action: replace + targetLabel: __address__ + regex: (.+?)(\:\d+)? + replacement: $1:9102 + - sourceLabels: [__meta_kubernetes_pod_container_name] + regex: "ama-logs" + action: keep diff --git a/modules/python/clusterloader2/scale/config/modules/fortio/client-deployment.yaml b/modules/python/clusterloader2/scale/config/modules/fortio/client-deployment.yaml new file mode 100644 index 0000000000..d962de81ed --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/fortio/client-deployment.yaml @@ -0,0 +1,89 @@ +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20Mi"}} +{{$Image := DefaultParam .Image "acnpublic.azurecr.io/fortio"}} +{{$FortioClientQueriesPerSecond := .FortioClientQueriesPerSecond}} +{{$FortioClientConnections := .FortioClientConnections}} +{{$uniqueLabel := DefaultParam .uniqueLabel ""}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} + app: fortio + role: load +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + app: fortio + role: load + strategy: + type: Recreate + template: + metadata: + annotations: + retina.sh: observe + labels: + name: {{.Name}} + group: {{.Group}} + app: fortio + role: load + restart: {{.deploymentLabel}} + {{if ne $uniqueLabel ""}} + uniqueLabelPerDeployment: "{{$uniqueLabel}}{{.Index}}" + {{end}} + spec: + nodeSelector: + scale-test: "true" + containers: + - name: fortio + image: {{$Image}} + imagePullPolicy: IfNotPresent + args: + [ + "load", + "-nocatchup", + "-uniform", + "-sequential-warmup", + "-jitter", + "-udp-timeout", + "1500ms", + "-timeout", + "60s", + "-connection-reuse", + "{{$FortioClientConnections}}:{{$FortioClientConnections}}", + "-c", + "{{$FortioClientConnections}}", + "-qps", + "{{$FortioClientQueriesPerSecond}}", + "-t", + 
"0", + "http://{{.FortioServerServiceBasename}}-{{.Index}}:8080" + ] + ports: + - containerPort: 8078 # tcp echo + - containerPort: 8079 # grpc echo + - containerPort: 8080 # main serving port + - containerPort: 8081 # redirection to https port + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "scale-test" + operator: "Equal" + value: "true" + effect: "NoSchedule" diff --git a/modules/python/clusterloader2/scale/config/modules/fortio/server-deployment.yaml b/modules/python/clusterloader2/scale/config/modules/fortio/server-deployment.yaml new file mode 100644 index 0000000000..a0c1274c17 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/fortio/server-deployment.yaml @@ -0,0 +1,68 @@ +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20Mi"}} +{{$Image := DefaultParam .Image "acnpublic.azurecr.io/fortio"}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} + app: fortio + role: server + svc: {{.Name}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + app: fortio + role: server + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 20% + maxSurge: 20% + template: + metadata: + annotations: + retina.sh: observe + labels: + name: {{.Name}} + group: {{.Group}} + app: fortio + role: server + svc: {{.Name}} + restart: {{.deploymentLabel}} + spec: + nodeSelector: + scale-test: "true" + containers: + - name: fortio + image: {{$Image}} + imagePullPolicy: IfNotPresent + args: ["server", "-http-port", "0.0.0.0:8080"] + ports: + - containerPort: 
8078 # tcp echo + - containerPort: 8079 # grpc echo + - containerPort: 8080 # main serving port + - containerPort: 8081 # redirection to https port + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "scale-test" + operator: "Equal" + value: "true" + effect: "NoSchedule" diff --git a/modules/python/clusterloader2/scale/config/modules/fortio/service.yaml b/modules/python/clusterloader2/scale/config/modules/fortio/service.yaml new file mode 100644 index 0000000000..a3ebcb2d65 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/fortio/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} +spec: + selector: + svc: {{.Name}} + ports: + - port: 8080 + targetPort: 8080 diff --git a/modules/python/clusterloader2/scale/config/modules/hubble.yaml b/modules/python/clusterloader2/scale/config/modules/hubble.yaml new file mode 100644 index 0000000000..744fb84927 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/hubble.yaml @@ -0,0 +1,26 @@ +## Hubble module creates Hubble pod monitor + +# Tuning set +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} + +# interval +{{$interval := DefaultParam .interval "15s"}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: + - name: {{.actionName}} Hubble Pod Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/hubble/podmonitor.yaml" + 
basename: hubble-metrics + interval: 15s diff --git a/modules/python/clusterloader2/scale/config/modules/hubble/podmonitor.yaml b/modules/python/clusterloader2/scale/config/modules/hubble/podmonitor.yaml new file mode 100644 index 0000000000..21e792ad9a --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/hubble/podmonitor.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: hubble-metrics + namespace: monitoring +spec: + # Hubble metrics are exposed by cilium-agent pods on port 9965. + # This PodMonitor scrapes Hubble metrics from cilium-agent. + selector: + matchLabels: + k8s-app: cilium + namespaceSelector: + matchNames: + - kube-system + podMetricsEndpoints: + - interval: 30s + honorLabels: true + path: /metrics + relabelings: + - sourceLabels: [__address__] + action: replace + targetLabel: __address__ + regex: (.+?)(\:\d+)? + replacement: $1:9965 diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/ama-logs.yaml b/modules/python/clusterloader2/scale/config/modules/measurements/ama-logs.yaml new file mode 100644 index 0000000000..84c6e28e31 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/measurements/ama-logs.yaml @@ -0,0 +1,191 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional AMA-Logs Measurements + measurements: + - Identifier: AMALogsAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, 
avg_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - Identifier: AMALogsMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - Identifier: AMALogsContainerFsAvgWrittenBytes{{$suffix}} + Method: 
GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsContainerFsMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsContainerFsAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / 
rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - Identifier: AMALogsContainerFsMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - Identifier: AMALogsContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="ama-logs"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="ama-logs"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="ama-logs"}[%v])[%v:])) + + - Identifier: AMALogsNetworkFlowInputAvgRecordsPerSecond{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Network Flow Input Avg Records Per Second {{$suffix}} + metricVersion: v1 + unit: "#/s" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(fluentbit_input_records_total{name=~"oms_networkflow_input"}[5m])[%v:])) + - name: Perc90 + query: 
quantile(0.90, avg_over_time(rate(fluentbit_input_records_total{name=~"oms_networkflow_input"}[5m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(fluentbit_input_records_total{name=~"oms_networkflow_input"}[5m])[%v:])) + + - Identifier: AMALogsNetworkFlowOutputAvgRecordsPerSecond{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Network Flow Output Avg Records Per Second {{$suffix}} + metricVersion: v1 + unit: "#/s" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(fluentbit_output_proc_records_total{name=~"oms_network_flow_output"}[5m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(fluentbit_output_proc_records_total{name=~"oms_network_flow_output"}[5m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(fluentbit_output_proc_records_total{name=~"oms_network_flow_output"}[5m])[%v:])) + + - Identifier: AMALogsNetworkFlowDroppedAvgRecordsPerSecond{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Network Flow Dropped Avg Records Per Second {{$suffix}} + metricVersion: v1 + unit: "#/s" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(fluentbit_filter_drop_records_total{name=~"oms_networkflow_throttle"}[5m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(fluentbit_filter_drop_records_total{name=~"oms_networkflow_throttle"}[5m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(fluentbit_filter_drop_records_total{name=~"oms_networkflow_throttle"}[5m])[%v:])) + diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/cilium.yaml b/modules/python/clusterloader2/scale/config/modules/measurements/cilium.yaml new file mode 100644 index 0000000000..c6f715cfb2 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/measurements/cilium.yaml @@ -0,0 +1,213 @@ 
+{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Cilium Measurements + measurements: + - Identifier: CiliumAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: 
MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumOperatorAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Avg CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumOperatorMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumOperatorMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, 
max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumOperatorAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumContainerFsAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - Identifier: CiliumContainerFsMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: 
Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - Identifier: CiliumContainerFsAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - Identifier: CiliumContainerFsMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - Identifier: CiliumContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container Restarts {{$suffix}} + 
metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + # - Identifier: AvgCiliumHubbleMetricsCardinality{{$suffix}} + # Method: GenericPrometheusQuery + # Params: + # action: {{$action}} + # metricName: Average Cilium Hubble Metrics Cardinality {{$suffix}} + # metricVersion: v1 + # unit: "#" + # enableViolations: true + # queries: + # - name: Avg + # query: count({__name__=~"hubble_.*"}) diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/scale/config/modules/measurements/control-plane.yaml new file mode 100644 index 0000000000..47504cbf89 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/measurements/control-plane.yaml @@ -0,0 +1,86 @@ +{{$action := .action}} # start, gather + +# Feature gates +{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}} +{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}} +{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}} +{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}} + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Measurements + measurements: + - Identifier: APIResponsivenessPrometheus{{$suffix}} + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} + enableViolations: 
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}} + useSimpleLatencyQuery: true + - Identifier: PodStartupLatency{{$suffix}} + Method: PodStartupLatency + Params: + action: {{$action}} + labelSelector: group = {{.group}} + threshold: {{$podStartupLatencyThreshold}} + - Identifier: ApiserverAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - Identifier: ApiserverMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - Identifier: ApiserverAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Average Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, 
avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - Identifier: ApiserverMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/node-disk.yaml b/modules/python/clusterloader2/scale/config/modules/measurements/node-disk.yaml new file mode 100644 index 0000000000..06efe84dd8 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/measurements/node-disk.yaml @@ -0,0 +1,67 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Node Disk Measurements + measurements: + - Identifier: NodeDiskAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - Identifier: NodeDiskMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: 
+ - name: Perc99 + query: quantile(0.99, max_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - Identifier: NodeDiskAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - Identifier: NodeDiskMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/retina.yaml b/modules/python/clusterloader2/scale/config/modules/measurements/retina.yaml new file mode 100644 index 0000000000..89d83da2a7 --- /dev/null +++ 
b/modules/python/clusterloader2/scale/config/modules/measurements/retina.yaml @@ -0,0 +1,202 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Retina Measurements + measurements: + - Identifier: RetinaAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, 
avg_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - Identifier: RetinaMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Max Memory Usage{{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - Identifier: RetinaOperatorAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Avg CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - Identifier: RetinaOperatorMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - Identifier: 
RetinaOperatorMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - Identifier: RetinaOperatorAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - Identifier: RetinaContainerFsAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaContainerFsMaxWrittenBytes{{$suffix}} + Method: 
GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaContainerFsAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - Identifier: RetinaContainerFsMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / 
rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - Identifier: RetinaContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="retina"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="retina"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="retina"}[%v])[%v:])) diff --git a/modules/python/clusterloader2/scale/config/modules/networkpolicy-template.yaml b/modules/python/clusterloader2/scale/config/modules/networkpolicy-template.yaml new file mode 100644 index 0000000000..f89c57cdac --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/networkpolicy-template.yaml @@ -0,0 +1,21 @@ +# NetworkPolicy for API/etcd object scale testing. +# Uses dummy labels to create policy objects without affecting actual traffic. +# Purpose: measure API server, etcd, and controller load from policy churn. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: scale-test-policy +spec: + egress: + - {} + ingress: + - from: + - podSelector: + matchLabels: + dummy-label: dummy-value-{{.Index}} + podSelector: + matchLabels: + dummy-selector: dummy-value-{{.Index}} + policyTypes: + - Egress + - Ingress diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml new file mode 100644 index 0000000000..5db6526c4c --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml @@ -0,0 +1,95 @@ +## Node Exporter module creates Node Exporter components + +# Tuning set +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} +# interval +{{$interval := DefaultParam .interval "15s"}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: + - name: Start measurements + measurements: + - Identifier: WaitForNodeExporterPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: DaemonSet + labelSelector: app.kubernetes.io/name = node-exporter + operationTimeout: 5m + - name: {{.actionName}} Node Exporter Service Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/servicemonitor.yaml" + basename: node-exporter + interval: 15s + - name: {{.actionName}} Node Exporter Service Account + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/serviceaccount.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Cluster Role + phases: + - namespaceList: + - "" + replicasPerNamespace: 
{{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/clusterrole.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Cluster Role Binding + phases: + - namespaceList: + - "" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/clusterrolebinding.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Daemonset + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/daemonset.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Network Policy + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/networkpolicy.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Services + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/service.yaml" + basename: node-exporter + - name: Wait for pods to be running + measurements: + - Identifier: WaitForNodeExporterPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: gather diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml new file mode 100644 index 0000000000..2b4003ad4d --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml @@ -0,0 +1,22 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 
1.9.1 + name: node-exporter +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml new file mode 100644 index 0000000000..a3efb6f78e --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: +- kind: ServiceAccount + name: node-exporter + namespace: monitoring diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml new file mode 100644 index 0000000000..d7a952b55b --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml @@ -0,0 +1,119 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: node-exporter +spec: + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: node-exporter + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + spec: + # kube-rbac-proxy needs SA token to call TokenReview/SubjectAccessReview for authn/authz + automountServiceAccountToken: 
true + containers: + - args: + - --web.listen-address=127.0.0.1:9100 + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --path.udev.data=/host/root/run/udev/data + - --no-collector.wifi + - --no-collector.hwmon + - --no-collector.btrfs + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run/k3s/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ + - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ + image: quay.io/prometheus/node-exporter:v1.9.1 + name: node-exporter + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + - args: + - --secure-listen-address=[$(IP)]:9100 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/brancz/kube-rbac-proxy:v0.19.0 + name: kube-rbac-proxy + ports: + - containerPort: 9100 + hostPort: 9100 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + # hostNetwork and hostPID required for node-exporter collectors to access host-level + # metrics (network stats, process info). 
hostPort 9100 used by kube-rbac-proxy. + hostNetwork: true + hostPID: true + nodeSelector: + kubernetes.io/os: linux + priorityClassName: system-cluster-critical + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: node-exporter + tolerations: + - operator: Exists + volumes: + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root + updateStrategy: + rollingUpdate: + maxUnavailable: 10% + type: RollingUpdate + diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml new file mode 100644 index 0000000000..00f7859945 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml @@ -0,0 +1,26 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: node-exporter +spec: + egress: + - {} + ingress: + - from: + - podSelector: + matchLabels: + app.kubernetes.io/name: prometheus + ports: + - port: 9100 + protocol: TCP + podSelector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + policyTypes: + - Egress + - Ingress diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml new file mode 100644 index 0000000000..625c50a714 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: node-exporter +spec: + clusterIP: None + ports: + - name: https + port: 9100 + targetPort: https + selector: + 
app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml new file mode 100644 index 0000000000..e98c9208ba --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: node-exporter + diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml new file mode 100644 index 0000000000..4578d39c33 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml @@ -0,0 +1,32 @@ +{{$interval := DefaultParam .interval "15s"}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: node-exporter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: {{$interval}} + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + diff --git a/modules/python/clusterloader2/scale/config/modules/pfl/retinanetworkflowlog.yaml b/modules/python/clusterloader2/scale/config/modules/pfl/retinanetworkflowlog.yaml new file mode 100644 index 0000000000..653f1edced --- /dev/null 
+++ b/modules/python/clusterloader2/scale/config/modules/pfl/retinanetworkflowlog.yaml @@ -0,0 +1,24 @@ +{{$namespaces := .namespaces}} +apiVersion: acn.azure.com/v1alpha1 +kind: ContainerNetworkLog +metadata: + name: test +spec: + includefilters: # List of filters + {{range $i := Loop $namespaces}} + - name: filter-{{ AddInt $i 1 }} # Filter name + from: + namespacedPod: + - scale-test-{{ AddInt $i 1 }}/fortio-client- + to: + namespacedPod: + - scale-test-{{ AddInt $i 1 }}/fortio-server- + protocol: + - tcp + - dns + - udp + verdict: + - forwarded + - dropped + {{end}} + diff --git a/modules/python/clusterloader2/scale/config/modules/scale-test.yaml b/modules/python/clusterloader2/scale/config/modules/scale-test.yaml new file mode 100644 index 0000000000..d1c0f941bf --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/scale-test.yaml @@ -0,0 +1,102 @@ +name: scale-test + +# generic config +{{$groupName := DefaultParam .CL2_GROUP_NAME "scale-test"}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} + +# topology config +{{$namespaces := DefaultParam .CL2_NAMESPACES 1}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 10}} +{{$replicasPerNamespace := DefaultParam .CL2_REPLICAS_PER_NAMESPACE 5}} + +# topology config +{{$fortioServerReplicas := DefaultParam .CL2_FORTIO_SERVERS_PER_DEPLOYMENT 10}} +{{$fortioClientReplicas := DefaultParam .CL2_FORTIO_CLIENTS_PER_DEPLOYMENT 10}} +{{$fortioClientQueriesPerSecond := DefaultParam .CL2_FORTIO_CLIENT_QUERIES_PER_SECOND 1000}} +{{$fortioClientConnections := DefaultParam .CL2_FORTIO_CLIENT_CONNECTIONS 10}} +{{$fortioNamespaces := DefaultParam .CL2_FORTIO_NAMESPACES 1}} +{{$fortioDeploymentsPerNamespace := DefaultParam .CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE 1}} + +# Network Policies +{{$networkPoliciesPerNamespace := DefaultParam .CL2_NETWORK_POLICIES_PER_NAMESPACE 0}} + +# Retina Network Flow Log config +{{$createRetinaNetworkFlowLogs := DefaultParam 
.createRetinaNetworkFlowLogs false}} + +steps: + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/retina.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/ama-logs.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/node-disk.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/test-steps.yaml + params: + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + Group: {{$groupName}} + namespaces: {{$fortioNamespaces}} + fortioDeploymentsPerNamespace: {{$fortioDeploymentsPerNamespace}} + fortioServerReplicas: {{$fortioServerReplicas}} + fortioClientReplicas: {{$fortioClientReplicas}} + fortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + fortioClientConnections: {{$fortioClientConnections}} + createRetinaNetworkFlowLogs: {{$createRetinaNetworkFlowLogs}} + deploymentLabel: start + networkPoliciesPerNamespace: {{$networkPoliciesPerNamespace}} + labelTrafficPods: {{.labelTrafficPods}} + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/retina.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/ama-logs.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/node-disk.yaml + params: + action: gather + group: {{$groupName}} diff --git a/modules/python/clusterloader2/scale/config/modules/test-steps.yaml b/modules/python/clusterloader2/scale/config/modules/test-steps.yaml new file mode 100644 index 
0000000000..29628b6a42 --- /dev/null +++ b/modules/python/clusterloader2/scale/config/modules/test-steps.yaml @@ -0,0 +1,370 @@ +## Input params +{{$tuningSet := .tuningSet}} +{{$operationTimeout := .operationTimeout}} +{{$Group := .Group}} +{{$namespaces := .namespaces}} +{{$deploymentLabel := .deploymentLabel}} +{{$networkPoliciesPerNamespace := .networkPoliciesPerNamespace}} +{{$fortioDeploymentsPerNamespace := .fortioDeploymentsPerNamespace}} +{{$fortioServerReplicas := .fortioServerReplicas}} +{{$fortioClientReplicas := .fortioClientReplicas}} +{{$fortioClientQueriesPerSecond := .fortioClientQueriesPerSecond}} +{{$fortioClientConnections := .fortioClientConnections}} +{{$CpuRequest := .CpuRequest}} +{{$MemoryRequest := .MemoryRequest}} +{{$createRetinaNetworkFlowLogs := .createRetinaNetworkFlowLogs}} + +steps: +{{if $createRetinaNetworkFlowLogs}} +- name: Create Retina Network Flow Log + phases: + - namespaceList: + - "" + replicasPerNamespace: 1 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: retina-network-flow-log + objectTemplatePath: modules/pfl/retinanetworkflowlog.yaml + templateFillMap: + Group: {{.Group}} + namespaces: {{$namespaces}} +{{end}} + +- name: Starting measurement for 'create traffic services' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForServerPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = server + Params: + action: start + checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +- name: "create traffic services" + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/service.yaml + +- name: 'create servers' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 
{{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/server-deployment.yaml + templateFillMap: + Replicas: {{$fortioServerReplicas}} + # SvcName: fortio-server-service + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + +- name: Waiting for 'create servers' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForServerPodsRunning + Params: + action: gather + refreshInterval: 15s + +- name: Starting measurement for 'create clients' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = load + Params: + action: start + checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +# Create clients after the servers have been created because we want all servers to be backend pods for their service +- name: 'create clients' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: {{$fortioClientConnections}} + FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +- name: Waiting for 'create clients' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + action: gather + refreshInterval: 15s + +{{if .labelTrafficPods}} + 
+- name: Starting measurement for 'label traffic clients' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = load + Params: + action: start + checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +- name: 'add label to client pods in deployment' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + uniqueLabel: "uniqueLabel" + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: {{$fortioClientConnections}} + FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +- name: Waiting for 'label traffic clients' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + action: gather + refreshInterval: 15s + +{{end}} + +- name: 'create {{$networkPoliciesPerNamespace}} network policies per namespace' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$networkPoliciesPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: scale-test-policy + objectTemplatePath: modules/networkpolicy-template.yaml + +{{if .labelTrafficPods}} + +- name: Starting measurement for 'remove labels from traffic clients' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = load + Params: + action: start + 
checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +- name: 'remove label from client pods in deployment' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: {{$fortioClientConnections}} + FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +- name: Waiting for 'remove labels from traffic clients' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + action: gather + refreshInterval: 15s + +- name: Starting measurement for 're-add labels to traffic clients' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = load + Params: + action: start + checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +- name: 're-add label to client pods in deployment' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + uniqueLabel: "uniqueLabel" + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: 
{{$fortioClientConnections}} + FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +- name: Waiting for 're-add labels to traffic clients' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + action: gather + refreshInterval: 15s + +{{end}} + +- name: Wait to get metrics + measurements: + - Identifier: Dummy + Method: Sleep + Params: + action: start + duration: 15m + +- name: 'delete network policies' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: scale-test-policy + objectTemplatePath: modules/networkpolicy-template.yaml +- name: "delete k8s services" + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/service.yaml + +- name: 'delete servers' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/server-deployment.yaml + templateFillMap: + Replicas: {{$fortioServerReplicas}} + # SvcName: fortio-server-service + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + +- name: 'delete clients' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: {{$fortioClientConnections}} + FortioClientQueriesPerSecond: 
{{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +{{if $createRetinaNetworkFlowLogs}} +# Delete retina network flow log +- name: Delete Retina Network Flow Log + phases: + - namespaceList: + - "" + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: retina-network-flow-log + objectTemplatePath: modules/pfl/retinanetworkflowlog.yaml + templateFillMap: + Group: {{.Group}} + namespaces: {{$namespaces}} +{{end}} diff --git a/modules/python/clusterloader2/scale/scale.py b/modules/python/clusterloader2/scale/scale.py new file mode 100644 index 0000000000..ebc0a4a73d --- /dev/null +++ b/modules/python/clusterloader2/scale/scale.py @@ -0,0 +1,227 @@ +import json +import os +import argparse + +from datetime import datetime, timezone +from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports +from utils.common import str2bool + +def configure_clusterloader2( + fortio_servers_per_deployment, + fortio_clients_per_deployment, + fortio_client_queries_per_second, + fortio_client_connections, + fortio_namespaces, + fortio_deployments_per_namespace, + network_policies_per_namespace, + generate_retina_network_flow_logs, + label_traffic_pods, + override_file): + + with open(override_file, 'w', encoding='utf-8') as file: + file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") + file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0\n") + file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 100.0\n") + file.write("CL2_PROMETHEUS_CPU_SCALE_FACTOR: 30.0\n") + file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true\n") + file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n") + file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n") + file.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n") + file.write(f"CL2_LABEL_TRAFFIC_PODS: {label_traffic_pods}\n") + + # topology config + file.write(f"CL2_FORTIO_SERVERS_PER_DEPLOYMENT: {fortio_servers_per_deployment}\n") + 
file.write(f"CL2_FORTIO_CLIENTS_PER_DEPLOYMENT: {fortio_clients_per_deployment}\n") + file.write(f"CL2_FORTIO_CLIENT_QUERIES_PER_SECOND: {fortio_client_queries_per_second}\n") + file.write(f"CL2_FORTIO_CLIENT_CONNECTIONS: {fortio_client_connections}\n") + file.write(f"CL2_FORTIO_NAMESPACES: {fortio_namespaces}\n") + file.write(f"CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE: {fortio_deployments_per_namespace}\n") + file.write("CL2_FORTIO_POD_CPU: 10m\n") + file.write("CL2_FORTIO_POD_MEMORY: 50Mi\n") + file.write(f"CL2_NETWORK_POLICIES_PER_NAMESPACE: {network_policies_per_namespace}\n") + file.write(f"CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS: {generate_retina_network_flow_logs}\n") + + with open(override_file, 'r', encoding='utf-8') as file: + print(f"Content of file {override_file}:\n{file.read()}") + +def execute_clusterloader2( + cl2_image, + cl2_config_dir, + cl2_report_dir, + cl2_config_file, + kubeconfig, + provider, + scrape_containerd +): + run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, + cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True, + scrape_containerd=scrape_containerd, tear_down_prometheus=True, + scrape_kubelets=True, scrape_ksm=True, + scrape_metrics_server=True) + + +def collect_clusterloader2( + cl2_report_dir, + cloud_info, + run_id, + run_url, + result_file, + test_type, + start_timestamp, + observability_tool, + repository, + repository_ref, + fortio_servers_per_deployment, + fortio_clients_per_deployment, + fortio_client_queries_per_second, + fortio_client_connections, + fortio_namespaces, + fortio_deployments_per_namespace, + network_policies_per_namespace, + generate_retina_network_flow_logs=False, + label_traffic_pods=False, + trigger_reason="", +): + details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2) + json_data = json.loads(details) + testsuites = json_data["testsuites"] + + if testsuites: + status = "success" if testsuites[0]["failures"] == 0 else "failure" + 
else: + raise Exception(f"No testsuites found in the report! Raw data: {details}") + + # TODO: Expose optional parameter to include test details + template = { + "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + "status": status, + "group": None, + "measurement": None, + "result": None, + "observability_tool": observability_tool, + "test_details": { + # add more details here about tests (e.g. features tested) + "trigger_reason": trigger_reason, + "observability_tool": observability_tool, + "repository": repository, + "repository_ref": repository_ref, + "traffic_generator": "fortio", + "traffic_namespaces": fortio_namespaces, + "traffic_deployments_per_namespace": fortio_deployments_per_namespace, + "traffic_servers_per_deployment": fortio_servers_per_deployment, + "traffic_clients_per_deployment": fortio_clients_per_deployment, + "traffic_pods": fortio_namespaces * fortio_deployments_per_namespace * (fortio_clients_per_deployment + fortio_servers_per_deployment), + "network_policies": network_policies_per_namespace, + "generate_retina_network_flow_logs": generate_retina_network_flow_logs, + "label_traffic_pods": label_traffic_pods, + "requests_per_second": fortio_client_queries_per_second, + "details": testsuites[0]["testcases"][0].get("failure", None) if testsuites[0].get("testcases") else None, + }, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url, + "test_type": test_type, + "start_timestamp": start_timestamp, + # parameters + "fortio_servers_per_deployment": fortio_servers_per_deployment, + "fortio_clients_per_deployment": fortio_clients_per_deployment, + "fortio_client_queries_per_second": fortio_client_queries_per_second, + "fortio_client_connections": fortio_client_connections, + "fortio_namespaces": fortio_namespaces, + "fortio_deployments_per_namespace": fortio_deployments_per_namespace, + } + content = process_cl2_reports(cl2_report_dir, template) + + os.makedirs(os.path.dirname(result_file), exist_ok=True) + 
with open(result_file, 'w', encoding='utf-8') as file: + file.write(content) + +def main(): + parser = argparse.ArgumentParser(description="SLO Kubernetes resources.") + subparsers = parser.add_subparsers(dest="command") + + # Sub-command for configure_clusterloader2 + parser_configure = subparsers.add_parser("configure", help="Override CL2 config file") + parser_configure.add_argument("--fortio-servers-per-deployment", type=int, required=True, help="Number of Fortio servers per deployment") + parser_configure.add_argument("--fortio-clients-per-deployment", type=int, required=True, help="Number of Fortio clients per deployment") + parser_configure.add_argument("--fortio-client-queries-per-second", type=int, required=True, help="Queries per second for each Fortio client pod. NOT queries per second per connection") + parser_configure.add_argument("--fortio-client-connections", type=int, required=True, help="Number of simultaneous connections for each Fortio client") + parser_configure.add_argument("--fortio-namespaces", type=int, required=True, help="Number of namespaces, each with their own service. Fortio clients query servers in the same namespace. Be wary of integer division causing less pods than expected regarding this parameter, pods, and pods per node.") + parser_configure.add_argument("--fortio-deployments-per-namespace", type=int, required=True, help="Number of Fortio server deployments (and number of client deployments) per service/partition. 
Be wary of integer division causing less pods than expected regarding this parameter, namespaces, pods, and pods per node.") + parser_configure.add_argument("--network-policies-per-namespace", type=int, help="Number of network policies to be created per namespace", default=0, nargs='?') + parser_configure.add_argument("--generate-retina-network-flow-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Retina Network Flow Logs (default=False)") + parser_configure.add_argument("--label_traffic_pods", type=str2bool, choices=[True, False], nargs='?', default=False, help="Add/Remove label to client traffic pods(default=False)") + parser_configure.add_argument("--cl2_override_file", type=str, help="Path to the overrides of CL2 config file") + + # Sub-command for execute_clusterloader2 + parser_execute = subparsers.add_parser("execute", help="Execute scale up operation") + parser_execute.add_argument("--cl2-image", type=str, required=True, help="Name of the CL2 image") + parser_execute.add_argument("--cl2-config-dir", type=str, required=True, help="Path to the CL2 config directory") + parser_execute.add_argument("--cl2-report-dir", type=str, required=True, help="Path to the CL2 report directory") + parser_execute.add_argument("--cl2-config-file", type=str, required=True, help="Path to the CL2 config file") + parser_execute.add_argument("--kubeconfig", type=str, required=True, help="Path to the kubeconfig file") + parser_execute.add_argument("--provider", type=str, required=True, help="Cloud provider name") + parser_execute.add_argument("--scrape-containerd", type=str2bool, choices=[True, False], default=False, + help="Whether to scrape containerd metrics. 
Must be either True or False") + + # Sub-command for collect_clusterloader2 + parser_collect = subparsers.add_parser("collect", help="Collect scale up data") + parser_collect.add_argument("--cl2_report_dir", type=str, help="Path to the CL2 report directory") + parser_collect.add_argument("--cloud_info", type=str, help="Cloud information") + parser_collect.add_argument("--run_id", type=str, help="Run ID") + parser_collect.add_argument("--run_url", type=str, help="Run URL") + parser_collect.add_argument("--result_file", type=str, help="Path to the result file") + parser_collect.add_argument("--test_type", type=str, nargs='?', default="default-config", + help="Description of test type") + parser_collect.add_argument("--start_timestamp", type=str, help="Test start timestamp") + parser_collect.add_argument("--observability_tool", type=str, help="Observability tool evaluated in the test") + parser_collect.add_argument("--repository", type=str, help="Repository of observability tool evaluated in the test") + parser_collect.add_argument("--repository_ref", type=str, help="Repository Ref (branch/tag/SHA) of observability tool evaluated in the test") + parser_collect.add_argument("--fortio-servers-per-deployment", type=int, required=True, help="Number of Fortio servers per deployment") + parser_collect.add_argument("--fortio-clients-per-deployment", type=int, required=True, help="Number of Fortio clients per deployment") + parser_collect.add_argument("--fortio-client-queries-per-second", type=int, required=True, help="Queries per second for each Fortio client pod. NOT queries per second per connection") + parser_collect.add_argument("--fortio-client-connections", type=int, required=True, help="Number of simultaneous connections for each Fortio client") + parser_collect.add_argument("--fortio-namespaces", type=int, required=True, help="Number of namespaces, each with their own service. Fortio clients query servers in the same namespace. 
Be wary of integer division causing less pods than expected regarding this parameter, pods, and pods per node.") + parser_collect.add_argument("--fortio-deployments-per-namespace", type=int, required=True, help="Number of Fortio server deployments (and number of client deployments) per service/partition. Be wary of integer division causing less pods than expected regarding this parameter, namespaces, pods, and pods per node.") + parser_collect.add_argument("--network-policies-per-namespace", type=int, help="Number of network policies to be created per namespace", default=0, nargs='?') + parser_collect.add_argument("--generate-retina-network-flow-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Retina Network Flow Logs (default=False)") + parser_collect.add_argument("--label_traffic_pods", type=str2bool, choices=[True, False], nargs='?', default=False, help="Add/Remove label to client traffic pods(default=False)") + parser_collect.add_argument("--trigger_reason", type=str, help="What triggered the test", nargs='?', default="") + + args = parser.parse_args() + + if args.command == "configure": + configure_clusterloader2(args.fortio_servers_per_deployment, + args.fortio_clients_per_deployment, + args.fortio_client_queries_per_second, + args.fortio_client_connections, + args.fortio_namespaces, + args.fortio_deployments_per_namespace, + args.network_policies_per_namespace, + args.generate_retina_network_flow_logs, + args.label_traffic_pods, + args.cl2_override_file, + ) + elif args.command == "execute": + execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.cl2_config_file, + args.kubeconfig, args.provider, args.scrape_containerd) + elif args.command == "collect": + collect_clusterloader2(args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, + args.result_file, args.test_type, args.start_timestamp, + args.observability_tool, + args.repository, + args.repository_ref, + 
args.fortio_servers_per_deployment, + args.fortio_clients_per_deployment, + args.fortio_client_queries_per_second, + args.fortio_client_connections, + args.fortio_namespaces, + args.fortio_deployments_per_namespace, + args.network_policies_per_namespace, + args.generate_retina_network_flow_logs, + args.label_traffic_pods, + args.trigger_reason, + ) + +if __name__ == "__main__": + main() diff --git a/modules/python/tests/mock_data/scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json b/modules/python/tests/mock_data/scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json new file mode 100644 index 0000000000..e37e687b64 --- /dev/null +++ b/modules/python/tests/mock_data/scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json @@ -0,0 +1,29 @@ +{ + "version": "v1", + "dataItems": [ + { + "labels": { + "Metric": "Perc99" + }, + "data": { + "value": 0.5 + } + }, + { + "labels": { + "Metric": "Perc90" + }, + "data": { + "value": 0.3 + } + }, + { + "labels": { + "Metric": "Perc50" + }, + "data": { + "value": 0.1 + } + } + ] +} diff --git a/modules/python/tests/mock_data/scale/report/junit.xml b/modules/python/tests/mock_data/scale/report/junit.xml new file mode 100644 index 0000000000..1b80a746d6 --- /dev/null +++ b/modules/python/tests/mock_data/scale/report/junit.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/modules/python/tests/test_scale.py b/modules/python/tests/test_scale.py new file mode 100644 index 0000000000..d3a6fb80a1 --- /dev/null +++ b/modules/python/tests/test_scale.py @@ -0,0 +1,284 @@ +import json +import os +import sys +import tempfile +import unittest +from unittest.mock import patch, MagicMock + +from clusterloader2.scale.scale import ( + configure_clusterloader2, + execute_clusterloader2, + collect_clusterloader2, + main, +) + + +class TestConfigureScale(unittest.TestCase): + """Test cases for configure_clusterloader2 function""" + + def 
test_basic_configuration(self): + """Test basic configuration with default parameters""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + + try: + configure_clusterloader2( + fortio_servers_per_deployment=15, + fortio_clients_per_deployment=15, + fortio_client_queries_per_second=1500, + fortio_client_connections=50, + fortio_namespaces=1, + fortio_deployments_per_namespace=1000, + network_policies_per_namespace=100, + generate_retina_network_flow_logs=False, + label_traffic_pods=False, + override_file=tmp_path, + ) + + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + + # Assert Prometheus config + self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) + self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0", content) + self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true", content) + self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true", content) + + # Assert Fortio config + self.assertIn("CL2_FORTIO_SERVERS_PER_DEPLOYMENT: 15", content) + self.assertIn("CL2_FORTIO_CLIENTS_PER_DEPLOYMENT: 15", content) + self.assertIn("CL2_FORTIO_CLIENT_QUERIES_PER_SECOND: 1500", content) + self.assertIn("CL2_FORTIO_CLIENT_CONNECTIONS: 50", content) + self.assertIn("CL2_FORTIO_NAMESPACES: 1", content) + self.assertIn("CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE: 1000", content) + + # Assert network policies and flags + self.assertIn("CL2_NETWORK_POLICIES_PER_NAMESPACE: 100", content) + self.assertIn("CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS: False", content) + self.assertIn("CL2_LABEL_TRAFFIC_PODS: False", content) + finally: + os.remove(tmp_path) + + def test_configuration_with_retina_flow_logs(self): + """Test configuration with Retina flow logs enabled""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + + try: + configure_clusterloader2( + fortio_servers_per_deployment=10, + fortio_clients_per_deployment=10, + 
fortio_client_queries_per_second=1000, + fortio_client_connections=25, + fortio_namespaces=5, + fortio_deployments_per_namespace=100, + network_policies_per_namespace=50, + generate_retina_network_flow_logs=True, + label_traffic_pods=True, + override_file=tmp_path, + ) + + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + + self.assertIn("CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS: True", content) + self.assertIn("CL2_LABEL_TRAFFIC_PODS: True", content) + finally: + os.remove(tmp_path) + + +class TestExecuteScale(unittest.TestCase): + """Test cases for execute_clusterloader2 function""" + + @patch("clusterloader2.scale.scale.run_cl2_command") + def test_execute_calls_run_cl2_command(self, mock_run_cl2): + """Test that execute_clusterloader2 calls run_cl2_command with correct params""" + execute_clusterloader2( + cl2_image="ghcr.io/azure/clusterloader2:v20250513", + cl2_config_dir="/path/to/config", + cl2_report_dir="/path/to/report", + cl2_config_file="config.yaml", + kubeconfig="/path/to/kubeconfig", + provider="aks", + scrape_containerd=False, + ) + + mock_run_cl2.assert_called_once_with( + "/path/to/kubeconfig", + "ghcr.io/azure/clusterloader2:v20250513", + "/path/to/config", + "/path/to/report", + "aks", + cl2_config_file="config.yaml", + overrides=True, + enable_prometheus=True, + scrape_containerd=False, + tear_down_prometheus=True, + scrape_kubelets=True, + scrape_ksm=True, + scrape_metrics_server=True, + ) + + +class TestCollectScale(unittest.TestCase): + """Test cases for collect_clusterloader2 function""" + + def test_collect_creates_result_file(self): + """Test that collect_clusterloader2 creates result file with correct structure""" + cl2_report_dir = os.path.join( + os.path.dirname(__file__), "mock_data", "scale", "report" + ) + result_file = tempfile.mktemp(suffix=".jsonl") + + try: + collect_clusterloader2( + cl2_report_dir=cl2_report_dir, + cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-123", + 
run_url="http://example.com/run123", + result_file=result_file, + test_type="unit-test", + start_timestamp="2025-03-04T05:00:00Z", + observability_tool="cnl", + repository="https://github.com/microsoft/retina", + repository_ref="main", + fortio_servers_per_deployment=15, + fortio_clients_per_deployment=15, + fortio_client_queries_per_second=1500, + fortio_client_connections=50, + fortio_namespaces=1, + fortio_deployments_per_namespace=1000, + network_policies_per_namespace=100, + generate_retina_network_flow_logs=True, + label_traffic_pods=False, + trigger_reason="Manual", + ) + + self.assertTrue(os.path.exists(result_file)) + with open(result_file, "r", encoding="utf-8") as f: + content = f.read() + + # Result should contain JSONL lines + self.assertTrue(len(content) > 0) + + # Parse the first line and verify structure + lines = content.strip().split("\n") + if lines and lines[0]: + result = json.loads(lines[0]) + self.assertEqual(result["status"], "success") + self.assertEqual(result["run_id"], "test-run-123") + self.assertEqual(result["test_type"], "unit-test") + self.assertEqual(result["observability_tool"], "cnl") + self.assertIn("test_details", result) + self.assertEqual(result["test_details"]["traffic_generator"], "fortio") + self.assertEqual(result["test_details"]["traffic_namespaces"], 1) + self.assertEqual(result["test_details"]["network_policies"], 100) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_calculates_traffic_pods(self): + """Test that traffic_pods is calculated correctly""" + cl2_report_dir = os.path.join( + os.path.dirname(__file__), "mock_data", "scale", "report" + ) + result_file = tempfile.mktemp(suffix=".jsonl") + + try: + # 5 namespaces * 10 deployments * (3 servers + 3 clients) = 300 pods + collect_clusterloader2( + cl2_report_dir=cl2_report_dir, + cloud_info=json.dumps({"cloud": "azure"}), + run_id="test-run", + run_url="http://example.com", + result_file=result_file, + test_type="unit-test", 
+ start_timestamp="2025-03-04T05:00:00Z", + observability_tool="cnl", + repository="", + repository_ref="", + fortio_servers_per_deployment=3, + fortio_clients_per_deployment=3, + fortio_client_queries_per_second=100, + fortio_client_connections=10, + fortio_namespaces=5, + fortio_deployments_per_namespace=10, + network_policies_per_namespace=0, + ) + + with open(result_file, "r", encoding="utf-8") as f: + content = f.read() + + lines = content.strip().split("\n") + if lines and lines[0]: + result = json.loads(lines[0]) + # 5 * 10 * (3 + 3) = 300 + self.assertEqual(result["test_details"]["traffic_pods"], 300) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestMainArgumentParsing(unittest.TestCase): + """Test cases for main() argument parsing""" + + @patch("clusterloader2.scale.scale.configure_clusterloader2") + def test_configure_command_parsing(self, mock_configure): + """Test that configure command parses arguments correctly""" + test_args = [ + "scale.py", + "configure", + "--fortio-servers-per-deployment", "15", + "--fortio-clients-per-deployment", "15", + "--fortio-client-queries-per-second", "1500", + "--fortio-client-connections", "50", + "--fortio-namespaces", "1", + "--fortio-deployments-per-namespace", "1000", + "--network-policies-per-namespace", "100", + "--generate-retina-network-flow-logs", "True", + "--label_traffic_pods", "False", + "--cl2_override_file", "/tmp/overrides.yaml", + ] + + with patch.object(sys, "argv", test_args): + main() + + mock_configure.assert_called_once_with( + 15, 15, 1500, 50, 1, 1000, 100, True, False, "/tmp/overrides.yaml" + ) + + @patch("clusterloader2.scale.scale.execute_clusterloader2") + def test_execute_command_parsing(self, mock_execute): + """Test that execute command parses arguments correctly""" + test_args = [ + "scale.py", + "execute", + "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513", + "--cl2-config-dir", "/path/to/config", + "--cl2-report-dir", "/path/to/report", + 
"--cl2-config-file", "config.yaml", + "--kubeconfig", "/path/to/kubeconfig", + "--provider", "aks", + "--scrape-containerd", "False", + ] + + with patch.object(sys, "argv", test_args): + main() + + mock_execute.assert_called_once_with( + "ghcr.io/azure/clusterloader2:v20250513", + "/path/to/config", + "/path/to/report", + "config.yaml", + "/path/to/kubeconfig", + "aks", + False, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml new file mode 100644 index 0000000000..ffd719e401 --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml @@ -0,0 +1,50 @@ +trigger: none + +schedules: + - cron: "0 3 * * *" + displayName: Daily 3am CNL observability test + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: azurecni-overlay-cilium-cnl + OWNER: aks + OBSERVABILITY_TOOL: cnl + +stages: + - stage: azure_eastus2 # format: <cloud>[_<region>]+ (e.g. 
azure_eastus2, aws_eastus_westus) + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml # must keep as is + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + topology: observability + matrix: + cnl: + traffic_deployment_count: 1000 + traffic_replica_count: 15 + network_policies_per_namespace: 1000 + cl2_config_file: config.yaml + # fortio variables + fortio_servers_per_deployment: 15 + fortio_clients_per_deployment: 15 + fortio_client_queries_per_second: 1500 + fortio_client_connections: 50 + fortio_namespaces: 1 + fortio_deployments_per_namespace: 1000 + generate_retina_network_flow_logs: true + label_traffic_pods: false + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..f7f9cdba16 --- /dev/null +++ b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars @@ -0,0 +1,85 @@ +scenario_type = "perf-eval" +scenario_name = "cnl-azurecni-overlay-cilium" +deletion_delay = "20h" +owner = "aks" + +aks_cli_config_list = [ + { + role = "slo" + aks_name = "telescope-acns-scale-test" + kubernetes_version = "1.33" + sku_tier = "Standard" + + optional_parameters = [ + { + name = "generate-ssh-keys" + value = "" + }, + { + name = "max-pods" + value = "250" + }, + { + name = "network-plugin" + value = "azure" + }, + { + name = "network-plugin-mode" + value = "overlay" + }, + { + name = "pod-cidr" + value = "192.168.0.0/16" + }, + { + name = "enable-acns" + value = "" + }, + { + name = "network-dataplane" + value = "cilium" + }, + { + name = "zones" + value = "1 2 3" + } + ] + + default_node_pool = { + name = "default" + 
node_count = 5 + auto_scaling_enabled = false + vm_size = "Standard_D4_v3" + zones = ["1", "2", "3"] + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D64_v3" + zones = ["1", "2", "3"] + optional_parameters = [ + { + name = "labels" + value = "prometheus=true" + } + ] + }, + { + name = "traffic" + node_count = 1000 + auto_scaling_enabled = false + max_pods = 250 + vm_size = "Standard_D4_v3" + zones = ["1", "2", "3"] + optional_parameters = [ + { + name = "labels" + value = "slo=true scale-test=true" + } + ] + } + ] + } +] diff --git a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json new file mode 100644 index 0000000000..ea27a572c6 --- /dev/null +++ b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} diff --git a/steps/engine/clusterloader2/scale/collect.yml b/steps/engine/clusterloader2/scale/collect.yml new file mode 100644 index 0000000000..7d5514dd3d --- /dev/null +++ b/steps/engine/clusterloader2/scale/collect.yml @@ -0,0 +1,47 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: region + type: string + +steps: +- template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml + parameters: + region: ${{ parameters.region }} +- script: | + set -x + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ + --cl2_report_dir ${CL2_REPORT_DIR:-0} \ + --cloud_info "${CLOUD_INFO:-""}" \ + --run_id $RUN_ID \ + --run_url $RUN_URL \ + --result_file $TEST_RESULTS_FILE \ + --start_timestamp $START_TIME \ + --observability_tool ${OBSERVABILITY_TOOL:-""} \ + --repository ${REPOSITORY:-""} \ + --repository_ref ${REPOSITORY_REF:-""} \ + --fortio-servers-per-deployment 
$FORTIO_SERVERS_PER_DEPLOYMENT \ + --fortio-clients-per-deployment $FORTIO_CLIENTS_PER_DEPLOYMENT \ + --fortio-client-queries-per-second $FORTIO_CLIENT_QUERIES_PER_SECOND \ + --fortio-client-connections $FORTIO_CLIENT_CONNECTIONS \ + --fortio-namespaces $FORTIO_NAMESPACES \ + --fortio-deployments-per-namespace $FORTIO_DEPLOYMENTS_PER_NAMESPACE \ + --network-policies-per-namespace $NETWORK_POLICIES_PER_NAMESPACE \ + --generate-retina-network-flow-logs ${GENERATE_RETINA_NETWORK_FLOW_LOGS:-False} \ + --label_traffic_pods ${LABEL_TRAFFIC_PODS:-False} \ + --trigger_reason ${TRIGGER_REASON:-""} + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + RUN_URL: $(RUN_URL) + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/scale.py + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/results + REPOSITORY: ${{ parameters.engine_input.repository }} + REPOSITORY_REF: ${{ parameters.engine_input.ref }} + displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/scale/execute.yml b/steps/engine/clusterloader2/scale/execute.yml new file mode 100644 index 0000000000..085b363ec1 --- /dev/null +++ b/steps/engine/clusterloader2/scale/execute.yml @@ -0,0 +1,107 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: {} + - name: region + type: string + - name: install + type: boolean + default: false + - name: repository + type: string + default: "" + - name: makeargs + type: string + default: "" + - name: ref + type: string + default: "main" + +steps: + - script: | + echo "Set the start time for test execution" + startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + echo "Start: $startTimestamp" + echo "##vso[task.setvariable variable=START_TIME]$startTimestamp" + displayName: set up timestamp variable + + - ${{ if parameters.install }}: + - script: | + set -eo pipefail + if [ -z "$REPOSITORY" ]; then + echo "##vso[task.logissue 
type=error]install=true but repository parameter is empty. Please provide a valid repository URL." + exit 1 + fi + if [ -z "$MAKEARGS" ]; then + echo "##vso[task.logissue type=error]install=true but makeargs parameter is empty. Please provide make target(s) (e.g., 'helm-install')." + exit 1 + fi + env: + REPOSITORY: ${{ parameters.repository }} + MAKEARGS: ${{ parameters.makeargs }} + displayName: Validate install parameters + + - script: | + set -eo pipefail + git clone ${{ parameters.repository }} retina --no-checkout + cd retina + git fetch --depth 1 origin ${{ parameters.ref }} + git checkout FETCH_HEAD + displayName: Clone Retina OSS Repo + + - script: | + set -eo pipefail + cd retina + make ${{ parameters.makeargs }} + displayName: Install Retina + + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ + --fortio-servers-per-deployment $FORTIO_SERVERS_PER_DEPLOYMENT \ + --fortio-clients-per-deployment $FORTIO_CLIENTS_PER_DEPLOYMENT \ + --fortio-client-queries-per-second $FORTIO_CLIENT_QUERIES_PER_SECOND \ + --fortio-client-connections $FORTIO_CLIENT_CONNECTIONS \ + --fortio-namespaces $FORTIO_NAMESPACES \ + --fortio-deployments-per-namespace $FORTIO_DEPLOYMENTS_PER_NAMESPACE \ + --network-policies-per-namespace $NETWORK_POLICIES_PER_NAMESPACE \ + --generate-retina-network-flow-logs ${GENERATE_RETINA_NETWORK_FLOW_LOGS:-False} \ + --label_traffic_pods ${LABEL_TRAFFIC_PODS:-False} \ + --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ + --cl2-image "${CL2_IMAGE}" \ + --cl2-config-dir "${CL2_CONFIG_DIR}" \ + --cl2-report-dir "${CL2_REPORT_DIR}" \ + --cl2-config-file "${CL2_CONFIG_FILE}" \ + --kubeconfig "${HOME}/.kube/config" \ + --provider "${CLOUD}" \ + --scrape-containerd ${SCRAPE_CONTAINERD:-False} + workingDirectory: modules/python + env: + ${{ if eq(parameters.cloud, 'azure') }}: + CLOUD: aks + ${{ else }}: + CLOUD: ${{ 
parameters.cloud }} + REGION: ${{ parameters.region }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/scale.py + CL2_IMAGE: ${{ parameters.engine_input.image }} + CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/config + CL2_CONFIG_FILE: config.yaml + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/results + displayName: "Run Benchmark" + + - ${{ if parameters.install }}: + - script: | + set -eo pipefail + if [ -d "retina" ]; then + cd retina + make helm-uninstall + else + echo "Retina directory does not exist, skipping uninstall" + fi + condition: always() + displayName: Uninstall Helm Chart diff --git a/steps/topology/observability/collect-clusterloader2.yml b/steps/topology/observability/collect-clusterloader2.yml new file mode 100644 index 0000000000..f390a6ab23 --- /dev/null +++ b/steps/topology/observability/collect-clusterloader2.yml @@ -0,0 +1,26 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/scale/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} +- script: | + if [ -n "$RUN_ID" ]; then + echo "RUN_ID already set: $RUN_ID" + else + run_id=$(Build.BuildId)-$(System.JobId) + echo "Run ID: $run_id" + echo "##vso[task.setvariable variable=RUN_ID]$run_id" + fi + displayName: "Set unique Run ID before publish" diff --git a/steps/topology/observability/execute-clusterloader2.yml b/steps/topology/observability/execute-clusterloader2.yml new file mode 100644 index 0000000000..0572362d99 --- /dev/null +++ b/steps/topology/observability/execute-clusterloader2.yml @@ -0,0 +1,25 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + 
default: {} + +steps: +- template: /steps/engine/clusterloader2/scale/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} + ${{ if eq(parameters.engine_input.install, true) }}: + install: true + ${{ if not(eq(parameters.engine_input.repository, '')) }}: + repository: ${{ parameters.engine_input.repository }} + ${{ if not(eq(parameters.engine_input.makeargs, '')) }}: + makeargs: ${{ parameters.engine_input.makeargs }} + ${{ if not(eq(parameters.engine_input.ref, '')) }}: + ref: ${{ parameters.engine_input.ref }} diff --git a/steps/topology/observability/validate-resources.yml b/steps/topology/observability/validate-resources.yml new file mode 100644 index 0000000000..19826ff7dc --- /dev/null +++ b/steps/topology/observability/validate-resources.yml @@ -0,0 +1,17 @@ +parameters: + - name: cloud + type: string + - name: engine + type: string + - name: regions + type: object + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: slo + region: ${{ parameters.regions[0] }} + - template: /steps/engine/clusterloader2/large-cluster/validate.yml + parameters: + desired_nodes: 1006 + validation_timeout_in_minutes: 60 From d11d8a3e4aefd16ca1b5b8135c46c5d7b0ac8573 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Thu, 12 Feb 2026 13:09:34 +0000 Subject: [PATCH 02/10] Fix test --- modules/python/clusterloader2/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index 8212b5ae7f..abba12219c 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -90,8 +90,9 @@ def get_measurement(file_path): group_name = file_name.split("_")[1] return file_prefix, group_name if file_name.startswith(PROM_QUERY_PREFIX): - group_name = file_name.split("_")[1] - measurement_name = 
file_name.split("_")[0][len(PROM_QUERY_PREFIX)+1:] + parts = file_name.split("_") + measurement_name = parts[1] # e.g., CiliumAvgCPUUsage + group_name = parts[2] # e.g., scale-test return measurement_name, group_name if file_name.startswith(JOB_LIFECYCLE_LATENCY_PREFIX): group_name = file_name.split("_")[1] From 85c2f34cae2c974b2cf3f0fc2889b481ab1392b4 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Thu, 12 Feb 2026 13:15:40 +0000 Subject: [PATCH 03/10] Fix lint! --- modules/python/tests/test_scale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/tests/test_scale.py b/modules/python/tests/test_scale.py index d3a6fb80a1..7ce3fadd70 100644 --- a/modules/python/tests/test_scale.py +++ b/modules/python/tests/test_scale.py @@ -3,7 +3,7 @@ import sys import tempfile import unittest -from unittest.mock import patch, MagicMock +from unittest.mock import patch from clusterloader2.scale.scale import ( configure_clusterloader2, From 78b09db7ac82e6be1b03c4fe06b3e2475d69beb9 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Fri, 13 Feb 2026 15:22:57 +0000 Subject: [PATCH 04/10] Update clusterloader2 scale config and observability pipeline --- .../clusterloader2/scale/config/config.yaml | 9 +- .../scale/config/modules/node-exporter.yaml | 2 +- ...kflowlog.yaml => containernetworklog.yaml} | 0 .../scale/config/modules/scale-test.yaml | 7 +- .../scale/config/modules/test-steps.yaml | 169 ++---------------- modules/python/clusterloader2/scale/scale.py | 16 +- modules/python/clusterloader2/utils.py | 9 +- modules/python/tests/test_scale.py | 16 +- .../CNI Benchmark/cnl-observability.yml | 2 +- steps/engine/clusterloader2/scale/collect.yml | 2 +- steps/engine/clusterloader2/scale/execute.yml | 2 +- steps/provision-resources.yml | 34 ++++ .../observability/collect-clusterloader2.yml | 10 +- .../observability/validate-resources.yml | 32 +++- 14 files changed, 111 insertions(+), 199 deletions(-) rename 
modules/python/clusterloader2/scale/config/modules/pfl/{retinanetworkflowlog.yaml => containernetworklog.yaml} (100%) diff --git a/modules/python/clusterloader2/scale/config/config.yaml b/modules/python/clusterloader2/scale/config/config.yaml index a5ae521044..946405b313 100644 --- a/modules/python/clusterloader2/scale/config/config.yaml +++ b/modules/python/clusterloader2/scale/config/config.yaml @@ -17,10 +17,8 @@ name: scale-test {{$fortioNamespaces := DefaultParam .CL2_FORTIO_NAMESPACES 1}} {{$fortioDeploymentsPerNamespace := DefaultParam .CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE 1}} -{{$labelTrafficPods := DefaultParam .CL2_LABEL_TRAFFIC_PODS false}} - -# Retina Network Flow Log config -{{$createRetinaNetworkFlowLogs := DefaultParam .CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS false}} +# Container Network Log config +{{$createContainerNetworkLogs := DefaultParam .CL2_GENERATE_CONTAINER_NETWORK_LOGS false}} namespace: @@ -71,8 +69,7 @@ steps: params: action: start group: {{$groupName}} - createRetinaNetworkFlowLogs: {{$createRetinaNetworkFlowLogs}} - labelTrafficPods: {{$labelTrafficPods}} + createContainerNetworkLogs: {{$createContainerNetworkLogs}} - module: path: /modules/hubble.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml index 5db6526c4c..7a8d20a43f 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml @@ -22,7 +22,7 @@ steps: apiVersion: apps/v1 kind: DaemonSet labelSelector: app.kubernetes.io/name = node-exporter - operationTimeout: 5m + operationTimeout: 10m - name: {{.actionName}} Node Exporter Service Monitor phases: - namespaceList: diff --git a/modules/python/clusterloader2/scale/config/modules/pfl/retinanetworkflowlog.yaml b/modules/python/clusterloader2/scale/config/modules/pfl/containernetworklog.yaml similarity index 100% rename from 
modules/python/clusterloader2/scale/config/modules/pfl/retinanetworkflowlog.yaml rename to modules/python/clusterloader2/scale/config/modules/pfl/containernetworklog.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/scale-test.yaml b/modules/python/clusterloader2/scale/config/modules/scale-test.yaml index d1c0f941bf..e639fd8c5d 100644 --- a/modules/python/clusterloader2/scale/config/modules/scale-test.yaml +++ b/modules/python/clusterloader2/scale/config/modules/scale-test.yaml @@ -20,8 +20,8 @@ name: scale-test # Network Policies {{$networkPoliciesPerNamespace := DefaultParam .CL2_NETWORK_POLICIES_PER_NAMESPACE 0}} -# Retina Network Flow Log config -{{$createRetinaNetworkFlowLogs := DefaultParam .createRetinaNetworkFlowLogs false}} +# Container Network Log config +{{$createContainerNetworkLogs := DefaultParam .createContainerNetworkLogs false}} steps: - module: @@ -66,10 +66,9 @@ steps: fortioClientReplicas: {{$fortioClientReplicas}} fortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} fortioClientConnections: {{$fortioClientConnections}} - createRetinaNetworkFlowLogs: {{$createRetinaNetworkFlowLogs}} + createContainerNetworkLogs: {{$createContainerNetworkLogs}} deploymentLabel: start networkPoliciesPerNamespace: {{$networkPoliciesPerNamespace}} - labelTrafficPods: {{.labelTrafficPods}} - module: path: /modules/measurements/control-plane.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/test-steps.yaml b/modules/python/clusterloader2/scale/config/modules/test-steps.yaml index 29628b6a42..dd1fb70422 100644 --- a/modules/python/clusterloader2/scale/config/modules/test-steps.yaml +++ b/modules/python/clusterloader2/scale/config/modules/test-steps.yaml @@ -10,21 +10,21 @@ {{$fortioClientReplicas := .fortioClientReplicas}} {{$fortioClientQueriesPerSecond := .fortioClientQueriesPerSecond}} {{$fortioClientConnections := .fortioClientConnections}} -{{$CpuRequest := .CpuRequest}} -{{$MemoryRequest := .MemoryRequest}} 
-{{$createRetinaNetworkFlowLogs := .createRetinaNetworkFlowLogs}} +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20Mi"}} +{{$createContainerNetworkLogs := .createContainerNetworkLogs}} steps: -{{if $createRetinaNetworkFlowLogs}} -- name: Create Retina Network Flow Log +{{if $createContainerNetworkLogs}} +- name: Create Container Network Log phases: - namespaceList: - "" replicasPerNamespace: 1 tuningSet: {{$tuningSet}} objectBundle: - - basename: retina-network-flow-log - objectTemplatePath: modules/pfl/retinanetworkflowlog.yaml + - basename: container-network-log + objectTemplatePath: modules/pfl/containernetworklog.yaml templateFillMap: Group: {{.Group}} namespaces: {{$namespaces}} @@ -130,56 +130,6 @@ steps: action: gather refreshInterval: 15s -{{if .labelTrafficPods}} - -- name: Starting measurement for 'label traffic clients' - measurements: - - Method: WaitForControlledPodsRunning - Instances: - - Identifier: WaitForClientPodsRunning - Params: - apiVersion: apps/v1 - kind: Deployment - labelSelector: role = load - Params: - action: start - checkIfPodsAreUpdated: true - labelSelector: group = {{.Group}} - operationTimeout: {{$operationTimeout}} - apiVersion: apps/v1 - -- name: 'add label to client pods in deployment' - phases: - - namespaceRange: - min: 1 - max: {{$namespaces}} - replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} - tuningSet: {{$tuningSet}} - objectBundle: - - basename: fortio-client - objectTemplatePath: modules/fortio/client-deployment.yaml - templateFillMap: - Replicas: {{$fortioClientReplicas}} - Group: {{.Group}} - deploymentLabel: {{.deploymentLabel}} - uniqueLabel: "uniqueLabel" - CpuRequest: {{$CpuRequest}} - MemoryRequest: {{$MemoryRequest}} - FortioClientConnections: {{$fortioClientConnections}} - FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} - FortioServerServiceBasename: fortio-server - -- name: Waiting for 'label traffic clients' to be completed - 
measurements: - - Method: WaitForControlledPodsRunning - Instances: - - Identifier: WaitForClientPodsRunning - Params: - action: gather - refreshInterval: 15s - -{{end}} - - name: 'create {{$networkPoliciesPerNamespace}} network policies per namespace' phases: - namespaceRange: @@ -191,101 +141,6 @@ steps: - basename: scale-test-policy objectTemplatePath: modules/networkpolicy-template.yaml -{{if .labelTrafficPods}} - -- name: Starting measurement for 'remove labels from traffic clients' - measurements: - - Method: WaitForControlledPodsRunning - Instances: - - Identifier: WaitForClientPodsRunning - Params: - apiVersion: apps/v1 - kind: Deployment - labelSelector: role = load - Params: - action: start - checkIfPodsAreUpdated: true - labelSelector: group = {{.Group}} - operationTimeout: {{$operationTimeout}} - apiVersion: apps/v1 - -- name: 'remove label from client pods in deployment' - phases: - - namespaceRange: - min: 1 - max: {{$namespaces}} - replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} - tuningSet: {{$tuningSet}} - objectBundle: - - basename: fortio-client - objectTemplatePath: modules/fortio/client-deployment.yaml - templateFillMap: - Replicas: {{$fortioClientReplicas}} - Group: {{.Group}} - deploymentLabel: {{.deploymentLabel}} - CpuRequest: {{$CpuRequest}} - MemoryRequest: {{$MemoryRequest}} - FortioClientConnections: {{$fortioClientConnections}} - FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} - FortioServerServiceBasename: fortio-server - -- name: Waiting for 'remove labels from traffic clients' to be completed - measurements: - - Method: WaitForControlledPodsRunning - Instances: - - Identifier: WaitForClientPodsRunning - Params: - action: gather - refreshInterval: 15s - -- name: Starting measurement for 're-add labels to traffic clients' - measurements: - - Method: WaitForControlledPodsRunning - Instances: - - Identifier: WaitForClientPodsRunning - Params: - apiVersion: apps/v1 - kind: Deployment - labelSelector: role = load 
- Params: - action: start - checkIfPodsAreUpdated: true - labelSelector: group = {{.Group}} - operationTimeout: {{$operationTimeout}} - apiVersion: apps/v1 - -- name: 're-add label to client pods in deployment' - phases: - - namespaceRange: - min: 1 - max: {{$namespaces}} - replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} - tuningSet: {{$tuningSet}} - objectBundle: - - basename: fortio-client - objectTemplatePath: modules/fortio/client-deployment.yaml - templateFillMap: - Replicas: {{$fortioClientReplicas}} - Group: {{.Group}} - deploymentLabel: {{.deploymentLabel}} - uniqueLabel: "uniqueLabel" - CpuRequest: {{$CpuRequest}} - MemoryRequest: {{$MemoryRequest}} - FortioClientConnections: {{$fortioClientConnections}} - FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} - FortioServerServiceBasename: fortio-server - -- name: Waiting for 're-add labels to traffic clients' to be completed - measurements: - - Method: WaitForControlledPodsRunning - Instances: - - Identifier: WaitForClientPodsRunning - Params: - action: gather - refreshInterval: 15s - -{{end}} - - name: Wait to get metrics measurements: - Identifier: Dummy @@ -353,17 +208,17 @@ steps: FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} FortioServerServiceBasename: fortio-server -{{if $createRetinaNetworkFlowLogs}} -# Delete retina network flow log -- name: Delete Retina Network Flow Log +{{if $createContainerNetworkLogs}} +# Delete container network log +- name: Delete Container Network Log phases: - namespaceList: - "" replicasPerNamespace: 0 tuningSet: {{$tuningSet}} objectBundle: - - basename: retina-network-flow-log - objectTemplatePath: modules/pfl/retinanetworkflowlog.yaml + - basename: container-network-log + objectTemplatePath: modules/pfl/containernetworklog.yaml templateFillMap: Group: {{.Group}} namespaces: {{$namespaces}} diff --git a/modules/python/clusterloader2/scale/scale.py b/modules/python/clusterloader2/scale/scale.py index ebc0a4a73d..b759bfeadb 100644 
--- a/modules/python/clusterloader2/scale/scale.py +++ b/modules/python/clusterloader2/scale/scale.py @@ -14,7 +14,7 @@ def configure_clusterloader2( fortio_namespaces, fortio_deployments_per_namespace, network_policies_per_namespace, - generate_retina_network_flow_logs, + generate_container_network_logs, label_traffic_pods, override_file): @@ -39,7 +39,7 @@ def configure_clusterloader2( file.write("CL2_FORTIO_POD_CPU: 10m\n") file.write("CL2_FORTIO_POD_MEMORY: 50Mi\n") file.write(f"CL2_NETWORK_POLICIES_PER_NAMESPACE: {network_policies_per_namespace}\n") - file.write(f"CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS: {generate_retina_network_flow_logs}\n") + file.write(f"CL2_GENERATE_CONTAINER_NETWORK_LOGS: {generate_container_network_logs}\n") with open(override_file, 'r', encoding='utf-8') as file: print(f"Content of file {override_file}:\n{file.read()}") @@ -78,7 +78,7 @@ def collect_clusterloader2( fortio_namespaces, fortio_deployments_per_namespace, network_policies_per_namespace, - generate_retina_network_flow_logs=False, + generate_container_network_logs=False, label_traffic_pods=False, trigger_reason="", ): @@ -112,7 +112,7 @@ def collect_clusterloader2( "traffic_clients_per_deployment": fortio_clients_per_deployment, "traffic_pods": fortio_namespaces * fortio_deployments_per_namespace * (fortio_clients_per_deployment + fortio_servers_per_deployment), "network_policies": network_policies_per_namespace, - "generate_retina_network_flow_logs": generate_retina_network_flow_logs, + "generate_container_network_logs": generate_container_network_logs, "label_traffic_pods": label_traffic_pods, "requests_per_second": fortio_client_queries_per_second, "details": testsuites[0]["testcases"][0].get("failure", None) if testsuites[0].get("testcases") else None, @@ -149,7 +149,7 @@ def main(): parser_configure.add_argument("--fortio-namespaces", type=int, required=True, help="Number of namespaces, each with their own service. Fortio clients query servers in the same namespace. 
Be wary of integer division causing less pods than expected regarding this parameter, pods, and pods per node.") parser_configure.add_argument("--fortio-deployments-per-namespace", type=int, required=True, help="Number of Fortio server deployments (and number of client deployments) per service/partition. Be wary of integer division causing less pods than expected regarding this parameter, namespaces, pods, and pods per node.") parser_configure.add_argument("--network-policies-per-namespace", type=int, help="Number of network policies to be created per namespace", default=0, nargs='?') - parser_configure.add_argument("--generate-retina-network-flow-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Retina Network Flow Logs (default=False)") + parser_configure.add_argument("--generate-container-network-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Container Network Logs (default=False)") parser_configure.add_argument("--label_traffic_pods", type=str2bool, choices=[True, False], nargs='?', default=False, help="Add/Remove label to client traffic pods(default=False)") parser_configure.add_argument("--cl2_override_file", type=str, help="Path to the overrides of CL2 config file") @@ -184,7 +184,7 @@ def main(): parser_collect.add_argument("--fortio-namespaces", type=int, required=True, help="Number of namespaces, each with their own service. Fortio clients query servers in the same namespace. Be wary of integer division causing less pods than expected regarding this parameter, pods, and pods per node.") parser_collect.add_argument("--fortio-deployments-per-namespace", type=int, required=True, help="Number of Fortio server deployments (and number of client deployments) per service/partition. 
Be wary of integer division causing less pods than expected regarding this parameter, namespaces, pods, and pods per node.") parser_collect.add_argument("--network-policies-per-namespace", type=int, help="Number of network policies to be created per namespace", default=0, nargs='?') - parser_collect.add_argument("--generate-retina-network-flow-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Retina Network Flow Logs (default=False)") + parser_collect.add_argument("--generate-container-network-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Container Network Logs (default=False)") parser_collect.add_argument("--label_traffic_pods", type=str2bool, choices=[True, False], nargs='?', default=False, help="Add/Remove label to client traffic pods(default=False)") parser_collect.add_argument("--trigger_reason", type=str, help="What triggered the test", nargs='?', default="") @@ -198,7 +198,7 @@ def main(): args.fortio_namespaces, args.fortio_deployments_per_namespace, args.network_policies_per_namespace, - args.generate_retina_network_flow_logs, + args.generate_container_network_logs, args.label_traffic_pods, args.cl2_override_file, ) @@ -218,7 +218,7 @@ def main(): args.fortio_namespaces, args.fortio_deployments_per_namespace, args.network_policies_per_namespace, - args.generate_retina_network_flow_logs, + args.generate_container_network_logs, args.label_traffic_pods, args.trigger_reason, ) diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index abba12219c..21055c42b0 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -91,8 +91,13 @@ def get_measurement(file_path): return file_prefix, group_name if file_name.startswith(PROM_QUERY_PREFIX): parts = file_name.split("_") - measurement_name = parts[1] # e.g., CiliumAvgCPUUsage - group_name = parts[2] # e.g., scale-test + if len(parts) >= 3: + measurement_name = 
parts[1] # e.g., CiliumAvgCPUUsage + group_name = parts[2] # e.g., scale-test + else: + # Fallback: measurement is in parts[1], no group available + measurement_name = parts[1] if len(parts) > 1 else None + group_name = None return measurement_name, group_name if file_name.startswith(JOB_LIFECYCLE_LATENCY_PREFIX): group_name = file_name.split("_")[1] diff --git a/modules/python/tests/test_scale.py b/modules/python/tests/test_scale.py index 7ce3fadd70..21a52d71f3 100644 --- a/modules/python/tests/test_scale.py +++ b/modules/python/tests/test_scale.py @@ -32,7 +32,7 @@ def test_basic_configuration(self): fortio_namespaces=1, fortio_deployments_per_namespace=1000, network_policies_per_namespace=100, - generate_retina_network_flow_logs=False, + generate_container_network_logs=False, label_traffic_pods=False, override_file=tmp_path, ) @@ -56,13 +56,13 @@ def test_basic_configuration(self): # Assert network policies and flags self.assertIn("CL2_NETWORK_POLICIES_PER_NAMESPACE: 100", content) - self.assertIn("CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS: False", content) + self.assertIn("CL2_GENERATE_CONTAINER_NETWORK_LOGS: False", content) self.assertIn("CL2_LABEL_TRAFFIC_PODS: False", content) finally: os.remove(tmp_path) - def test_configuration_with_retina_flow_logs(self): - """Test configuration with Retina flow logs enabled""" + def test_configuration_with_container_network_logs(self): + """Test configuration with Container Network Logs enabled""" with tempfile.NamedTemporaryFile( delete=False, mode="w+", encoding="utf-8" ) as tmp: @@ -77,7 +77,7 @@ def test_configuration_with_retina_flow_logs(self): fortio_namespaces=5, fortio_deployments_per_namespace=100, network_policies_per_namespace=50, - generate_retina_network_flow_logs=True, + generate_container_network_logs=True, label_traffic_pods=True, override_file=tmp_path, ) @@ -85,7 +85,7 @@ def test_configuration_with_retina_flow_logs(self): with open(tmp_path, "r", encoding="utf-8") as f: content = f.read() - 
self.assertIn("CL2_GENERATE_RETINA_NETWORK_FLOW_LOGS: True", content) + self.assertIn("CL2_GENERATE_CONTAINER_NETWORK_LOGS: True", content) self.assertIn("CL2_LABEL_TRAFFIC_PODS: True", content) finally: os.remove(tmp_path) @@ -153,7 +153,7 @@ def test_collect_creates_result_file(self): fortio_namespaces=1, fortio_deployments_per_namespace=1000, network_policies_per_namespace=100, - generate_retina_network_flow_logs=True, + generate_container_network_logs=True, label_traffic_pods=False, trigger_reason="Manual", ) @@ -239,7 +239,7 @@ def test_configure_command_parsing(self, mock_configure): "--fortio-namespaces", "1", "--fortio-deployments-per-namespace", "1000", "--network-policies-per-namespace", "100", - "--generate-retina-network-flow-logs", "True", + "--generate-container-network-logs", "True", "--label_traffic_pods", "False", "--cl2_override_file", "/tmp/overrides.yaml", ] diff --git a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml index ffd719e401..1b7d44c337 100644 --- a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml +++ b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml @@ -41,7 +41,7 @@ stages: fortio_client_connections: 50 fortio_namespaces: 1 fortio_deployments_per_namespace: 1000 - generate_retina_network_flow_logs: true + generate_container_network_logs: true label_traffic_pods: false trigger_reason: ${{ variables['Build.Reason'] }} max_parallel: 1 diff --git a/steps/engine/clusterloader2/scale/collect.yml b/steps/engine/clusterloader2/scale/collect.yml index 7d5514dd3d..845942dcb7 100644 --- a/steps/engine/clusterloader2/scale/collect.yml +++ b/steps/engine/clusterloader2/scale/collect.yml @@ -33,7 +33,7 @@ steps: --fortio-namespaces $FORTIO_NAMESPACES \ --fortio-deployments-per-namespace $FORTIO_DEPLOYMENTS_PER_NAMESPACE \ --network-policies-per-namespace $NETWORK_POLICIES_PER_NAMESPACE \ - --generate-retina-network-flow-logs ${GENERATE_RETINA_NETWORK_FLOW_LOGS:-False} 
\ + --generate-container-network-logs ${GENERATE_CONTAINER_NETWORK_LOGS:-False} \ --label_traffic_pods ${LABEL_TRAFFIC_PODS:-False} \ --trigger_reason ${TRIGGER_REASON:-""} workingDirectory: modules/python diff --git a/steps/engine/clusterloader2/scale/execute.yml b/steps/engine/clusterloader2/scale/execute.yml index 085b363ec1..43a903cda6 100644 --- a/steps/engine/clusterloader2/scale/execute.yml +++ b/steps/engine/clusterloader2/scale/execute.yml @@ -69,7 +69,7 @@ steps: --fortio-namespaces $FORTIO_NAMESPACES \ --fortio-deployments-per-namespace $FORTIO_DEPLOYMENTS_PER_NAMESPACE \ --network-policies-per-namespace $NETWORK_POLICIES_PER_NAMESPACE \ - --generate-retina-network-flow-logs ${GENERATE_RETINA_NETWORK_FLOW_LOGS:-False} \ + --generate-container-network-logs ${GENERATE_CONTAINER_NETWORK_LOGS:-False} \ --label_traffic_pods ${LABEL_TRAFFIC_PODS:-False} \ --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index 4496a50f28..9a68e9f82b 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -69,6 +69,40 @@ steps: env: region: ${{ parameters.regions[0] }} +- script: | + set -eo pipefail + echo "Checking AdvancedNetworkingFlowLogsPreview feature flag status..." + + feature_state=$(az feature show --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" --query "properties.state" -o tsv 2>/dev/null || echo "NotRegistered") + + if [ "$feature_state" == "Registered" ]; then + echo "✓ AdvancedNetworkingFlowLogsPreview feature flag is already registered." + else + echo "Feature flag state: $feature_state" + echo "Registering AdvancedNetworkingFlowLogsPreview feature flag..." + az feature register --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" + + echo "Waiting for feature flag registration to complete (this may take a few minutes)..." 
+ for i in {1..30}; do + sleep 20 + feature_state=$(az feature show --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" --query "properties.state" -o tsv) + echo "Attempt $i: Feature state is '$feature_state'" + if [ "$feature_state" == "Registered" ]; then + echo "✓ Feature flag registered successfully!" + break + fi + done + + if [ "$feature_state" != "Registered" ]; then + echo "##[warning]Feature flag not yet registered after waiting. Continuing anyway..." + fi + + echo "Refreshing Microsoft.ContainerService provider..." + az provider register --namespace "Microsoft.ContainerService" + fi + displayName: "Register AdvancedNetworkingFlowLogsPreview feature flag" + condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true')) + - template: /steps/terraform/run-command.yml parameters: command: version diff --git a/steps/topology/observability/collect-clusterloader2.yml b/steps/topology/observability/collect-clusterloader2.yml index f390a6ab23..58fffd9b3e 100644 --- a/steps/topology/observability/collect-clusterloader2.yml +++ b/steps/topology/observability/collect-clusterloader2.yml @@ -10,17 +10,9 @@ parameters: default: {} steps: +- template: /steps/set-run-id.yml - template: /steps/engine/clusterloader2/scale/collect.yml parameters: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} -- script: | - if [ -n "$RUN_ID" ]; then - echo "RUN_ID already set: $RUN_ID" - else - run_id=$(Build.BuildId)-$(System.JobId) - echo "Run ID: $run_id" - echo "##vso[task.setvariable variable=RUN_ID]$run_id" - fi - displayName: "Set unique Run ID before publish" diff --git a/steps/topology/observability/validate-resources.yml b/steps/topology/observability/validate-resources.yml index 19826ff7dc..b8564cb103 100644 --- a/steps/topology/observability/validate-resources.yml +++ b/steps/topology/observability/validate-resources.yml @@ -13,5 +13,35 @@ 
steps: region: ${{ parameters.regions[0] }} - template: /steps/engine/clusterloader2/large-cluster/validate.yml parameters: - desired_nodes: 1006 + desired_nodes: 16 validation_timeout_in_minutes: 60 + - script: | + set -eo pipefail + echo "Waiting for ContainerNetworkLog CRD to be available..." + timeout=300 # 5 minutes timeout + interval=10 + elapsed=0 + + # Debug: Show available acn.azure.com CRDs + echo "Checking for ACNS-related CRDs..." + kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" + + while ! kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do + if [ $elapsed -ge $timeout ]; then + echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." + echo "" + echo "The ContainerNetworkLog CRD requires the AdvancedNetworkingFlowLogsPreview feature flag" + echo "to be registered BEFORE cluster creation." + echo "" + echo "Available CRDs:" + kubectl get crd | grep -E "acn|cilium|network" || true + exit 1 + fi + echo "ContainerNetworkLog CRD not found yet. Retrying in ${interval}s... (${elapsed}s elapsed)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "ContainerNetworkLog CRD is available!" + kubectl get crd containernetworklogs.acn.azure.com + displayName: "Wait for ContainerNetworkLog CRD" From c3cb269162aaebb19e06f86258875c2e6a69a184 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Fri, 13 Feb 2026 15:28:25 +0000 Subject: [PATCH 05/10] Fix trailing spaces lint errors --- steps/provision-resources.yml | 10 +++++----- steps/topology/observability/validate-resources.yml | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index 9a68e9f82b..bee946aba3 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -72,16 +72,16 @@ steps: - script: | set -eo pipefail echo "Checking AdvancedNetworkingFlowLogsPreview feature flag status..." 
- + feature_state=$(az feature show --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" --query "properties.state" -o tsv 2>/dev/null || echo "NotRegistered") - + if [ "$feature_state" == "Registered" ]; then echo "✓ AdvancedNetworkingFlowLogsPreview feature flag is already registered." else echo "Feature flag state: $feature_state" echo "Registering AdvancedNetworkingFlowLogsPreview feature flag..." az feature register --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" - + echo "Waiting for feature flag registration to complete (this may take a few minutes)..." for i in {1..30}; do sleep 20 @@ -92,11 +92,11 @@ steps: break fi done - + if [ "$feature_state" != "Registered" ]; then echo "##[warning]Feature flag not yet registered after waiting. Continuing anyway..." fi - + echo "Refreshing Microsoft.ContainerService provider..." az provider register --namespace "Microsoft.ContainerService" fi diff --git a/steps/topology/observability/validate-resources.yml b/steps/topology/observability/validate-resources.yml index b8564cb103..f4fadb4298 100644 --- a/steps/topology/observability/validate-resources.yml +++ b/steps/topology/observability/validate-resources.yml @@ -21,11 +21,11 @@ steps: timeout=300 # 5 minutes timeout interval=10 elapsed=0 - + # Debug: Show available acn.azure.com CRDs echo "Checking for ACNS-related CRDs..." kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" - + while ! kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do if [ $elapsed -ge $timeout ]; then echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." @@ -41,7 +41,7 @@ steps: sleep $interval elapsed=$((elapsed + interval)) done - + echo "ContainerNetworkLog CRD is available!" 
kubectl get crd containernetworklogs.acn.azure.com displayName: "Wait for ContainerNetworkLog CRD" From 4a8c385e28a9c07d141193ee6e0c24dbdb6de8d7 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Tue, 3 Mar 2026 14:09:38 +0000 Subject: [PATCH 06/10] Fixes --- .../scale/config/modules/node-exporter.yaml | 37 ++++++++++++------- .../modules/node-exporter/clusterrole.yaml | 2 +- .../node-exporter/clusterrolebinding.yaml | 6 +-- .../modules/node-exporter/daemonset.yaml | 9 +++-- .../modules/node-exporter/networkpolicy.yaml | 3 +- .../config/modules/node-exporter/service.yaml | 3 +- .../modules/node-exporter/serviceaccount.yaml | 3 +- .../modules/node-exporter/servicemonitor.yaml | 2 +- modules/python/clusterloader2/scale/scale.py | 2 + modules/python/clusterloader2/utils.py | 30 +++++++++++---- modules/python/tests/test_scale.py | 2 + modules/terraform/azure/aks-cli/main.tf | 2 +- .../CNI Benchmark/cnl-observability.yml | 2 +- .../terraform-inputs/azure.tfvars | 25 ++++++++++--- steps/provision-resources.yml | 10 ++--- .../observability/validate-resources.yml | 6 +-- 16 files changed, 94 insertions(+), 50 deletions(-) diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml index 7a8d20a43f..82e6a634c8 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml @@ -4,6 +4,10 @@ {{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} # interval {{$interval := DefaultParam .interval "15s"}} +# operation timeout +{{$operationTimeout := DefaultParam .CL2_NODE_EXPORTER_OPERATION_TIMEOUT "30m"}} +# enable validation +{{$enableValidation := DefaultParam .CL2_NODE_EXPORTER_ENABLE_VALIDATION true}} {{ $replicasPerNamespace := 1 }} {{if eq .actionName "create"}} @@ -13,6 +17,7 @@ {{end}} steps: +{{if $enableValidation}} - name: Start measurements measurements: - Identifier: 
WaitForNodeExporterPodsRunning @@ -22,23 +27,15 @@ steps: apiVersion: apps/v1 kind: DaemonSet labelSelector: app.kubernetes.io/name = node-exporter - operationTimeout: 10m - - name: {{.actionName}} Node Exporter Service Monitor - phases: - - namespaceList: - - "monitoring" - replicasPerNamespace: {{$replicasPerNamespace}} - tuningSet: {{$tuningSet}} - objectBundle: - - objectTemplatePath: "modules/node-exporter/servicemonitor.yaml" - basename: node-exporter - interval: 15s + operationTimeout: {{$operationTimeout}} + checkIfPodsAreUpdated: false +{{end}} - name: {{.actionName}} Node Exporter Service Account phases: - namespaceList: - "monitoring" replicasPerNamespace: {{$replicasPerNamespace}} - tuningSet: {{$tuningSet}} + tuningSet: Sequence objectBundle: - objectTemplatePath: "modules/node-exporter/serviceaccount.yaml" basename: node-exporter @@ -47,7 +44,7 @@ steps: - namespaceList: - "" replicasPerNamespace: {{$replicasPerNamespace}} - tuningSet: {{$tuningSet}} + tuningSet: Sequence objectBundle: - objectTemplatePath: "modules/node-exporter/clusterrole.yaml" basename: node-exporter @@ -56,10 +53,20 @@ steps: - namespaceList: - "" replicasPerNamespace: {{$replicasPerNamespace}} - tuningSet: {{$tuningSet}} + tuningSet: Sequence objectBundle: - objectTemplatePath: "modules/node-exporter/clusterrolebinding.yaml" basename: node-exporter + - name: {{.actionName}} Node Exporter Service Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/servicemonitor.yaml" + basename: node-exporter + interval: 15s - name: {{.actionName}} Node Exporter Daemonset phases: - namespaceList: @@ -87,9 +94,11 @@ steps: objectBundle: - objectTemplatePath: "modules/node-exporter/service.yaml" basename: node-exporter +{{if $enableValidation}} - name: Wait for pods to be running measurements: - Identifier: WaitForNodeExporterPodsRunning Method: 
WaitForControlledPodsRunning Params: action: gather +{{end}} diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml index 2b4003ad4d..9248447173 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: node-exporter + name: {{.Name}} rules: - apiGroups: - authentication.k8s.io diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml index a3efb6f78e..6763097793 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml @@ -5,12 +5,12 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: node-exporter + name: {{.Name}} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: node-exporter + name: {{.Name}} subjects: - kind: ServiceAccount - name: node-exporter + name: {{.Name}} namespace: monitoring diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml index d7a952b55b..24986d608a 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml @@ -5,8 +5,10 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: 
node-exporter + name: {{.Name}} + namespace: monitoring spec: + minReadySeconds: 0 selector: matchLabels: app.kubernetes.io/component: exporter @@ -71,7 +73,6 @@ spec: name: kube-rbac-proxy ports: - containerPort: 9100 - hostPort: 9100 name: https resources: limits: @@ -92,7 +93,7 @@ spec: seccompProfile: type: RuntimeDefault # hostNetwork and hostPID required for node-exporter collectors to access host-level - # metrics (network stats, process info). hostPort 9100 used by kube-rbac-proxy. + # metrics (network stats, process info). Port 9100 exposed via Service for Prometheus scraping. hostNetwork: true hostPID: true nodeSelector: @@ -102,7 +103,7 @@ spec: runAsGroup: 65534 runAsNonRoot: true runAsUser: 65534 - serviceAccountName: node-exporter + serviceAccountName: {{.Name}} tolerations: - operator: Exists volumes: diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml index 00f7859945..83d111067e 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml @@ -5,7 +5,8 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: node-exporter + name: {{.Name}} + namespace: monitoring spec: egress: - {} diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml index 625c50a714..0dbedc0cc6 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml @@ -5,7 +5,8 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: node-exporter + name: {{.Name}} + 
namespace: monitoring spec: clusterIP: None ports: diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml index e98c9208ba..7b3093f022 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml @@ -5,5 +5,6 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: node-exporter + name: {{.Name}} + namespace: monitoring diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml b/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml index 4578d39c33..a6d9c075a7 100644 --- a/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml +++ b/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml @@ -7,7 +7,7 @@ metadata: app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter app.kubernetes.io/version: 1.9.1 - name: node-exporter + name: {{.Name}} namespace: monitoring spec: endpoints: diff --git a/modules/python/clusterloader2/scale/scale.py b/modules/python/clusterloader2/scale/scale.py index b759bfeadb..3363733773 100644 --- a/modules/python/clusterloader2/scale/scale.py +++ b/modules/python/clusterloader2/scale/scale.py @@ -27,6 +27,8 @@ def configure_clusterloader2( file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n") file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n") file.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n") + file.write("CL2_NODE_EXPORTER_OPERATION_TIMEOUT: 60m\n") + file.write("CL2_NODE_EXPORTER_ENABLE_VALIDATION: false\n") file.write(f"CL2_LABEL_TRAFFIC_PODS: {label_traffic_pods}\n") # topology config diff --git 
a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index 21055c42b0..144cdba049 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -90,15 +90,29 @@ def get_measurement(file_path): group_name = file_name.split("_")[1] return file_prefix, group_name if file_name.startswith(PROM_QUERY_PREFIX): - parts = file_name.split("_") - if len(parts) >= 3: - measurement_name = parts[1] # e.g., CiliumAvgCPUUsage - group_name = parts[2] # e.g., scale-test + # Remove "GenericPrometheusQuery " or "GenericPrometheusQuery_" prefix + if file_name.startswith(PROM_QUERY_PREFIX + " "): + remainder = file_name[len(PROM_QUERY_PREFIX) + 1:] + elif file_name.startswith(PROM_QUERY_PREFIX + "_"): + remainder = file_name[len(PROM_QUERY_PREFIX) + 1:] else: - # Fallback: measurement is in parts[1], no group available - measurement_name = parts[1] if len(parts) > 1 else None - group_name = None - return measurement_name, group_name + return None, None + + # Format: __.json + # Split on underscore to extract parts + parts = remainder.split("_") + if len(parts) >= 2: + # Find where the group starts (it's the part before the timestamp) + # Timestamp format: 2026-02-25T13:51:31Z.json (contains 'T' and 'Z') + for i in range(len(parts) - 1, 0, -1): + if 'T' in parts[i]: + # Found timestamp, group is parts[i-1] + group_name = parts[i - 1] + # Measurement is everything before the group + measurement_name = "_".join(parts[:i - 1]).rstrip("_") + return measurement_name, group_name + + return None, None if file_name.startswith(JOB_LIFECYCLE_LATENCY_PREFIX): group_name = file_name.split("_")[1] return JOB_LIFECYCLE_LATENCY_PREFIX, group_name diff --git a/modules/python/tests/test_scale.py b/modules/python/tests/test_scale.py index 21a52d71f3..b9765cc720 100644 --- a/modules/python/tests/test_scale.py +++ b/modules/python/tests/test_scale.py @@ -45,6 +45,8 @@ def test_basic_configuration(self): 
self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0", content) self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true", content) self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true", content) + self.assertIn("CL2_NODE_EXPORTER_OPERATION_TIMEOUT: 60m", content) + self.assertIn("CL2_NODE_EXPORTER_ENABLE_VALIDATION: false", content) # Assert Fortio config self.assertIn("CL2_FORTIO_SERVERS_PER_DEPLOYMENT: 15", content) diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf index 1d6a8d637c..94b0e489bb 100644 --- a/modules/terraform/azure/aks-cli/main.tf +++ b/modules/terraform/azure/aks-cli/main.tf @@ -222,7 +222,7 @@ resource "terraform_data" "enable_aks_cli_preview_extension" { EOT ) : ( <[_]+ (e.g. azure_eastus2, aws_eastus_westus) diff --git a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars index f7f9cdba16..4fcb82e1f4 100644 --- a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars @@ -5,10 +5,11 @@ owner = "aks" aks_cli_config_list = [ { - role = "slo" - aks_name = "telescope-acns-scale-test" - kubernetes_version = "1.33" - sku_tier = "Standard" + role = "slo" + aks_name = "telescope-acns-scale-test" + kubernetes_version = "1.34" + sku_tier = "Standard" + use_aks_preview_cli_extension = true optional_parameters = [ { @@ -29,12 +30,24 @@ aks_cli_config_list = [ }, { name = "pod-cidr" - value = "192.168.0.0/16" + value = "100.64.0.0/10" }, { name = "enable-acns" value = "" }, + { + name = "enable-container-network-logs" + value = "" + }, + { + name = "enable-addons" + value = "monitoring" + }, + { + name = "enable-high-log-scale-mode" + value = "" + }, { name = "network-dataplane" value = "cilium" @@ -68,7 +81,7 @@ aks_cli_config_list = [ }, { name = "traffic" - node_count = 1000 + node_count = 10 
auto_scaling_enabled = false max_pods = 250 vm_size = "Standard_D4_v3" diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index bee946aba3..9a68e9f82b 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -72,16 +72,16 @@ steps: - script: | set -eo pipefail echo "Checking AdvancedNetworkingFlowLogsPreview feature flag status..." - + feature_state=$(az feature show --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" --query "properties.state" -o tsv 2>/dev/null || echo "NotRegistered") - + if [ "$feature_state" == "Registered" ]; then echo "✓ AdvancedNetworkingFlowLogsPreview feature flag is already registered." else echo "Feature flag state: $feature_state" echo "Registering AdvancedNetworkingFlowLogsPreview feature flag..." az feature register --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" - + echo "Waiting for feature flag registration to complete (this may take a few minutes)..." for i in {1..30}; do sleep 20 @@ -92,11 +92,11 @@ steps: break fi done - + if [ "$feature_state" != "Registered" ]; then echo "##[warning]Feature flag not yet registered after waiting. Continuing anyway..." fi - + echo "Refreshing Microsoft.ContainerService provider..." az provider register --namespace "Microsoft.ContainerService" fi diff --git a/steps/topology/observability/validate-resources.yml b/steps/topology/observability/validate-resources.yml index f4fadb4298..b8564cb103 100644 --- a/steps/topology/observability/validate-resources.yml +++ b/steps/topology/observability/validate-resources.yml @@ -21,11 +21,11 @@ steps: timeout=300 # 5 minutes timeout interval=10 elapsed=0 - + # Debug: Show available acn.azure.com CRDs echo "Checking for ACNS-related CRDs..." kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" - + while ! 
kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do if [ $elapsed -ge $timeout ]; then echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." @@ -41,7 +41,7 @@ steps: sleep $interval elapsed=$((elapsed + interval)) done - + echo "ContainerNetworkLog CRD is available!" kubectl get crd containernetworklogs.acn.azure.com displayName: "Wait for ContainerNetworkLog CRD" From f84cf657f0cb3a7acc32c6515c0efca1ebf8f474 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Wed, 4 Mar 2026 10:41:40 +0000 Subject: [PATCH 07/10] PR comments --- modules/python/clusterloader2/utils.py | 4 +-- .../terraform-inputs/azure.tfvars | 2 +- steps/provision-resources.yml | 34 ------------------- .../observability/validate-resources.yml | 6 ++-- 4 files changed, 6 insertions(+), 40 deletions(-) diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index 144cdba049..50deb2ed85 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -97,7 +97,7 @@ def get_measurement(file_path): remainder = file_name[len(PROM_QUERY_PREFIX) + 1:] else: return None, None - + # Format: __.json # Split on underscore to extract parts parts = remainder.split("_") @@ -111,7 +111,7 @@ def get_measurement(file_path): # Measurement is everything before the group measurement_name = "_".join(parts[:i - 1]).rstrip("_") return measurement_name, group_name - + return None, None if file_name.startswith(JOB_LIFECYCLE_LATENCY_PREFIX): group_name = file_name.split("_")[1] diff --git a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars index 4fcb82e1f4..1bdc0d6f97 100644 --- a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars @@ -81,7 +81,7 @@ aks_cli_config_list = [ }, { name = "traffic" - 
node_count = 10 + node_count = 1000 auto_scaling_enabled = false max_pods = 250 vm_size = "Standard_D4_v3" diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index 9a68e9f82b..4496a50f28 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -69,40 +69,6 @@ steps: env: region: ${{ parameters.regions[0] }} -- script: | - set -eo pipefail - echo "Checking AdvancedNetworkingFlowLogsPreview feature flag status..." - - feature_state=$(az feature show --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" --query "properties.state" -o tsv 2>/dev/null || echo "NotRegistered") - - if [ "$feature_state" == "Registered" ]; then - echo "✓ AdvancedNetworkingFlowLogsPreview feature flag is already registered." - else - echo "Feature flag state: $feature_state" - echo "Registering AdvancedNetworkingFlowLogsPreview feature flag..." - az feature register --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" - - echo "Waiting for feature flag registration to complete (this may take a few minutes)..." - for i in {1..30}; do - sleep 20 - feature_state=$(az feature show --namespace "Microsoft.ContainerService" --name "AdvancedNetworkingFlowLogsPreview" --query "properties.state" -o tsv) - echo "Attempt $i: Feature state is '$feature_state'" - if [ "$feature_state" == "Registered" ]; then - echo "✓ Feature flag registered successfully!" - break - fi - done - - if [ "$feature_state" != "Registered" ]; then - echo "##[warning]Feature flag not yet registered after waiting. Continuing anyway..." - fi - - echo "Refreshing Microsoft.ContainerService provider..." 
- az provider register --namespace "Microsoft.ContainerService" - fi - displayName: "Register AdvancedNetworkingFlowLogsPreview feature flag" - condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true')) - - template: /steps/terraform/run-command.yml parameters: command: version diff --git a/steps/topology/observability/validate-resources.yml b/steps/topology/observability/validate-resources.yml index b8564cb103..f4fadb4298 100644 --- a/steps/topology/observability/validate-resources.yml +++ b/steps/topology/observability/validate-resources.yml @@ -21,11 +21,11 @@ steps: timeout=300 # 5 minutes timeout interval=10 elapsed=0 - + # Debug: Show available acn.azure.com CRDs echo "Checking for ACNS-related CRDs..." kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" - + while ! kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do if [ $elapsed -ge $timeout ]; then echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." @@ -41,7 +41,7 @@ steps: sleep $interval elapsed=$((elapsed + interval)) done - + echo "ContainerNetworkLog CRD is available!" 
kubectl get crd containernetworklogs.acn.azure.com displayName: "Wait for ContainerNetworkLog CRD" From ec52687a678c97b4958bfc4f6836ce5c985c4fd7 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Thu, 5 Mar 2026 14:09:33 +0000 Subject: [PATCH 08/10] Renaming modules + topologies --- .../config/config.yaml | 0 .../config/modules/ama-logs.yaml | 0 .../config/modules/ama-logs/podmonitor.yaml | 0 .../modules/fortio/client-deployment.yaml | 0 .../modules/fortio/server-deployment.yaml | 0 .../config/modules/fortio/service.yaml | 0 .../config/modules/hubble.yaml | 0 .../config/modules/hubble/podmonitor.yaml | 0 .../config/modules/measurements/ama-logs.yaml | 0 .../config/modules/measurements/cilium.yaml | 0 .../modules/measurements/control-plane.yaml | 0 .../modules/measurements/node-disk.yaml | 0 .../config/modules/measurements/retina.yaml | 0 .../modules/networkpolicy-template.yaml | 0 .../config/modules/node-exporter.yaml | 0 .../modules/node-exporter/clusterrole.yaml | 0 .../node-exporter/clusterrolebinding.yaml | 0 .../modules/node-exporter/daemonset.yaml | 0 .../modules/node-exporter/networkpolicy.yaml | 0 .../config/modules/node-exporter/service.yaml | 0 .../modules/node-exporter/serviceaccount.yaml | 0 .../modules/node-exporter/servicemonitor.yaml | 0 .../modules/pfl/containernetworklog.yaml | 0 .../config/modules/scale-test.yaml | 0 .../config/modules/test-steps.yaml | 0 .../{scale => network-scale}/scale.py | 0 ...Usage_scale-test_2025-03-04T05:35:56Z.json | 0 .../{scale => network-scale}/report/junit.xml | 0 .../{test_scale.py => test_network_scale.py} | 44 ++++++++++++------- .../CNI Benchmark/cnl-observability.yml | 2 +- .../{scale => network-scale}/collect.yml | 4 +- .../{scale => network-scale}/execute.yml | 6 +-- .../collect-clusterloader2.yml | 2 +- .../execute-clusterloader2.yml | 2 +- .../validate-resources.yml | 6 +-- 35 files changed, 40 insertions(+), 26 deletions(-) rename modules/python/clusterloader2/{scale => network-scale}/config/config.yaml 
(100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/ama-logs.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/ama-logs/podmonitor.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/fortio/client-deployment.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/fortio/server-deployment.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/fortio/service.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/hubble.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/hubble/podmonitor.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/measurements/ama-logs.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/measurements/cilium.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/measurements/control-plane.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/measurements/node-disk.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/measurements/retina.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/networkpolicy-template.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/clusterrole.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/clusterrolebinding.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/daemonset.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/networkpolicy.yaml (100%) rename 
modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/service.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/serviceaccount.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/node-exporter/servicemonitor.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/pfl/containernetworklog.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/scale-test.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/config/modules/test-steps.yaml (100%) rename modules/python/clusterloader2/{scale => network-scale}/scale.py (100%) rename modules/python/tests/mock_data/{scale => network-scale}/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json (100%) rename modules/python/tests/mock_data/{scale => network-scale}/report/junit.xml (100%) rename modules/python/tests/{test_scale.py => test_network_scale.py} (88%) rename steps/engine/clusterloader2/{scale => network-scale}/collect.yml (96%) rename steps/engine/clusterloader2/{scale => network-scale}/execute.yml (97%) rename steps/topology/{observability => network-scale}/collect-clusterloader2.yml (83%) rename steps/topology/{observability => network-scale}/execute-clusterloader2.yml (91%) rename steps/topology/{observability => network-scale}/validate-resources.yml (98%) diff --git a/modules/python/clusterloader2/scale/config/config.yaml b/modules/python/clusterloader2/network-scale/config/config.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/config.yaml rename to modules/python/clusterloader2/network-scale/config/config.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/ama-logs.yaml b/modules/python/clusterloader2/network-scale/config/modules/ama-logs.yaml similarity index 100% rename from 
modules/python/clusterloader2/scale/config/modules/ama-logs.yaml rename to modules/python/clusterloader2/network-scale/config/modules/ama-logs.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/ama-logs/podmonitor.yaml b/modules/python/clusterloader2/network-scale/config/modules/ama-logs/podmonitor.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/ama-logs/podmonitor.yaml rename to modules/python/clusterloader2/network-scale/config/modules/ama-logs/podmonitor.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/fortio/client-deployment.yaml b/modules/python/clusterloader2/network-scale/config/modules/fortio/client-deployment.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/fortio/client-deployment.yaml rename to modules/python/clusterloader2/network-scale/config/modules/fortio/client-deployment.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/fortio/server-deployment.yaml b/modules/python/clusterloader2/network-scale/config/modules/fortio/server-deployment.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/fortio/server-deployment.yaml rename to modules/python/clusterloader2/network-scale/config/modules/fortio/server-deployment.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/fortio/service.yaml b/modules/python/clusterloader2/network-scale/config/modules/fortio/service.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/fortio/service.yaml rename to modules/python/clusterloader2/network-scale/config/modules/fortio/service.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/hubble.yaml b/modules/python/clusterloader2/network-scale/config/modules/hubble.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/hubble.yaml rename to 
modules/python/clusterloader2/network-scale/config/modules/hubble.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/hubble/podmonitor.yaml b/modules/python/clusterloader2/network-scale/config/modules/hubble/podmonitor.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/hubble/podmonitor.yaml rename to modules/python/clusterloader2/network-scale/config/modules/hubble/podmonitor.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/ama-logs.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/ama-logs.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/measurements/ama-logs.yaml rename to modules/python/clusterloader2/network-scale/config/modules/measurements/ama-logs.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/cilium.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/cilium.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/measurements/cilium.yaml rename to modules/python/clusterloader2/network-scale/config/modules/measurements/cilium.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/control-plane.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/measurements/control-plane.yaml rename to modules/python/clusterloader2/network-scale/config/modules/measurements/control-plane.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/node-disk.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/node-disk.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/measurements/node-disk.yaml rename to 
modules/python/clusterloader2/network-scale/config/modules/measurements/node-disk.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/measurements/retina.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/retina.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/measurements/retina.yaml rename to modules/python/clusterloader2/network-scale/config/modules/measurements/retina.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/networkpolicy-template.yaml b/modules/python/clusterloader2/network-scale/config/modules/networkpolicy-template.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/networkpolicy-template.yaml rename to modules/python/clusterloader2/network-scale/config/modules/networkpolicy-template.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrole.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrole.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrole.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrolebinding.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter/clusterrolebinding.yaml rename to 
modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrolebinding.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/daemonset.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter/daemonset.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter/daemonset.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/networkpolicy.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter/networkpolicy.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter/networkpolicy.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/service.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter/service.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter/service.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/serviceaccount.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/node-exporter/serviceaccount.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter/serviceaccount.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/servicemonitor.yaml similarity index 100% rename from 
modules/python/clusterloader2/scale/config/modules/node-exporter/servicemonitor.yaml rename to modules/python/clusterloader2/network-scale/config/modules/node-exporter/servicemonitor.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/pfl/containernetworklog.yaml b/modules/python/clusterloader2/network-scale/config/modules/pfl/containernetworklog.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/pfl/containernetworklog.yaml rename to modules/python/clusterloader2/network-scale/config/modules/pfl/containernetworklog.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/scale-test.yaml b/modules/python/clusterloader2/network-scale/config/modules/scale-test.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/scale-test.yaml rename to modules/python/clusterloader2/network-scale/config/modules/scale-test.yaml diff --git a/modules/python/clusterloader2/scale/config/modules/test-steps.yaml b/modules/python/clusterloader2/network-scale/config/modules/test-steps.yaml similarity index 100% rename from modules/python/clusterloader2/scale/config/modules/test-steps.yaml rename to modules/python/clusterloader2/network-scale/config/modules/test-steps.yaml diff --git a/modules/python/clusterloader2/scale/scale.py b/modules/python/clusterloader2/network-scale/scale.py similarity index 100% rename from modules/python/clusterloader2/scale/scale.py rename to modules/python/clusterloader2/network-scale/scale.py diff --git a/modules/python/tests/mock_data/scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json b/modules/python/tests/mock_data/network-scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json similarity index 100% rename from modules/python/tests/mock_data/scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json rename to 
modules/python/tests/mock_data/network-scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json diff --git a/modules/python/tests/mock_data/scale/report/junit.xml b/modules/python/tests/mock_data/network-scale/report/junit.xml similarity index 100% rename from modules/python/tests/mock_data/scale/report/junit.xml rename to modules/python/tests/mock_data/network-scale/report/junit.xml diff --git a/modules/python/tests/test_scale.py b/modules/python/tests/test_network_scale.py similarity index 88% rename from modules/python/tests/test_scale.py rename to modules/python/tests/test_network_scale.py index b9765cc720..42d1fadd97 100644 --- a/modules/python/tests/test_scale.py +++ b/modules/python/tests/test_network_scale.py @@ -1,19 +1,33 @@ import json +import importlib.util import os import sys import tempfile import unittest +from pathlib import Path from unittest.mock import patch -from clusterloader2.scale.scale import ( - configure_clusterloader2, - execute_clusterloader2, - collect_clusterloader2, - main, +MODULE_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" + / "network-scale" + / "scale.py" ) +MODULE_SPEC = importlib.util.spec_from_file_location( + "clusterloader2_network_scale", MODULE_PATH +) +if MODULE_SPEC is None or MODULE_SPEC.loader is None: + raise ImportError(f"Unable to load module from {MODULE_PATH}") +network_scale_module = importlib.util.module_from_spec(MODULE_SPEC) +MODULE_SPEC.loader.exec_module(network_scale_module) + +configure_clusterloader2 = network_scale_module.configure_clusterloader2 +execute_clusterloader2 = network_scale_module.execute_clusterloader2 +collect_clusterloader2 = network_scale_module.collect_clusterloader2 +main = network_scale_module.main -class TestConfigureScale(unittest.TestCase): +class TestConfigureNetworkScale(unittest.TestCase): """Test cases for configure_clusterloader2 function""" def test_basic_configuration(self): @@ -93,10 +107,10 @@ def 
test_configuration_with_container_network_logs(self): os.remove(tmp_path) -class TestExecuteScale(unittest.TestCase): +class TestExecuteNetworkScale(unittest.TestCase): """Test cases for execute_clusterloader2 function""" - @patch("clusterloader2.scale.scale.run_cl2_command") + @patch.object(network_scale_module, "run_cl2_command") def test_execute_calls_run_cl2_command(self, mock_run_cl2): """Test that execute_clusterloader2 calls run_cl2_command with correct params""" execute_clusterloader2( @@ -126,13 +140,13 @@ def test_execute_calls_run_cl2_command(self, mock_run_cl2): ) -class TestCollectScale(unittest.TestCase): +class TestCollectNetworkScale(unittest.TestCase): """Test cases for collect_clusterloader2 function""" def test_collect_creates_result_file(self): """Test that collect_clusterloader2 creates result file with correct structure""" cl2_report_dir = os.path.join( - os.path.dirname(__file__), "mock_data", "scale", "report" + os.path.dirname(__file__), "mock_data", "network-scale", "report" ) result_file = tempfile.mktemp(suffix=".jsonl") @@ -186,7 +200,7 @@ def test_collect_creates_result_file(self): def test_collect_calculates_traffic_pods(self): """Test that traffic_pods is calculated correctly""" cl2_report_dir = os.path.join( - os.path.dirname(__file__), "mock_data", "scale", "report" + os.path.dirname(__file__), "mock_data", "network-scale", "report" ) result_file = tempfile.mktemp(suffix=".jsonl") @@ -228,11 +242,11 @@ def test_collect_calculates_traffic_pods(self): class TestMainArgumentParsing(unittest.TestCase): """Test cases for main() argument parsing""" - @patch("clusterloader2.scale.scale.configure_clusterloader2") + @patch.object(network_scale_module, "configure_clusterloader2") def test_configure_command_parsing(self, mock_configure): """Test that configure command parses arguments correctly""" test_args = [ - "scale.py", + "network-scale/scale.py", "configure", "--fortio-servers-per-deployment", "15", "--fortio-clients-per-deployment", 
"15", @@ -253,11 +267,11 @@ def test_configure_command_parsing(self, mock_configure): 15, 15, 1500, 50, 1, 1000, 100, True, False, "/tmp/overrides.yaml" ) - @patch("clusterloader2.scale.scale.execute_clusterloader2") + @patch.object(network_scale_module, "execute_clusterloader2") def test_execute_command_parsing(self, mock_execute): """Test that execute command parses arguments correctly""" test_args = [ - "scale.py", + "network-scale/scale.py", "execute", "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513", "--cl2-config-dir", "/path/to/config", diff --git a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml index 2562c7ee91..1660285448 100644 --- a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml +++ b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml @@ -27,7 +27,7 @@ stages: engine_input: image: "ghcr.io/azure/clusterloader2:v20250513" install: false - topology: observability + topology: network-scale matrix: cnl: traffic_deployment_count: 1000 diff --git a/steps/engine/clusterloader2/scale/collect.yml b/steps/engine/clusterloader2/network-scale/collect.yml similarity index 96% rename from steps/engine/clusterloader2/scale/collect.yml rename to steps/engine/clusterloader2/network-scale/collect.yml index 845942dcb7..692e81f26f 100644 --- a/steps/engine/clusterloader2/scale/collect.yml +++ b/steps/engine/clusterloader2/network-scale/collect.yml @@ -40,8 +40,8 @@ steps: env: CLOUD: ${{ parameters.cloud }} RUN_URL: $(RUN_URL) - PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/scale.py - CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/results + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/scale.py + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/results REPOSITORY: ${{ parameters.engine_input.repository }} REPOSITORY_REF: ${{ 
parameters.engine_input.ref }} displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/scale/execute.yml b/steps/engine/clusterloader2/network-scale/execute.yml similarity index 97% rename from steps/engine/clusterloader2/scale/execute.yml rename to steps/engine/clusterloader2/network-scale/execute.yml index 43a903cda6..99b1cea821 100644 --- a/steps/engine/clusterloader2/scale/execute.yml +++ b/steps/engine/clusterloader2/network-scale/execute.yml @@ -87,11 +87,11 @@ steps: ${{ else }}: CLOUD: ${{ parameters.cloud }} REGION: ${{ parameters.region }} - PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/scale.py + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/scale.py CL2_IMAGE: ${{ parameters.engine_input.image }} - CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/config + CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/config CL2_CONFIG_FILE: config.yaml - CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/scale/results + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/results displayName: "Run Benchmark" - ${{ if parameters.install }}: diff --git a/steps/topology/observability/collect-clusterloader2.yml b/steps/topology/network-scale/collect-clusterloader2.yml similarity index 83% rename from steps/topology/observability/collect-clusterloader2.yml rename to steps/topology/network-scale/collect-clusterloader2.yml index 58fffd9b3e..554ed17a31 100644 --- a/steps/topology/observability/collect-clusterloader2.yml +++ b/steps/topology/network-scale/collect-clusterloader2.yml @@ -11,7 +11,7 @@ parameters: steps: - template: /steps/set-run-id.yml -- template: /steps/engine/clusterloader2/scale/collect.yml +- template: /steps/engine/clusterloader2/network-scale/collect.yml parameters: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} diff --git 
a/steps/topology/observability/execute-clusterloader2.yml b/steps/topology/network-scale/execute-clusterloader2.yml similarity index 91% rename from steps/topology/observability/execute-clusterloader2.yml rename to steps/topology/network-scale/execute-clusterloader2.yml index 0572362d99..9a72cf39e6 100644 --- a/steps/topology/observability/execute-clusterloader2.yml +++ b/steps/topology/network-scale/execute-clusterloader2.yml @@ -10,7 +10,7 @@ parameters: default: {} steps: -- template: /steps/engine/clusterloader2/scale/execute.yml +- template: /steps/engine/clusterloader2/network-scale/execute.yml parameters: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} diff --git a/steps/topology/observability/validate-resources.yml b/steps/topology/network-scale/validate-resources.yml similarity index 98% rename from steps/topology/observability/validate-resources.yml rename to steps/topology/network-scale/validate-resources.yml index f4fadb4298..b8564cb103 100644 --- a/steps/topology/observability/validate-resources.yml +++ b/steps/topology/network-scale/validate-resources.yml @@ -21,11 +21,11 @@ steps: timeout=300 # 5 minutes timeout interval=10 elapsed=0 - + # Debug: Show available acn.azure.com CRDs echo "Checking for ACNS-related CRDs..." kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" - + while ! kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do if [ $elapsed -ge $timeout ]; then echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." @@ -41,7 +41,7 @@ steps: sleep $interval elapsed=$((elapsed + interval)) done - + echo "ContainerNetworkLog CRD is available!" 
kubectl get crd containernetworklogs.acn.azure.com displayName: "Wait for ContainerNetworkLog CRD" From c8e4caa8056f13711f257adfebd7312f8b9cceb8 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Thu, 5 Mar 2026 14:16:44 +0000 Subject: [PATCH 09/10] Fix scneario name --- pipelines/perf-eval/CNI Benchmark/cnl-observability.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml index 1660285448..d90b439091 100644 --- a/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml +++ b/pipelines/perf-eval/CNI Benchmark/cnl-observability.yml @@ -10,7 +10,7 @@ schedules: variables: SCENARIO_TYPE: perf-eval - SCENARIO_NAME: azurecni-overlay-cilium-cnl + SCENARIO_NAME: cnl-azurecni-overlay-cilium OWNER: aks OBSERVABILITY_TOOL: acns From 230d1aa90846c06aedce8c68261e619f27e12fb1 Mon Sep 17 00:00:00 2001 From: carlotaarvela Date: Thu, 5 Mar 2026 15:47:33 +0000 Subject: [PATCH 10/10] Lint --- steps/topology/network-scale/validate-resources.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/steps/topology/network-scale/validate-resources.yml b/steps/topology/network-scale/validate-resources.yml index b8564cb103..f4fadb4298 100644 --- a/steps/topology/network-scale/validate-resources.yml +++ b/steps/topology/network-scale/validate-resources.yml @@ -21,11 +21,11 @@ steps: timeout=300 # 5 minutes timeout interval=10 elapsed=0 - + # Debug: Show available acn.azure.com CRDs echo "Checking for ACNS-related CRDs..." kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" - + while ! kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do if [ $elapsed -ge $timeout ]; then echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." @@ -41,7 +41,7 @@ steps: sleep $interval elapsed=$((elapsed + interval)) done - + echo "ContainerNetworkLog CRD is available!" 
kubectl get crd containernetworklogs.acn.azure.com displayName: "Wait for ContainerNetworkLog CRD"