diff --git a/modules/python/clusterloader2/network-scale/config/config.yaml b/modules/python/clusterloader2/network-scale/config/config.yaml new file mode 100644 index 0000000000..946405b313 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/config.yaml @@ -0,0 +1,91 @@ +name: scale-test + +# generic config +{{$groupName := DefaultParam .CL2_GROUP_NAME "scale-test"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 5}} + +# topology config +{{$namespaces := DefaultParam .CL2_NAMESPACES 1}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 10}} +{{$replicasPerNamespace := DefaultParam .CL2_REPLICAS_PER_NAMESPACE 5}} + +# topology config +{{$fortioServerReplicas := DefaultParam .CL2_FORTIO_SERVERS_PER_DEPLOYMENT 10}} +{{$fortioClientReplicas := DefaultParam .CL2_FORTIO_CLIENTS_PER_DEPLOYMENT 10}} +{{$fortioClientQueriesPerSecond := DefaultParam .CL2_FORTIO_CLIENT_QUERIES_PER_SECOND 1000}} +{{$fortioClientConnections := DefaultParam .CL2_FORTIO_CLIENT_CONNECTIONS 10}} +{{$fortioNamespaces := DefaultParam .CL2_FORTIO_NAMESPACES 1}} +{{$fortioDeploymentsPerNamespace := DefaultParam .CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE 1}} + +# Container Network Log config +{{$createContainerNetworkLogs := DefaultParam .CL2_GENERATE_CONTAINER_NETWORK_LOGS false}} + + +namespace: + number: {{$namespaces}} + prefix: scale-test + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + - name: Log + measurements: + - Identifier: Dummy + Method: Sleep + Params: + action: start + duration: 1ms + + - module: + path: /modules/node-exporter.yaml + params: + actionName: "create" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/ama-logs.yaml + params: + actionName: "create" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/hubble.yaml + params: + actionName: "create" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/scale-test.yaml + params: + action: start + group: {{$groupName}} + createContainerNetworkLogs: {{$createContainerNetworkLogs}} + + - module: + path: /modules/hubble.yaml + params: + actionName: "delete" + tuningSet: DeploymentCreateQps + + - module: + path: /modules/ama-logs.yaml + params: + actionName: "delete" + tuningSet: DeploymentCreateQps + + # TODO: Remove this module once there's a way to deploy node exporter that works in perf-tests repository + - module: + path: /modules/node-exporter.yaml + params: + actionName: "delete" + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/network-scale/config/modules/ama-logs.yaml b/modules/python/clusterloader2/network-scale/config/modules/ama-logs.yaml new file mode 100644 index 0000000000..683f653d75 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/ama-logs.yaml @@ -0,0 +1,26 @@ +## AMA Logs module creates AMA Logs pod monitor + +# Tuning set +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} + +# interval +{{$interval := DefaultParam .interval "15s"}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: + - name: {{.actionName}} AMA Logs Pod Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: 
{{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/ama-logs/podmonitor.yaml" + basename: ama-logs-metrics + interval: 15s diff --git a/modules/python/clusterloader2/network-scale/config/modules/ama-logs/podmonitor.yaml b/modules/python/clusterloader2/network-scale/config/modules/ama-logs/podmonitor.yaml new file mode 100644 index 0000000000..04f352560b --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/ama-logs/podmonitor.yaml @@ -0,0 +1,26 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: ama-logs-metrics + namespace: monitoring +spec: + jobLabel: ama-logs-metrics + selector: + matchLabels: + component: ama-logs-agent + namespaceSelector: + matchNames: + - kube-system + podMetricsEndpoints: + - interval: 30s + honorLabels: true + path: /metrics + relabelings: + - sourceLabels: [__address__] + action: replace + targetLabel: __address__ + regex: (.+?)(\:\d+)? + replacement: $1:9102 + - sourceLabels: [__meta_kubernetes_pod_container_name] + regex: "ama-logs" + action: keep diff --git a/modules/python/clusterloader2/network-scale/config/modules/fortio/client-deployment.yaml b/modules/python/clusterloader2/network-scale/config/modules/fortio/client-deployment.yaml new file mode 100644 index 0000000000..d962de81ed --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/fortio/client-deployment.yaml @@ -0,0 +1,89 @@ +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20Mi"}} +{{$Image := DefaultParam .Image "acnpublic.azurecr.io/fortio"}} +{{$FortioClientQueriesPerSecond := .FortioClientQueriesPerSecond}} +{{$FortioClientConnections := .FortioClientConnections}} +{{$uniqueLabel := DefaultParam .uniqueLabel ""}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} + app: fortio + role: load +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + app: fortio + role: load + strategy: + type: Recreate + template: + metadata: + annotations: + retina.sh: observe + labels: + name: {{.Name}} + group: {{.Group}} + app: fortio + role: load + restart: {{.deploymentLabel}} + {{if ne $uniqueLabel ""}} + uniqueLabelPerDeployment: "{{$uniqueLabel}}{{.Index}}" + {{end}} + spec: + nodeSelector: + scale-test: "true" + containers: + - name: fortio + image: {{$Image}} + imagePullPolicy: IfNotPresent + args: + [ + "load", + "-nocatchup", + "-uniform", + "-sequential-warmup", + "-jitter", + "-udp-timeout", + "1500ms", + "-timeout", + "60s", + "-connection-reuse", + "{{$FortioClientConnections}}:{{$FortioClientConnections}}", + "-c", + "{{$FortioClientConnections}}", + "-qps", + "{{$FortioClientQueriesPerSecond}}", + "-t", + "0", + "http://{{.FortioServerServiceBasename}}-{{.Index}}:8080" + ] + ports: + - containerPort: 8078 # tcp echo + - containerPort: 8079 # grpc echo + - containerPort: 8080 # main serving port + - containerPort: 8081 # redirection to https port + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "scale-test" + operator: "Equal" + value: "true" + effect: "NoSchedule" diff --git a/modules/python/clusterloader2/network-scale/config/modules/fortio/server-deployment.yaml b/modules/python/clusterloader2/network-scale/config/modules/fortio/server-deployment.yaml new file mode 100644 index 0000000000..a0c1274c17 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/fortio/server-deployment.yaml @@ -0,0 +1,68 @@ +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20Mi"}} +{{$Image := DefaultParam .Image "acnpublic.azurecr.io/fortio"}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} + app: fortio + role: server + svc: {{.Name}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + app: fortio + role: server + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 20% + maxSurge: 20% + template: + metadata: + annotations: + retina.sh: observe + labels: + name: {{.Name}} + group: {{.Group}} + app: fortio + role: server + svc: {{.Name}} + restart: {{.deploymentLabel}} + spec: + nodeSelector: + scale-test: "true" + containers: + - name: fortio + image: {{$Image}} + imagePullPolicy: IfNotPresent + args: ["server", "-http-port", "0.0.0.0:8080"] + ports: + - containerPort: 8078 # tcp echo + - containerPort: 8079 # grpc echo + - containerPort: 8080 # main serving port + - containerPort: 8081 # redirection to https port + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
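+      # tolerationSeconds: 900 keeps server pods bound for 15 minutes before eviction,
+      # rather than the default 5-minute node-lifecycle eviction timeout.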
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "scale-test" + operator: "Equal" + value: "true" + effect: "NoSchedule" diff --git a/modules/python/clusterloader2/network-scale/config/modules/fortio/service.yaml b/modules/python/clusterloader2/network-scale/config/modules/fortio/service.yaml new file mode 100644 index 0000000000..a3ebcb2d65 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/fortio/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} +spec: + selector: + svc: {{.Name}} + ports: + - port: 8080 + targetPort: 8080 diff --git a/modules/python/clusterloader2/network-scale/config/modules/hubble.yaml b/modules/python/clusterloader2/network-scale/config/modules/hubble.yaml new file mode 100644 index 0000000000..744fb84927 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/hubble.yaml @@ -0,0 +1,26 @@ +## Hubble module creates Hubble pod monitor + +# Tuning set +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} + +# interval +{{$interval := DefaultParam .interval "15s"}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: + - name: {{.actionName}} Hubble Pod Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/hubble/podmonitor.yaml" + basename: hubble-metrics + interval: 15s diff --git a/modules/python/clusterloader2/network-scale/config/modules/hubble/podmonitor.yaml b/modules/python/clusterloader2/network-scale/config/modules/hubble/podmonitor.yaml new file mode 100644 index 0000000000..21e792ad9a --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/hubble/podmonitor.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: hubble-metrics + namespace: monitoring +spec: + # Hubble metrics are exposed by cilium-agent pods on port 9965. + # This PodMonitor scrapes Hubble metrics from cilium-agent. + selector: + matchLabels: + k8s-app: cilium + namespaceSelector: + matchNames: + - kube-system + podMetricsEndpoints: + - interval: 30s + honorLabels: true + path: /metrics + relabelings: + - sourceLabels: [__address__] + action: replace + targetLabel: __address__ + regex: (.+?)(\:\d+)? 
+ replacement: $1:9965 diff --git a/modules/python/clusterloader2/network-scale/config/modules/measurements/ama-logs.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/ama-logs.yaml new file mode 100644 index 0000000000..84c6e28e31 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/measurements/ama-logs.yaml @@ -0,0 +1,191 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional AMA-Logs Measurements + measurements: + - Identifier: AMALogsAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - Identifier: AMALogsMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(container_memory_usage_bytes{container="ama-logs"}[%v:]) / 1024 / 1024) + - Identifier: AMALogsContainerFsAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, 
avg_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsContainerFsMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="ama-logs"}[1m])[%v:])) + - Identifier: AMALogsContainerFsAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - Identifier: AMALogsContainerFsMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container FS Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(container_fs_write_seconds_total{container="ama-logs"}[1m]) / rate(container_fs_writes_total{container="ama-logs"}[1m]))[%v:])) + - Identifier: AMALogsContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Container Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="ama-logs"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="ama-logs"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="ama-logs"}[%v])[%v:])) + + - Identifier: AMALogsNetworkFlowInputAvgRecordsPerSecond{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Network Flow Input Avg Records Per Second {{$suffix}} + metricVersion: v1 + unit: "#/s" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(fluentbit_input_records_total{name=~"oms_networkflow_input"}[5m])[%v:])) + - name: Perc90 + query: quantile(0.90, 
avg_over_time(rate(fluentbit_input_records_total{name=~"oms_networkflow_input"}[5m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(fluentbit_input_records_total{name=~"oms_networkflow_input"}[5m])[%v:])) + + - Identifier: AMALogsNetworkFlowOutputAvgRecordsPerSecond{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Network Flow Output Avg Records Per Second {{$suffix}} + metricVersion: v1 + unit: "#/s" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(fluentbit_output_proc_records_total{name=~"oms_network_flow_output"}[5m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(fluentbit_output_proc_records_total{name=~"oms_network_flow_output"}[5m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(fluentbit_output_proc_records_total{name=~"oms_network_flow_output"}[5m])[%v:])) + + - Identifier: AMALogsNetworkFlowDroppedAvgRecordsPerSecond{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: AMALogs Network Flow Dropped Avg Records Per Second {{$suffix}} + metricVersion: v1 + unit: "#/s" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(fluentbit_filter_drop_records_total{name=~"oms_networkflow_throttle"}[5m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(fluentbit_filter_drop_records_total{name=~"oms_networkflow_throttle"}[5m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(fluentbit_filter_drop_records_total{name=~"oms_networkflow_throttle"}[5m])[%v:])) + diff --git a/modules/python/clusterloader2/network-scale/config/modules/measurements/cilium.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/cilium.yaml new file mode 100644 index 0000000000..c6f715cfb2 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/measurements/cilium.yaml @@ -0,0 +1,213 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Cilium Measurements + measurements: + - Identifier: CiliumAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, 
avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumOperatorAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Avg CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumOperatorMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumOperatorMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumOperatorAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumContainerFsAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, 
avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - Identifier: CiliumContainerFsMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - Identifier: CiliumContainerFsAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - Identifier: CiliumContainerFsMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(container_fs_write_seconds_total{container="cilium-agent"}[1m]) / rate(container_fs_writes_total{container="cilium-agent"}[1m]))[%v:])) + - Identifier: CiliumContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + # - Identifier: AvgCiliumHubbleMetricsCardinality{{$suffix}} + # Method: GenericPrometheusQuery + # Params: + # action: {{$action}} + # metricName: Average Cilium Hubble Metrics Cardinality 
{{$suffix}} + # metricVersion: v1 + # unit: "#" + # enableViolations: true + # queries: + # - name: Avg + # query: count({__name__=~"hubble_.*"}) diff --git a/modules/python/clusterloader2/network-scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/control-plane.yaml new file mode 100644 index 0000000000..47504cbf89 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/measurements/control-plane.yaml @@ -0,0 +1,86 @@ +{{$action := .action}} # start, gather + +# Feature gates +{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}} +{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}} +{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}} +{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}} + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Measurements + measurements: + - Identifier: APIResponsivenessPrometheus{{$suffix}} + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}} + useSimpleLatencyQuery: true + - Identifier: PodStartupLatency{{$suffix}} + Method: PodStartupLatency + Params: + action: {{$action}} + labelSelector: group = {{.group}} + threshold: {{$podStartupLatencyThreshold}} + - Identifier: ApiserverAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - Identifier: ApiserverMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - Identifier: ApiserverAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Average Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - Identifier: ApiserverMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max Memory 
Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) diff --git a/modules/python/clusterloader2/network-scale/config/modules/measurements/node-disk.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/node-disk.yaml new file mode 100644 index 0000000000..06efe84dd8 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/measurements/node-disk.yaml @@ -0,0 +1,67 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Node Disk Measurements + measurements: + - Identifier: NodeDiskAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - Identifier: NodeDiskMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(node_disk_written_bytes_total[1m])[%v:])) + - Identifier: NodeDiskAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - Identifier: NodeDiskMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Disk Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]))[%v:])) diff --git 
a/modules/python/clusterloader2/network-scale/config/modules/measurements/retina.yaml b/modules/python/clusterloader2/network-scale/config/modules/measurements/retina.yaml new file mode 100644 index 0000000000..89d83da2a7 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/measurements/retina.yaml @@ -0,0 +1,202 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Retina Measurements + measurements: + - Identifier: RetinaAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - Identifier: RetinaMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Max Memory Usage{{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(container_memory_usage_bytes{container="retina"}[%v:]) / 1024 / 1024) + - Identifier: RetinaOperatorAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Avg CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, 
avg_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - Identifier: RetinaOperatorMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{container="retina-operator"}[1m])[%v:])) + - Identifier: RetinaOperatorMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - Identifier: RetinaOperatorAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Operator Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(container_memory_usage_bytes{container="retina-operator"}[%v:]) / 1024 / 1024) + - Identifier: RetinaContainerFsAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaContainerFsMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="retina"}[1m])[%v:])) + - Identifier: RetinaContainerFsAvgWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Average Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + 
queries: + - name: Perc99 + query: quantile(0.99, avg_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - Identifier: RetinaContainerFsMaxWriteLatency{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container FS Max Write Latency {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time((rate(container_fs_write_seconds_total{container="retina"}[1m]) / rate(container_fs_writes_total{container="retina"}[1m]))[%v:])) + - Identifier: RetinaContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Retina Container Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="retina"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="retina"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="retina"}[%v])[%v:])) diff --git a/modules/python/clusterloader2/network-scale/config/modules/networkpolicy-template.yaml b/modules/python/clusterloader2/network-scale/config/modules/networkpolicy-template.yaml new file mode 100644 index 0000000000..f89c57cdac --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/networkpolicy-template.yaml @@ -0,0 +1,21 @@ +# NetworkPolicy for API/etcd object scale testing. +# Uses dummy labels to create policy objects without affecting actual traffic. +# Purpose: measure API server, etcd, and controller load from policy churn. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: scale-test-policy +spec: + egress: + - {} + ingress: + - from: + - podSelector: + matchLabels: + dummy-label: dummy-value-{{.Index}} + podSelector: + matchLabels: + dummy-selector: dummy-value-{{.Index}} + policyTypes: + - Egress + - Ingress diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter.yaml new file mode 100644 index 0000000000..82e6a634c8 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter.yaml @@ -0,0 +1,104 @@ +## Node Exporter module creates Node Exporter components + +# Tuning set +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} +# interval +{{$interval := DefaultParam .interval "15s"}} +# operation timeout +{{$operationTimeout := DefaultParam .CL2_NODE_EXPORTER_OPERATION_TIMEOUT "30m"}} +# enable validation +{{$enableValidation := DefaultParam .CL2_NODE_EXPORTER_ENABLE_VALIDATION true}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: +{{if $enableValidation}} + - name: Start measurements + measurements: + - Identifier: WaitForNodeExporterPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: DaemonSet + labelSelector: app.kubernetes.io/name = node-exporter + operationTimeout: {{$operationTimeout}} + checkIfPodsAreUpdated: false +{{end}} + - name: {{.actionName}} Node Exporter Service Account + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: Sequence + objectBundle: + - objectTemplatePath: "modules/node-exporter/serviceaccount.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Cluster Role + phases: + - namespaceList: + - "" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: Sequence + objectBundle: + - objectTemplatePath: "modules/node-exporter/clusterrole.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Cluster Role Binding + phases: + - namespaceList: + - "" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: Sequence + objectBundle: + - objectTemplatePath: "modules/node-exporter/clusterrolebinding.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Service Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/servicemonitor.yaml" + basename: node-exporter + interval: 15s + - name: {{.actionName}} Node Exporter Daemonset + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/daemonset.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Network Policy + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/networkpolicy.yaml" + basename: node-exporter + - name: {{.actionName}} Node Exporter Services + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/node-exporter/service.yaml" + basename: 
node-exporter +{{if $enableValidation}} + - name: Wait for pods to be running + measurements: + - Identifier: WaitForNodeExporterPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: gather +{{end}} diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrole.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrole.yaml new file mode 100644 index 0000000000..9248447173 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrole.yaml @@ -0,0 +1,22 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrolebinding.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrolebinding.yaml new file mode 100644 index 0000000000..6763097793 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/clusterrolebinding.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{.Name}} +subjects: +- kind: ServiceAccount + name: {{.Name}} + namespace: monitoring diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/daemonset.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/daemonset.yaml new file mode 100644 index 0000000000..24986d608a --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/daemonset.yaml @@ -0,0 +1,120 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} + namespace: monitoring +spec: + minReadySeconds: 0 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: node-exporter + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + spec: + # kube-rbac-proxy needs SA token to call TokenReview/SubjectAccessReview for authn/authz + automountServiceAccountToken: true + containers: + - args: + - --web.listen-address=127.0.0.1:9100 + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --path.udev.data=/host/root/run/udev/data + - --no-collector.wifi + - --no-collector.hwmon + - --no-collector.btrfs + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run/k3s/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ + - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ + image: quay.io/prometheus/node-exporter:v1.9.1 + name: node-exporter + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + securityContext: + 
allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + - args: + - --secure-listen-address=[$(IP)]:9100 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/brancz/kube-rbac-proxy:v0.19.0 + name: kube-rbac-proxy + ports: + - containerPort: 9100 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + # hostNetwork and hostPID required for node-exporter collectors to access host-level + # metrics (network stats, process info). Port 9100 exposed via Service for Prometheus scraping. + hostNetwork: true + hostPID: true + nodeSelector: + kubernetes.io/os: linux + priorityClassName: system-cluster-critical + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: {{.Name}} + tolerations: + - operator: Exists + volumes: + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root + updateStrategy: + rollingUpdate: + maxUnavailable: 10% + type: RollingUpdate + diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/networkpolicy.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/networkpolicy.yaml new file mode 100644 index 0000000000..83d111067e --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/networkpolicy.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} + namespace: monitoring +spec: + egress: + - {} + ingress: + - from: + - podSelector: + matchLabels: + app.kubernetes.io/name: prometheus + ports: + - port: 9100 + protocol: TCP + podSelector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + policyTypes: + - Egress + - Ingress diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/service.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/service.yaml new file mode 100644 index 0000000000..0dbedc0cc6 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} + namespace: monitoring +spec: + clusterIP: None + ports: + - name: https + port: 9100 + targetPort: https + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/serviceaccount.yaml 
b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/serviceaccount.yaml new file mode 100644 index 0000000000..7b3093f022 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/serviceaccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} + namespace: monitoring + diff --git a/modules/python/clusterloader2/network-scale/config/modules/node-exporter/servicemonitor.yaml b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/servicemonitor.yaml new file mode 100644 index 0000000000..a6d9c075a7 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/node-exporter/servicemonitor.yaml @@ -0,0 +1,32 @@ +{{$interval := DefaultParam .interval "15s"}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/version: 1.9.1 + name: {{.Name}} + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: {{$interval}} + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + diff --git a/modules/python/clusterloader2/network-scale/config/modules/pfl/containernetworklog.yaml b/modules/python/clusterloader2/network-scale/config/modules/pfl/containernetworklog.yaml new file mode 100644 index 0000000000..653f1edced --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/pfl/containernetworklog.yaml @@ -0,0 +1,24 @@ +{{$namespaces := .namespaces}} +apiVersion: acn.azure.com/v1alpha1 +kind: ContainerNetworkLog +metadata: + name: test +spec: + includefilters: # List of filters + {{range $i := Loop $namespaces}} + - name: filter-{{ AddInt $i 1 }} # Filter name + from: + namespacedPod: + - scale-test-{{ AddInt $i 1 }}/fortio-client- + to: + namespacedPod: + - scale-test-{{ AddInt $i 1 }}/fortio-server- + protocol: + - tcp + - dns + - udp + verdict: + - forwarded + - dropped + {{end}} + diff --git a/modules/python/clusterloader2/network-scale/config/modules/scale-test.yaml b/modules/python/clusterloader2/network-scale/config/modules/scale-test.yaml new file mode 100644 index 0000000000..e639fd8c5d --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/scale-test.yaml @@ -0,0 +1,101 @@ +name: scale-test + +# generic config +{{$groupName := DefaultParam .CL2_GROUP_NAME "scale-test"}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} + +# topology config +{{$namespaces := DefaultParam .CL2_NAMESPACES 1}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 10}} +{{$replicasPerNamespace := DefaultParam .CL2_REPLICAS_PER_NAMESPACE 5}} + +# topology config +{{$fortioServerReplicas := DefaultParam .CL2_FORTIO_SERVERS_PER_DEPLOYMENT 10}} +{{$fortioClientReplicas := DefaultParam .CL2_FORTIO_CLIENTS_PER_DEPLOYMENT 10}} +{{$fortioClientQueriesPerSecond := DefaultParam .CL2_FORTIO_CLIENT_QUERIES_PER_SECOND 1000}} +{{$fortioClientConnections := DefaultParam .CL2_FORTIO_CLIENT_CONNECTIONS 10}} 
+{{$fortioNamespaces := DefaultParam .CL2_FORTIO_NAMESPACES 1}} +{{$fortioDeploymentsPerNamespace := DefaultParam .CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE 1}} + +# Network Policies +{{$networkPoliciesPerNamespace := DefaultParam .CL2_NETWORK_POLICIES_PER_NAMESPACE 0}} + +# Container Network Log config +{{$createContainerNetworkLogs := DefaultParam .createContainerNetworkLogs false}} + +steps: + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/retina.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/ama-logs.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/measurements/node-disk.yaml + params: + action: start + group: {{$groupName}} + + - module: + path: /modules/test-steps.yaml + params: + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + Group: {{$groupName}} + namespaces: {{$fortioNamespaces}} + fortioDeploymentsPerNamespace: {{$fortioDeploymentsPerNamespace}} + fortioServerReplicas: {{$fortioServerReplicas}} + fortioClientReplicas: {{$fortioClientReplicas}} + fortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + fortioClientConnections: {{$fortioClientConnections}} + createContainerNetworkLogs: {{$createContainerNetworkLogs}} + deploymentLabel: start + networkPoliciesPerNamespace: {{$networkPoliciesPerNamespace}} + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/retina.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/ama-logs.yaml + params: + action: gather + group: {{$groupName}} + + - module: + path: /modules/measurements/node-disk.yaml + params: + action: gather + group: {{$groupName}} diff --git a/modules/python/clusterloader2/network-scale/config/modules/test-steps.yaml b/modules/python/clusterloader2/network-scale/config/modules/test-steps.yaml new file mode 100644 index 0000000000..dd1fb70422 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/config/modules/test-steps.yaml @@ -0,0 +1,225 @@ +## Input params +{{$tuningSet := .tuningSet}} +{{$operationTimeout := .operationTimeout}} +{{$Group := .Group}} +{{$namespaces := .namespaces}} +{{$deploymentLabel := .deploymentLabel}} +{{$networkPoliciesPerNamespace := .networkPoliciesPerNamespace}} +{{$fortioDeploymentsPerNamespace := .fortioDeploymentsPerNamespace}} +{{$fortioServerReplicas := .fortioServerReplicas}} +{{$fortioClientReplicas := .fortioClientReplicas}} +{{$fortioClientQueriesPerSecond := .fortioClientQueriesPerSecond}} +{{$fortioClientConnections := .fortioClientConnections}} +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20Mi"}} +{{$createContainerNetworkLogs := .createContainerNetworkLogs}} + +steps: +{{if $createContainerNetworkLogs}} +- name: Create Container Network Log + phases: + - namespaceList: + - "" + replicasPerNamespace: 1 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: container-network-log + objectTemplatePath: modules/pfl/containernetworklog.yaml + templateFillMap: + Group: {{.Group}} + namespaces: {{$namespaces}} +{{end}} + +- name: Starting 
measurement for 'create traffic services' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForServerPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = server + Params: + action: start + checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +- name: "create traffic services" + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/service.yaml + +- name: 'create servers' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/server-deployment.yaml + templateFillMap: + Replicas: {{$fortioServerReplicas}} + # SvcName: fortio-server-service + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + +- name: Waiting for 'create servers' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForServerPodsRunning + Params: + action: gather + refreshInterval: 15s + +- name: Starting measurement for 'create clients' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + apiVersion: apps/v1 + kind: Deployment + labelSelector: role = load + Params: + action: start + checkIfPodsAreUpdated: true + labelSelector: group = {{.Group}} + operationTimeout: {{$operationTimeout}} + apiVersion: apps/v1 + +# Create clients after the servers have been created because we want all servers to be backend pods for their service +- name: 'create clients' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$fortioDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: {{$fortioClientConnections}} + FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +- name: Waiting for 'create clients' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForClientPodsRunning + Params: + action: gather + refreshInterval: 15s + +- name: 'create {{$networkPoliciesPerNamespace}} network policies per namespace' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$networkPoliciesPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: scale-test-policy + objectTemplatePath: modules/networkpolicy-template.yaml + +- name: Wait to get metrics + measurements: + - Identifier: Dummy + Method: Sleep + Params: + action: start + duration: 15m + +- name: 'delete network policies' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: scale-test-policy + objectTemplatePath: modules/networkpolicy-template.yaml +- name: "delete k8s services" + phases: + - namespaceRange: + min: 1 + max: 
{{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/service.yaml + +- name: 'delete servers' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-server + objectTemplatePath: modules/fortio/server-deployment.yaml + templateFillMap: + Replicas: {{$fortioServerReplicas}} + # SvcName: fortio-server-service + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + +- name: 'delete clients' + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: fortio-client + objectTemplatePath: modules/fortio/client-deployment.yaml + templateFillMap: + Replicas: {{$fortioClientReplicas}} + Group: {{.Group}} + deploymentLabel: {{.deploymentLabel}} + CpuRequest: {{$CpuRequest}} + MemoryRequest: {{$MemoryRequest}} + FortioClientConnections: {{$fortioClientConnections}} + FortioClientQueriesPerSecond: {{$fortioClientQueriesPerSecond}} + FortioServerServiceBasename: fortio-server + +{{if $createContainerNetworkLogs}} +# Delete container network log +- name: Delete Container Network Log + phases: + - namespaceList: + - "" + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + - basename: container-network-log + objectTemplatePath: modules/pfl/containernetworklog.yaml + templateFillMap: + Group: {{.Group}} + namespaces: {{$namespaces}} +{{end}} diff --git a/modules/python/clusterloader2/network-scale/scale.py b/modules/python/clusterloader2/network-scale/scale.py new file mode 100644 index 0000000000..3363733773 --- /dev/null +++ b/modules/python/clusterloader2/network-scale/scale.py @@ -0,0 +1,229 @@ +import json +import os +import argparse + +from datetime import datetime, timezone +from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports +from utils.common import str2bool + +def configure_clusterloader2( + fortio_servers_per_deployment, + fortio_clients_per_deployment, + fortio_client_queries_per_second, + fortio_client_connections, + fortio_namespaces, + fortio_deployments_per_namespace, + network_policies_per_namespace, + generate_container_network_logs, + label_traffic_pods, + override_file): + + with open(override_file, 'w', encoding='utf-8') as file: + file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") + file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0\n") + file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 100.0\n") + file.write("CL2_PROMETHEUS_CPU_SCALE_FACTOR: 30.0\n") + file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true\n") + file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n") + file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n") + file.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n") + file.write("CL2_NODE_EXPORTER_OPERATION_TIMEOUT: 60m\n") + file.write("CL2_NODE_EXPORTER_ENABLE_VALIDATION: false\n") + file.write(f"CL2_LABEL_TRAFFIC_PODS: {label_traffic_pods}\n") + + # topology config + file.write(f"CL2_FORTIO_SERVERS_PER_DEPLOYMENT: {fortio_servers_per_deployment}\n") + file.write(f"CL2_FORTIO_CLIENTS_PER_DEPLOYMENT: {fortio_clients_per_deployment}\n") + file.write(f"CL2_FORTIO_CLIENT_QUERIES_PER_SECOND: {fortio_client_queries_per_second}\n") + file.write(f"CL2_FORTIO_CLIENT_CONNECTIONS: {fortio_client_connections}\n") + file.write(f"CL2_FORTIO_NAMESPACES: 
{fortio_namespaces}\n") + file.write(f"CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE: {fortio_deployments_per_namespace}\n") + file.write("CL2_FORTIO_POD_CPU: 10m\n") + file.write("CL2_FORTIO_POD_MEMORY: 50Mi\n") + file.write(f"CL2_NETWORK_POLICIES_PER_NAMESPACE: {network_policies_per_namespace}\n") + file.write(f"CL2_GENERATE_CONTAINER_NETWORK_LOGS: {generate_container_network_logs}\n") + + with open(override_file, 'r', encoding='utf-8') as file: + print(f"Content of file {override_file}:\n{file.read()}") + +def execute_clusterloader2( + cl2_image, + cl2_config_dir, + cl2_report_dir, + cl2_config_file, + kubeconfig, + provider, + scrape_containerd +): + run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, + cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True, + scrape_containerd=scrape_containerd, tear_down_prometheus=True, + scrape_kubelets=True, scrape_ksm=True, + scrape_metrics_server=True) + + +def collect_clusterloader2( + cl2_report_dir, + cloud_info, + run_id, + run_url, + result_file, + test_type, + start_timestamp, + observability_tool, + repository, + repository_ref, + fortio_servers_per_deployment, + fortio_clients_per_deployment, + fortio_client_queries_per_second, + fortio_client_connections, + fortio_namespaces, + fortio_deployments_per_namespace, + network_policies_per_namespace, + generate_container_network_logs=False, + label_traffic_pods=False, + trigger_reason="", +): + details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2) + json_data = json.loads(details) + testsuites = json_data["testsuites"] + + if testsuites: + status = "success" if testsuites[0]["failures"] == 0 else "failure" + else: + raise Exception(f"No testsuites found in the report! Raw data: {details}") + + # TODO: Expose optional parameter to include test details + template = { + "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + "status": status, + "group": None, + "measurement": None, + "result": None, + "observability_tool": observability_tool, + "test_details": { + # add more details here about tests (e.g. 
features tested) + "trigger_reason": trigger_reason, + "observability_tool": observability_tool, + "repository": repository, + "repository_ref": repository_ref, + "traffic_generator": "fortio", + "traffic_namespaces": fortio_namespaces, + "traffic_deployments_per_namespace": fortio_deployments_per_namespace, + "traffic_servers_per_deployment": fortio_servers_per_deployment, + "traffic_clients_per_deployment": fortio_clients_per_deployment, + "traffic_pods": fortio_namespaces * fortio_deployments_per_namespace * (fortio_clients_per_deployment + fortio_servers_per_deployment), + "network_policies": network_policies_per_namespace, + "generate_container_network_logs": generate_container_network_logs, + "label_traffic_pods": label_traffic_pods, + "requests_per_second": fortio_client_queries_per_second, + "details": testsuites[0]["testcases"][0].get("failure", None) if testsuites[0].get("testcases") else None, + }, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url, + "test_type": test_type, + "start_timestamp": start_timestamp, + # parameters + "fortio_servers_per_deployment": fortio_servers_per_deployment, + "fortio_clients_per_deployment": fortio_clients_per_deployment, + "fortio_client_queries_per_second": fortio_client_queries_per_second, + "fortio_client_connections": fortio_client_connections, + "fortio_namespaces": fortio_namespaces, + "fortio_deployments_per_namespace": fortio_deployments_per_namespace, + } + content = process_cl2_reports(cl2_report_dir, template) + + os.makedirs(os.path.dirname(result_file), exist_ok=True) + with open(result_file, 'w', encoding='utf-8') as file: + file.write(content) + +def main(): + parser = argparse.ArgumentParser(description="Configure, execute, and collect results for the ClusterLoader2 network scale test.") + subparsers = parser.add_subparsers(dest="command") + + # Sub-command for configure_clusterloader2 + parser_configure = subparsers.add_parser("configure", help="Override CL2 config file") + parser_configure.add_argument("--fortio-servers-per-deployment", type=int, required=True, help="Number of Fortio servers per deployment") + parser_configure.add_argument("--fortio-clients-per-deployment", type=int, required=True, help="Number of Fortio clients per deployment") + parser_configure.add_argument("--fortio-client-queries-per-second", type=int, required=True, help="Queries per second for each Fortio client pod. NOT queries per second per connection") + parser_configure.add_argument("--fortio-client-connections", type=int, required=True, help="Number of simultaneous connections for each Fortio client") + parser_configure.add_argument("--fortio-namespaces", type=int, required=True, help="Number of namespaces, each with its own service. Fortio clients query servers in the same namespace. Be wary of integer division causing fewer pods than expected regarding this parameter, pods, and pods per node.") + parser_configure.add_argument("--fortio-deployments-per-namespace", type=int, required=True, help="Number of Fortio server deployments (and number of client deployments) per service/partition.
Be wary of integer division causing fewer pods than expected regarding this parameter, namespaces, pods, and pods per node.") + parser_configure.add_argument("--network-policies-per-namespace", type=int, help="Number of network policies to be created per namespace", default=0, nargs='?') + parser_configure.add_argument("--generate-container-network-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Container Network Logs (default=False)") + parser_configure.add_argument("--label_traffic_pods", type=str2bool, choices=[True, False], nargs='?', default=False, help="Add or remove a label on client traffic pods (default=False)") + parser_configure.add_argument("--cl2_override_file", type=str, help="Path to the CL2 config override file") + + # Sub-command for execute_clusterloader2 + parser_execute = subparsers.add_parser("execute", help="Execute the network scale test") + parser_execute.add_argument("--cl2-image", type=str, required=True, help="Name of the CL2 image") + parser_execute.add_argument("--cl2-config-dir", type=str, required=True, help="Path to the CL2 config directory") + parser_execute.add_argument("--cl2-report-dir", type=str, required=True, help="Path to the CL2 report directory") + parser_execute.add_argument("--cl2-config-file", type=str, required=True, help="Path to the CL2 config file") + parser_execute.add_argument("--kubeconfig", type=str, required=True, help="Path to the kubeconfig file") + parser_execute.add_argument("--provider", type=str, required=True, help="Cloud provider name") + parser_execute.add_argument("--scrape-containerd", type=str2bool, choices=[True, False], default=False, + help="Whether to scrape containerd metrics. Must be either True or False") + + # Sub-command for collect_clusterloader2 + parser_collect = subparsers.add_parser("collect", help="Collect network scale test results") + parser_collect.add_argument("--cl2_report_dir", type=str, help="Path to the CL2 report directory") + parser_collect.add_argument("--cloud_info", type=str, help="Cloud information") + parser_collect.add_argument("--run_id", type=str, help="Run ID") + parser_collect.add_argument("--run_url", type=str, help="Run URL") + parser_collect.add_argument("--result_file", type=str, help="Path to the result file") + parser_collect.add_argument("--test_type", type=str, nargs='?', default="default-config", + help="Description of test type") + parser_collect.add_argument("--start_timestamp", type=str, help="Test start timestamp") + parser_collect.add_argument("--observability_tool", type=str, help="Observability tool evaluated in the test") + parser_collect.add_argument("--repository", type=str, help="Repository of observability tool evaluated in the test") + parser_collect.add_argument("--repository_ref", type=str, help="Repository Ref (branch/tag/SHA) of observability tool evaluated in the test") + parser_collect.add_argument("--fortio-servers-per-deployment", type=int, required=True, help="Number of Fortio servers per deployment") + parser_collect.add_argument("--fortio-clients-per-deployment", type=int, required=True, help="Number of Fortio clients per deployment") + parser_collect.add_argument("--fortio-client-queries-per-second", type=int, required=True, help="Queries per second for each Fortio client pod.
NOT queries per second per connection") + parser_collect.add_argument("--fortio-client-connections", type=int, required=True, help="Number of simultaneous connections for each Fortio client") + parser_collect.add_argument("--fortio-namespaces", type=int, required=True, help="Number of namespaces, each with its own service. Fortio clients query servers in the same namespace. Be wary of integer division causing fewer pods than expected regarding this parameter, pods, and pods per node.") + parser_collect.add_argument("--fortio-deployments-per-namespace", type=int, required=True, help="Number of Fortio server deployments (and number of client deployments) per service/partition. Be wary of integer division causing fewer pods than expected regarding this parameter, namespaces, pods, and pods per node.") + parser_collect.add_argument("--network-policies-per-namespace", type=int, help="Number of network policies to be created per namespace", default=0, nargs='?') + parser_collect.add_argument("--generate-container-network-logs", type=str2bool, choices=[True, False], nargs='?', default=False, help="Generate Container Network Logs (default=False)") + parser_collect.add_argument("--label_traffic_pods", type=str2bool, choices=[True, False], nargs='?', default=False, help="Add or remove a label on client traffic pods (default=False)") + parser_collect.add_argument("--trigger_reason", type=str, help="What triggered the test", nargs='?', default="") + + args = parser.parse_args() + + if args.command == "configure": + configure_clusterloader2(args.fortio_servers_per_deployment, + args.fortio_clients_per_deployment, + args.fortio_client_queries_per_second, + args.fortio_client_connections, + args.fortio_namespaces, + args.fortio_deployments_per_namespace, + args.network_policies_per_namespace, + args.generate_container_network_logs, + args.label_traffic_pods, + args.cl2_override_file, + ) + elif args.command == "execute": + execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.cl2_config_file, + args.kubeconfig, args.provider, args.scrape_containerd) + elif args.command == "collect": + collect_clusterloader2(args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, + args.result_file, args.test_type, args.start_timestamp, + args.observability_tool, + args.repository, + args.repository_ref, + args.fortio_servers_per_deployment, + args.fortio_clients_per_deployment, + args.fortio_client_queries_per_second, + args.fortio_client_connections, + args.fortio_namespaces, + args.fortio_deployments_per_namespace, + args.network_policies_per_namespace, + args.generate_container_network_logs, + args.label_traffic_pods, + args.trigger_reason, + ) + +if __name__ == "__main__": + main() diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index 8212b5ae7f..50deb2ed85 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -90,9 +90,29 @@ def get_measurement(file_path): group_name = file_name.split("_")[1] return file_prefix, group_name if file_name.startswith(PROM_QUERY_PREFIX): - group_name = file_name.split("_")[1] - measurement_name = file_name.split("_")[0][len(PROM_QUERY_PREFIX)+1:] - return measurement_name, group_name + # Remove "GenericPrometheusQuery " or "GenericPrometheusQuery_" prefix + if file_name.startswith(PROM_QUERY_PREFIX + " "): + remainder = file_name[len(PROM_QUERY_PREFIX) + 1:] + elif file_name.startswith(PROM_QUERY_PREFIX + "_"): + remainder = file_name[len(PROM_QUERY_PREFIX) + 1:]
+ else: + return None, None + + # Format: __.json + # Split on underscore to extract parts + parts = remainder.split("_") + if len(parts) >= 2: + # Find where the group starts (it's the part before the timestamp) + # Timestamp format: 2026-02-25T13:51:31Z.json (contains 'T' and 'Z') + for i in range(len(parts) - 1, 0, -1): + if 'T' in parts[i]: + # Found timestamp, group is parts[i-1] + group_name = parts[i - 1] + # Measurement is everything before the group + measurement_name = "_".join(parts[:i - 1]).rstrip("_") + return measurement_name, group_name + + return None, None if file_name.startswith(JOB_LIFECYCLE_LATENCY_PREFIX): group_name = file_name.split("_")[1] return JOB_LIFECYCLE_LATENCY_PREFIX, group_name diff --git a/modules/python/tests/mock_data/network-scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json b/modules/python/tests/mock_data/network-scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json new file mode 100644 index 0000000000..e37e687b64 --- /dev/null +++ b/modules/python/tests/mock_data/network-scale/report/GenericPrometheusQuery_CiliumAvgCPUUsage_scale-test_2025-03-04T05:35:56Z.json @@ -0,0 +1,29 @@ +{ + "version": "v1", + "dataItems": [ + { + "labels": { + "Metric": "Perc99" + }, + "data": { + "value": 0.5 + } + }, + { + "labels": { + "Metric": "Perc90" + }, + "data": { + "value": 0.3 + } + }, + { + "labels": { + "Metric": "Perc50" + }, + "data": { + "value": 0.1 + } + } + ] +} diff --git a/modules/python/tests/mock_data/network-scale/report/junit.xml b/modules/python/tests/mock_data/network-scale/report/junit.xml new file mode 100644 index 0000000000..1b80a746d6 --- /dev/null +++ b/modules/python/tests/mock_data/network-scale/report/junit.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/modules/python/tests/test_network_scale.py b/modules/python/tests/test_network_scale.py new file mode 100644 index 0000000000..42d1fadd97 --- /dev/null +++ b/modules/python/tests/test_network_scale.py @@ -0,0 +1,300 @@ +import json +import importlib.util +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +MODULE_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" + / "network-scale" + / "scale.py" +) +MODULE_SPEC = importlib.util.spec_from_file_location( + "clusterloader2_network_scale", MODULE_PATH +) +if MODULE_SPEC is None or MODULE_SPEC.loader is None: + raise ImportError(f"Unable to load module from {MODULE_PATH}") +network_scale_module = importlib.util.module_from_spec(MODULE_SPEC) +MODULE_SPEC.loader.exec_module(network_scale_module) + +configure_clusterloader2 = network_scale_module.configure_clusterloader2 +execute_clusterloader2 = network_scale_module.execute_clusterloader2 +collect_clusterloader2 = network_scale_module.collect_clusterloader2 +main = network_scale_module.main + + +class TestConfigureNetworkScale(unittest.TestCase): + """Test cases for configure_clusterloader2 function""" + + def test_basic_configuration(self): + """Test basic configuration with default parameters""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + + try: + configure_clusterloader2( + fortio_servers_per_deployment=15, + fortio_clients_per_deployment=15, + fortio_client_queries_per_second=1500, + fortio_client_connections=50, + fortio_namespaces=1, + fortio_deployments_per_namespace=1000, + network_policies_per_namespace=100, + generate_container_network_logs=False, + 
label_traffic_pods=False, + override_file=tmp_path, + ) + + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + + # Assert Prometheus config + self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) + self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0", content) + self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true", content) + self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true", content) + self.assertIn("CL2_NODE_EXPORTER_OPERATION_TIMEOUT: 60m", content) + self.assertIn("CL2_NODE_EXPORTER_ENABLE_VALIDATION: false", content) + + # Assert Fortio config + self.assertIn("CL2_FORTIO_SERVERS_PER_DEPLOYMENT: 15", content) + self.assertIn("CL2_FORTIO_CLIENTS_PER_DEPLOYMENT: 15", content) + self.assertIn("CL2_FORTIO_CLIENT_QUERIES_PER_SECOND: 1500", content) + self.assertIn("CL2_FORTIO_CLIENT_CONNECTIONS: 50", content) + self.assertIn("CL2_FORTIO_NAMESPACES: 1", content) + self.assertIn("CL2_FORTIO_DEPLOYMENTS_PER_NAMESPACE: 1000", content) + + # Assert network policies and flags + self.assertIn("CL2_NETWORK_POLICIES_PER_NAMESPACE: 100", content) + self.assertIn("CL2_GENERATE_CONTAINER_NETWORK_LOGS: False", content) + self.assertIn("CL2_LABEL_TRAFFIC_PODS: False", content) + finally: + os.remove(tmp_path) + + def test_configuration_with_container_network_logs(self): + """Test configuration with Container Network Logs enabled""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + + try: + configure_clusterloader2( + fortio_servers_per_deployment=10, + fortio_clients_per_deployment=10, + fortio_client_queries_per_second=1000, + fortio_client_connections=25, + fortio_namespaces=5, + fortio_deployments_per_namespace=100, + network_policies_per_namespace=50, + generate_container_network_logs=True, + label_traffic_pods=True, + override_file=tmp_path, + ) + + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + + self.assertIn("CL2_GENERATE_CONTAINER_NETWORK_LOGS: True", content) + self.assertIn("CL2_LABEL_TRAFFIC_PODS: True", content) + finally: + os.remove(tmp_path) + + +class TestExecuteNetworkScale(unittest.TestCase): + """Test cases for execute_clusterloader2 function""" + + @patch.object(network_scale_module, "run_cl2_command") + def test_execute_calls_run_cl2_command(self, mock_run_cl2): + """Test that execute_clusterloader2 calls run_cl2_command with correct params""" + execute_clusterloader2( + cl2_image="ghcr.io/azure/clusterloader2:v20250513", + cl2_config_dir="/path/to/config", + cl2_report_dir="/path/to/report", + cl2_config_file="config.yaml", + kubeconfig="/path/to/kubeconfig", + provider="aks", + scrape_containerd=False, + ) + + mock_run_cl2.assert_called_once_with( + "/path/to/kubeconfig", + "ghcr.io/azure/clusterloader2:v20250513", + "/path/to/config", + "/path/to/report", + "aks", + cl2_config_file="config.yaml", + overrides=True, + enable_prometheus=True, + scrape_containerd=False, + tear_down_prometheus=True, + scrape_kubelets=True, + scrape_ksm=True, + scrape_metrics_server=True, + ) + + +class TestCollectNetworkScale(unittest.TestCase): + """Test cases for collect_clusterloader2 function""" + + def test_collect_creates_result_file(self): + """Test that collect_clusterloader2 creates result file with correct structure""" + cl2_report_dir = os.path.join( + os.path.dirname(__file__), "mock_data", "network-scale", "report" + ) + result_file = tempfile.mktemp(suffix=".jsonl") + + try: + collect_clusterloader2( + cl2_report_dir=cl2_report_dir, + 
cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-123", + run_url="http://example.com/run123", + result_file=result_file, + test_type="unit-test", + start_timestamp="2025-03-04T05:00:00Z", + observability_tool="cnl", + repository="https://github.com/microsoft/retina", + repository_ref="main", + fortio_servers_per_deployment=15, + fortio_clients_per_deployment=15, + fortio_client_queries_per_second=1500, + fortio_client_connections=50, + fortio_namespaces=1, + fortio_deployments_per_namespace=1000, + network_policies_per_namespace=100, + generate_container_network_logs=True, + label_traffic_pods=False, + trigger_reason="Manual", + ) + + self.assertTrue(os.path.exists(result_file)) + with open(result_file, "r", encoding="utf-8") as f: + content = f.read() + + # Result should contain JSONL lines + self.assertTrue(len(content) > 0) + + # Parse the first line and verify structure + lines = content.strip().split("\n") + if lines and lines[0]: + result = json.loads(lines[0]) + self.assertEqual(result["status"], "success") + self.assertEqual(result["run_id"], "test-run-123") + self.assertEqual(result["test_type"], "unit-test") + self.assertEqual(result["observability_tool"], "cnl") + self.assertIn("test_details", result) + self.assertEqual(result["test_details"]["traffic_generator"], "fortio") + self.assertEqual(result["test_details"]["traffic_namespaces"], 1) + self.assertEqual(result["test_details"]["network_policies"], 100) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_calculates_traffic_pods(self): + """Test that traffic_pods is calculated correctly""" + cl2_report_dir = os.path.join( + os.path.dirname(__file__), "mock_data", "network-scale", "report" + ) + result_file = tempfile.mktemp(suffix=".jsonl") + + try: + # 5 namespaces * 10 deployments * (3 servers + 3 clients) = 300 pods + collect_clusterloader2( + cl2_report_dir=cl2_report_dir, + cloud_info=json.dumps({"cloud": "azure"}), + run_id="test-run", + run_url="http://example.com", + result_file=result_file, + test_type="unit-test", + start_timestamp="2025-03-04T05:00:00Z", + observability_tool="cnl", + repository="", + repository_ref="", + fortio_servers_per_deployment=3, + fortio_clients_per_deployment=3, + fortio_client_queries_per_second=100, + fortio_client_connections=10, + fortio_namespaces=5, + fortio_deployments_per_namespace=10, + network_policies_per_namespace=0, + ) + + with open(result_file, "r", encoding="utf-8") as f: + content = f.read() + + lines = content.strip().split("\n") + if lines and lines[0]: + result = json.loads(lines[0]) + # 5 * 10 * (3 + 3) = 300 + self.assertEqual(result["test_details"]["traffic_pods"], 300) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestMainArgumentParsing(unittest.TestCase): + """Test cases for main() argument parsing""" + + @patch.object(network_scale_module, "configure_clusterloader2") + def test_configure_command_parsing(self, mock_configure): + """Test that configure command parses arguments correctly""" + test_args = [ + "network-scale/scale.py", + "configure", + "--fortio-servers-per-deployment", "15", + "--fortio-clients-per-deployment", "15", + "--fortio-client-queries-per-second", "1500", + "--fortio-client-connections", "50", + "--fortio-namespaces", "1", + "--fortio-deployments-per-namespace", "1000", + "--network-policies-per-namespace", "100", + "--generate-container-network-logs", "True", + "--label_traffic_pods", "False", + "--cl2_override_file", 
"/tmp/overrides.yaml", + ] + + with patch.object(sys, "argv", test_args): + main() + + mock_configure.assert_called_once_with( + 15, 15, 1500, 50, 1, 1000, 100, True, False, "/tmp/overrides.yaml" + ) + + @patch.object(network_scale_module, "execute_clusterloader2") + def test_execute_command_parsing(self, mock_execute): + """Test that execute command parses arguments correctly""" + test_args = [ + "network-scale/scale.py", + "execute", + "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513", + "--cl2-config-dir", "/path/to/config", + "--cl2-report-dir", "/path/to/report", + "--cl2-config-file", "config.yaml", + "--kubeconfig", "/path/to/kubeconfig", + "--provider", "aks", + "--scrape-containerd", "False", + ] + + with patch.object(sys, "argv", test_args): + main() + + mock_execute.assert_called_once_with( + "ghcr.io/azure/clusterloader2:v20250513", + "/path/to/config", + "/path/to/report", + "config.yaml", + "/path/to/kubeconfig", + "aks", + False, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf index 94b3437980..d0546d2361 100644 --- a/modules/terraform/azure/aks-cli/main.tf +++ b/modules/terraform/azure/aks-cli/main.tf @@ -230,7 +230,7 @@ resource "terraform_data" "enable_aks_cli_preview_extension" { EOT ) : ( <[_]+ (e.g. azure_eastus2, aws_eastus_westus) + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml # must keep as is + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + topology: network-scale + matrix: + cnl: + traffic_deployment_count: 1000 + traffic_replica_count: 15 + network_policies_per_namespace: 1000 + cl2_config_file: config.yaml + # fortio variables + fortio_servers_per_deployment: 15 + fortio_clients_per_deployment: 15 + fortio_client_queries_per_second: 1500 + fortio_client_connections: 50 + fortio_namespaces: 1 + fortio_deployments_per_namespace: 1000 + generate_container_network_logs: true + label_traffic_pods: false + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..1bdc0d6f97 --- /dev/null +++ b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars @@ -0,0 +1,98 @@ +scenario_type = "perf-eval" +scenario_name = "cnl-azurecni-overlay-cilium" +deletion_delay = "20h" +owner = "aks" + +aks_cli_config_list = [ + { + role = "slo" + aks_name = "telescope-acns-scale-test" + kubernetes_version = "1.34" + sku_tier = "Standard" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { + name = "generate-ssh-keys" + value = "" + }, + { + name = "max-pods" + value = "250" + }, + { + name = "network-plugin" + value = "azure" + }, + { + name = "network-plugin-mode" + value = "overlay" + }, + { + name = "pod-cidr" + value = "100.64.0.0/10" + }, + { + name = "enable-acns" + value = "" + }, + { + name = "enable-container-network-logs" + value = "" + }, + { + name = "enable-addons" + value = "monitoring" + }, + { + name = "enable-high-log-scale-mode" + value = "" + }, + { + name = "network-dataplane" + value = "cilium" + }, + { + name = "zones" + value = "1 2 3" + } + ] + + default_node_pool = { + name = "default" + 
node_count = 5 + auto_scaling_enabled = false + vm_size = "Standard_D4_v3" + zones = ["1", "2", "3"] + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D64_v3" + zones = ["1", "2", "3"] + optional_parameters = [ + { + name = "labels" + value = "prometheus=true" + } + ] + }, + { + name = "traffic" + node_count = 1000 + auto_scaling_enabled = false + max_pods = 250 + vm_size = "Standard_D4_v3" + zones = ["1", "2", "3"] + optional_parameters = [ + { + name = "labels" + value = "slo=true scale-test=true" + } + ] + } + ] + } +] diff --git a/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json new file mode 100644 index 0000000000..ea27a572c6 --- /dev/null +++ b/scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} diff --git a/steps/engine/clusterloader2/network-scale/collect.yml b/steps/engine/clusterloader2/network-scale/collect.yml new file mode 100644 index 0000000000..692e81f26f --- /dev/null +++ b/steps/engine/clusterloader2/network-scale/collect.yml @@ -0,0 +1,47 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: region + type: string + +steps: +- template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml + parameters: + region: ${{ parameters.region }} +- script: | + set -x + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ + --cl2_report_dir ${CL2_REPORT_DIR:-0} \ + --cloud_info "${CLOUD_INFO:-""}" \ + --run_id $RUN_ID \ + --run_url $RUN_URL \ + --result_file $TEST_RESULTS_FILE \ + --start_timestamp $START_TIME \ + --observability_tool ${OBSERVABILITY_TOOL:-""} \ + --repository ${REPOSITORY:-""} \ + --repository_ref ${REPOSITORY_REF:-""} \ + --fortio-servers-per-deployment $FORTIO_SERVERS_PER_DEPLOYMENT \ + --fortio-clients-per-deployment $FORTIO_CLIENTS_PER_DEPLOYMENT \ + --fortio-client-queries-per-second $FORTIO_CLIENT_QUERIES_PER_SECOND \ + --fortio-client-connections $FORTIO_CLIENT_CONNECTIONS \ + --fortio-namespaces $FORTIO_NAMESPACES \ + --fortio-deployments-per-namespace $FORTIO_DEPLOYMENTS_PER_NAMESPACE \ + --network-policies-per-namespace $NETWORK_POLICIES_PER_NAMESPACE \ + --generate-container-network-logs ${GENERATE_CONTAINER_NETWORK_LOGS:-False} \ + --label_traffic_pods ${LABEL_TRAFFIC_PODS:-False} \ + --trigger_reason ${TRIGGER_REASON:-""} + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + RUN_URL: $(RUN_URL) + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/scale.py + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/results + REPOSITORY: ${{ parameters.engine_input.repository }} + REPOSITORY_REF: ${{ parameters.engine_input.ref }} + displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/network-scale/execute.yml b/steps/engine/clusterloader2/network-scale/execute.yml new file mode 100644 index 0000000000..99b1cea821 --- /dev/null +++ b/steps/engine/clusterloader2/network-scale/execute.yml @@ -0,0 +1,107 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: {} + - name: region + type: string + - name: install + type: boolean + default: false + - name: repository + type: string + default: "" + - name: makeargs + 
type: string + default: "" + - name: ref + type: string + default: "main" + +steps: + - script: | + echo "Set the start time for test execution" + startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + echo "Start: $startTimestamp" + echo "##vso[task.setvariable variable=START_TIME]$startTimestamp" + displayName: set up timestamp variable + + - ${{ if parameters.install }}: + - script: | + set -eo pipefail + if [ -z "$REPOSITORY" ]; then + echo "##vso[task.logissue type=error]install=true but repository parameter is empty. Please provide a valid repository URL." + exit 1 + fi + if [ -z "$MAKEARGS" ]; then + echo "##vso[task.logissue type=error]install=true but makeargs parameter is empty. Please provide make target(s) (e.g., 'helm-install')." + exit 1 + fi + env: + REPOSITORY: ${{ parameters.repository }} + MAKEARGS: ${{ parameters.makeargs }} + displayName: Validate install parameters + + - script: | + set -eo pipefail + git clone ${{ parameters.repository }} retina --no-checkout + cd retina + git fetch --depth 1 origin ${{ parameters.ref }} + git checkout FETCH_HEAD + displayName: Clone Retina OSS Repo + + - script: | + set -eo pipefail + cd retina + make ${{ parameters.makeargs }} + displayName: Install Retina + + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ + --fortio-servers-per-deployment $FORTIO_SERVERS_PER_DEPLOYMENT \ + --fortio-clients-per-deployment $FORTIO_CLIENTS_PER_DEPLOYMENT \ + --fortio-client-queries-per-second $FORTIO_CLIENT_QUERIES_PER_SECOND \ + --fortio-client-connections $FORTIO_CLIENT_CONNECTIONS \ + --fortio-namespaces $FORTIO_NAMESPACES \ + --fortio-deployments-per-namespace $FORTIO_DEPLOYMENTS_PER_NAMESPACE \ + --network-policies-per-namespace $NETWORK_POLICIES_PER_NAMESPACE \ + --generate-container-network-logs ${GENERATE_CONTAINER_NETWORK_LOGS:-False} \ + --label_traffic_pods ${LABEL_TRAFFIC_PODS:-False} \ + --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ + --cl2-image "${CL2_IMAGE}" \ + --cl2-config-dir "${CL2_CONFIG_DIR}" \ + --cl2-report-dir "${CL2_REPORT_DIR}" \ + --cl2-config-file "${CL2_CONFIG_FILE}" \ + --kubeconfig "${HOME}/.kube/config" \ + --provider "${CLOUD}" \ + --scrape-containerd ${SCRAPE_CONTAINERD:-False} + workingDirectory: modules/python + env: + ${{ if eq(parameters.cloud, 'azure') }}: + CLOUD: aks + ${{ else }}: + CLOUD: ${{ parameters.cloud }} + REGION: ${{ parameters.region }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/scale.py + CL2_IMAGE: ${{ parameters.engine_input.image }} + CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/config + CL2_CONFIG_FILE: config.yaml + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/network-scale/results + displayName: "Run Benchmark" + + - ${{ if parameters.install }}: + - script: | + set -eo pipefail + if [ -d "retina" ]; then + cd retina + make helm-uninstall + else + echo "Retina directory does not exist, skipping uninstall" + fi + condition: always() + displayName: Uninstall Helm Chart diff --git a/steps/topology/network-scale/collect-clusterloader2.yml b/steps/topology/network-scale/collect-clusterloader2.yml new file mode 100644 index 0000000000..554ed17a31 --- /dev/null +++ b/steps/topology/network-scale/collect-clusterloader2.yml @@ -0,0 +1,18 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions 
+ type: object + default: {} + +steps: +- template: /steps/set-run-id.yml +- template: /steps/engine/clusterloader2/network-scale/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/network-scale/execute-clusterloader2.yml b/steps/topology/network-scale/execute-clusterloader2.yml new file mode 100644 index 0000000000..9a72cf39e6 --- /dev/null +++ b/steps/topology/network-scale/execute-clusterloader2.yml @@ -0,0 +1,25 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/network-scale/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} + ${{ if eq(parameters.engine_input.install, true) }}: + install: true + ${{ if not(eq(parameters.engine_input.repository, '')) }}: + repository: ${{ parameters.engine_input.repository }} + ${{ if not(eq(parameters.engine_input.makeargs, '')) }}: + makeargs: ${{ parameters.engine_input.makeargs }} + ${{ if not(eq(parameters.engine_input.ref, '')) }}: + ref: ${{ parameters.engine_input.ref }} diff --git a/steps/topology/network-scale/validate-resources.yml b/steps/topology/network-scale/validate-resources.yml new file mode 100644 index 0000000000..f4fadb4298 --- /dev/null +++ b/steps/topology/network-scale/validate-resources.yml @@ -0,0 +1,47 @@ +parameters: + - name: cloud + type: string + - name: engine + type: string + - name: regions + type: object + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: slo + region: ${{ parameters.regions[0] }} + - template: /steps/engine/clusterloader2/large-cluster/validate.yml + parameters: + desired_nodes: 16 + validation_timeout_in_minutes: 60 + - script: | + set -eo pipefail + echo "Waiting for ContainerNetworkLog CRD to be available..." + timeout=300 # 5 minutes timeout + interval=10 + elapsed=0 + + # Debug: Show available acn.azure.com CRDs + echo "Checking for ACNS-related CRDs..." + kubectl get crd | grep -E "acn\.azure\.com|cilium" || echo "No ACNS/Cilium CRDs found yet" + + while ! kubectl get crd containernetworklogs.acn.azure.com &>/dev/null; do + if [ $elapsed -ge $timeout ]; then + echo "##vso[task.logissue type=error]Timeout waiting for ContainerNetworkLog CRD." + echo "" + echo "The ContainerNetworkLog CRD requires the AdvancedNetworkingFlowLogsPreview feature flag" + echo "to be registered BEFORE cluster creation." + echo "" + echo "Available CRDs:" + kubectl get crd | grep -E "acn|cilium|network" || true + exit 1 + fi + echo "ContainerNetworkLog CRD not found yet. Retrying in ${interval}s... (${elapsed}s elapsed)" + sleep $interval + elapsed=$((elapsed + interval)) + done + + echo "ContainerNetworkLog CRD is available!" + kubectl get crd containernetworklogs.acn.azure.com + displayName: "Wait for ContainerNetworkLog CRD"
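+  # Reference only (comment, not executed by this template): the preview feature named in the
+  # error message above is typically registered on the subscription before cluster creation, e.g.:
+  #   az feature register --namespace Microsoft.ContainerService --name AdvancedNetworkingFlowLogsPreview
+  #   az feature show --namespace Microsoft.ContainerService --name AdvancedNetworkingFlowLogsPreview --query properties.state
+  #   az provider register --namespace Microsoft.ContainerService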