From 5976c8c3f7eeb200a80ec94f1eb557c6311f7c13 Mon Sep 17 00:00:00 2001 From: Murali Krishnasamy Date: Tue, 18 Nov 2025 11:38:41 -0500 Subject: [PATCH] rebased and squashed all commits Signed-off-by: Murali Krishnasamy --- .../metrics-endpoint.yml | 1 + .../aro/hosted-cp-metrics.yml | 21 ++++++++++++------- .../metrics-profiles/aro/mc-metrics.yml | 10 +-------- workloads/kube-burner-ocp-wrapper/run.sh | 13 +++++++----- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/workloads/kube-burner-ocp-wrapper/metrics-endpoint.yml b/workloads/kube-burner-ocp-wrapper/metrics-endpoint.yml index 0945e0f7..7149ae12 100644 --- a/workloads/kube-burner-ocp-wrapper/metrics-endpoint.yml +++ b/workloads/kube-burner-ocp-wrapper/metrics-endpoint.yml @@ -1,4 +1,5 @@ - endpoint: {{.MC_OBO}} + {{if ne .HC_PLATFORM "aws"}}token: {{.MC_PROMETHEUS_TOKEN}}{{end}} metrics: - metrics-profiles/{{.HC_PRODUCT}}/hosted-cp-metrics.yml alerts: diff --git a/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/hosted-cp-metrics.yml b/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/hosted-cp-metrics.yml index d188b488..9761e7c0 100644 --- a/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/hosted-cp-metrics.yml +++ b/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/hosted-cp-metrics.yml @@ -4,29 +4,36 @@ # OVN service sync latency -- query: histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}", kind="service"}[2m])) by (le)) +- query: histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}", kind="service"}[2m])) by (le)) metricName: serviceSyncLatency # Etcd metrics -- query: sum(rate(etcd_server_leader_changes_seen_total{namespace=~".+{{.HCP_NAMESPACE}}"}[2m])) +- query: sum(rate(etcd_server_leader_changes_seen_total{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[2m])) metricName: etcdLeaderChangesRate -- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}"}[2m])) +- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[2m])) metricName: 99thEtcdDiskBackendCommitDurationSeconds -- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}"}[2m])) +- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[2m])) metricName: 99thEtcdDiskWalFsyncDurationSeconds -- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}"}[5m])) +- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[5m])) metricName: 99thEtcdRoundTripTimeSeconds -- query: sum by (cluster_version)(etcd_cluster_version) +- query: sum by (cluster_version)(etcd_cluster_version{cluster="{{.MC_NAME}}"}) metricName: etcdVersion instant: true # Cluster version +- query: sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster="{{.MC_NAME}}",namespace=~".*{{.HCP_NAMESPACE}}"}) by (pod,container,namespace) + metricName: podCPUReq + instant: true + +- query: sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster="{{.MC_NAME}}",namespace=~".*{{.HCP_NAMESPACE}}"}) by (pod,container,namespace) + metricName: podMemoryReq + instant: true -- query: cluster_version{type="completed", namespace=~".+{{.HCP_NAMESPACE}}"} +- query: cluster_version{cluster="{{.MC_NAME}}",type="completed", namespace=~".+{{.HCP_NAMESPACE}}"} metricName: clusterVersion instant: true diff --git a/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/mc-metrics.yml b/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/mc-metrics.yml index 7b1f47e0..50075084 100644 --- a/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/mc-metrics.yml +++ b/workloads/kube-burner-ocp-wrapper/metrics-profiles/aro/mc-metrics.yml @@ -4,7 +4,7 @@ # Management Node metrics: CPU & Memory -- query: kube_node_role{} +- query: kube_node_info{cluster="{{.MC_NAME}}"} metricName: mgmtNodeRoles - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) unless on (instance) label_replace(kube_node_role{cluster="{{.MC_NAME}}",role="infra"}, "instance", "$1", "node", "(.+)")) > 0 @@ -89,14 +89,6 @@ - query: sum(container_memory_cache{cluster="{{.MC_NAME}}",name!="",container!="POD",namespace=~".+{{.HCP_NAMESPACE}}"}) by (pod, container, namespace, node) metricName: podMemoryCache-Controlplane -- query: sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=~".+{{.HCP_NAMESPACE}}"}) by (pod,container,namespace) - metricName: podCPUReq - instant: true - -- query: sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=~".+{{.HCP_NAMESPACE}}"}) by (pod,container,namespace) - metricName: podMemoryReq - instant: true - - query: kubernetes_build_info{cluster="{{.MC_NAME}}"} metricName: mgmtClusterVersion instant: true diff --git a/workloads/kube-burner-ocp-wrapper/run.sh b/workloads/kube-burner-ocp-wrapper/run.sh index 858f0cf4..e8a9be45 100755 --- a/workloads/kube-burner-ocp-wrapper/run.sh +++ b/workloads/kube-burner-ocp-wrapper/run.sh @@ -53,10 +53,12 @@ hypershift(){ else echo "Detected ${HC_PLATFORM} environment..." - MC_NAME=$(kubectl config view -o jsonpath='{.clusters[].name}' --kubeconfig=${MC_KUBECONFIG}) - HC_NAME=$(oc get infrastructure cluster -o go-template --template='{{.status.etcdDiscoveryDomain}}' | awk -F. '{print$1}') + if [ -z "${MC_NAME}" ]; then + MC_NAME=$(kubectl config view -o jsonpath='{.clusters[].name}' --kubeconfig="${MC_KUBECONFIG}") + fi + HC_NAME=$(oc get infrastructure cluster -o go-template --template='{{.status.etcdDiscoveryDomain}}' | awk -F. '{print$2}') HCP_NAMESPACE=${HC_NAME} - QUERY="sum(kube_node_role{cluster=\"$MC_NAME\",role=\"worker\"})by(node)" + QUERY="sum(node_memory_MemTotal_bytes{cluster=\"$MC_NAME\",instance=~\".*user.*\"})by(instance)" if [[ -z ${AKS_PROM} ]] || [[ -z ${AZURE_PROM} ]] ; then echo "Azure/AKS prometheus inputs are missing, exiting.." @@ -94,7 +96,7 @@ EOF echo "Get all management worker nodes, excludes infra, obo, workload" Q_NODES="" Q_STDOUT=$(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query=${QUERY}&time='$(date +"%s")') - for n in $(echo $Q_STDOUT | jq -r '.data.result[].metric.node'); do + for n in $(echo "$Q_STDOUT" | jq -r ".data.result[].metric.$([ \"$HC_PLATFORM\" = \"aws\" ] && echo node || echo instance)"); do if [[ ${Q_NODES} == "" ]]; then Q_NODES=${n} else @@ -114,13 +116,14 @@ HOSTED_PROMETHEUS_TOKEN: HCP_NAMESPACE: ${HCP_NAMESPACE} MGMT_WORKER_NODES: ${MGMT_WORKER_NODES} HC_PRODUCT: ${HC_PRODUCT} +HC_PLATFORM: ${HC_PLATFORM} EOF if [[ ${WORKLOAD} =~ "index" ]]; then export elapsed=${ELAPSED:-20m} fi - export MC_OBO MC_PROMETHEUS MC_PROMETHEUS_TOKEN HOSTED_PROMETHEUS HOSTED_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES HC_PRODUCT MC_NAME + export MC_OBO MC_PROMETHEUS MC_PROMETHEUS_TOKEN HOSTED_PROMETHEUS HOSTED_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES HC_PRODUCT MC_NAME HC_PLATFORM }