From 240f5420bfe02fb2ec1bcd30c012f73a29b9bb6e Mon Sep 17 00:00:00 2001
From: Srikanth Muppandam <smuppand@qti.qualcomm.com>
Date: Mon, 8 Dec 2025 11:30:46 +0530
Subject: [PATCH 1/3] lib_performance: centralize systemd-analyze wait logic
 Expose wait_analyze_ready() to poll systemd-analyze time safely. Record
 systemctl list-jobs whenever boot is not yet finished. Allow configurable
 timeout and poll interval via environment variables. Prepare shared helpers
 for reuse across performance KPI test suites.

Signed-off-by: Srikanth Muppandam <smuppand@qti.qualcomm.com>
---
 Runner/utils/lib_performance.sh | 962 ++++++++++++++++++++++++++++++++
 1 file changed, 962 insertions(+)
 create mode 100755 Runner/utils/lib_performance.sh

diff --git a/Runner/utils/lib_performance.sh b/Runner/utils/lib_performance.sh
new file mode 100755
index 00000000..a27a1d91
--- /dev/null
+++ b/Runner/utils/lib_performance.sh
@@ -0,0 +1,962 @@
+#!/bin/sh
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause-Clear
+# Common performance-related helpers for KPI-style tests.
+# ---------------------------------------------------------------------------
+# Generic timestamp + escaping
+# ---------------------------------------------------------------------------
+
+nowstamp() { # compact UTC stamp (YYYYmmddTHHMMSSZ); epoch seconds as fallback
+    date -u '+%Y%m%dT%H%M%SZ' 2>/dev/null || date '+%s'
+}
+
+# JSON string escaper (used by KPI tests): escapes backslash and double-quote,
+# then tab, CR and embedded newlines, so multi-line values stay valid JSON.
+esc() {
+    printf '%s' "$1" | sed 's/\\/\\\\/g;s/"/\\"/g' | awk '
+        { gsub(/\t/, "\\\\t"); gsub(/\r/, "\\\\r"); printf("%s%s", (NR > 1 ? "\\n" : ""), $0) }'
+}
+
+# ---------------------------------------------------------------------------
+# CPU governor helpers
+# ---------------------------------------------------------------------------
+
+# Put all CPUs into performance governor, saving previous governor for restore.
+# Uses SAVED_GOV_FILE (auto set if not provided); one "<path>:<governor>" per line.
+set_performance_governor() {
+    SAVED_GOV_FILE="${SAVED_GOV_FILE:-/tmp/perf_saved_governors.$$}"
+    # Truncate with 'true', not ':' - a failed redirection on a special builtin
+    # makes POSIX shells (e.g. dash) exit before '|| return 0' can run.
+    { true >"$SAVED_GOV_FILE"; } 2>/dev/null || return 0
+
+    for c in /sys/devices/system/cpu/cpu[0-9]*; do
+        gov_file="$c/cpufreq/scaling_governor"
+        [ -f "$gov_file" ] || continue
+        cur_gov=$(cat "$gov_file" 2>/dev/null || echo "")
+        # Record current governor
+        printf '%s:%s\n' "$gov_file" "$cur_gov" >>"$SAVED_GOV_FILE" 2>/dev/null || true
+
+        # Try to set performance, but do not fail test if it does not exist
+        echo performance >"$gov_file" 2>/dev/null || true
+    done
+
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "CPU governors set to performance (saved in $SAVED_GOV_FILE)"
+    fi
+}
+
+# Restore governors recorded by set_performance_governor() and delete the
+# state file. An unset SAVED_GOV_FILE or a missing file is a no-op.
+restore_governor() {
+    [ -n "${SAVED_GOV_FILE:-}" ] || return 0
+    [ -f "$SAVED_GOV_FILE" ] || return 0
+
+    # Each record is "<scaling_governor path>:<governor name>".
+    while IFS= read -r entry; do
+        [ -n "$entry" ] || continue
+        gov_path=${entry%%:*}
+        saved_gov=${entry#*:}
+        # Skip records whose sysfs node is gone or whose governor was
+        # recorded as empty.
+        if [ -f "$gov_path" ] && [ -n "$saved_gov" ]; then
+            echo "$saved_gov" >"$gov_path" 2>/dev/null || true
+        fi
+    done <"$SAVED_GOV_FILE"
+
+    rm -f "$SAVED_GOV_FILE" 2>/dev/null || true
+
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Restored original CPU governors from saved state"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# Clocksource
+# ---------------------------------------------------------------------------
+
+# Write the kernel's current clocksource, with a timestamp, to a text file.
+# Usage: capture_clocksource [/path/to/file]   (default: ./clocksource.txt)
+capture_clocksource() {
+    out_file=${1:-./clocksource.txt}
+    cs_node=/sys/devices/system/clocksource/clocksource0/current_clocksource
+
+    # Without the sysfs node there is nothing to record; warn and stop.
+    if [ ! -r "$cs_node" ]; then
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "current_clocksource not available; skipping clocksource capture"
+        fi
+        return
+    fi
+
+    cs=$(cat "$cs_node" 2>/dev/null || echo "unknown")
+    # Best effort: an unwritable out_file is ignored, not treated as an error.
+    { echo "timestamp=$(nowstamp)"; echo "clocksource=$cs"; } \
+        >"$out_file" 2>/dev/null || true
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Clocksource: $cs → $out_file"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# Boot type tag
+# ---------------------------------------------------------------------------
+
+# Record a boot-type tag (e.g. cold/warm) with a timestamp in a text file.
+# Usage: capture_boot_type [tag] [file]
+#   tag  - defaults to "unknown" when empty
+#   file - defaults to ./boot_type.txt when empty
+capture_boot_type() {
+    tag=${1:-unknown}
+    out_file=${2:-./boot_type.txt}
+
+    # Best effort: a failed write is ignored so KPI collection can go on.
+    {
+        echo "timestamp=$(nowstamp)"
+        echo "boot_type=$tag"
+    } >"$out_file" 2>/dev/null || true
+
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Boot type tagged as '$tag' → $out_file"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# System services / “heavy” log producers
+# ---------------------------------------------------------------------------
+
+# Optionally disable and stop services that add noise to KPI runs.
+# Usage: disable_heavy_services_if_requested <disable_getty_flag> <disable_sshd_flag>
+# Flags are "1" to disable, anything else to leave alone.
+# getty flag -> serial-getty@ttyS0.service, sshd flag -> sshd.service.
+# Without systemctl nothing is changed (a warning is logged if log_warn exists).
+disable_heavy_services_if_requested() {
+    getty_flag=$1
+    sshd_flag=$2
+
+    if ! command -v systemctl >/dev/null 2>&1; then
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "systemctl not found; cannot apply getty/sshd KPI tweaks"
+        fi
+        return 0
+    fi
+
+    # Collect the units selected by the flags, getty first.
+    kpi_units=""
+    [ "$getty_flag" = "1" ] && kpi_units="serial-getty@ttyS0.service"
+    [ "$sshd_flag" = "1" ] && kpi_units="$kpi_units sshd.service"
+
+    # systemctl failures are ignored on purpose: the tweaks are best effort.
+    for unit in $kpi_units; do
+        systemctl disable "$unit" >/dev/null 2>&1 || true
+        systemctl stop "$unit" >/dev/null 2>&1 || true
+        if command -v log_info >/dev/null 2>&1; then
+            log_info "Disabled $unit for KPI run"
+        fi
+    done
+}
+
+# ---------------------------------------------------------------------------
+# Bootchart
+# ---------------------------------------------------------------------------
+
+# Check if systemd-bootchart is enabled via kernel cmdline.
+# Returns 0 if init=/lib/systemd/systemd-bootchart is present, grep's
+# non-zero status if it is not, and 1 when /proc/cmdline is unreadable.
+bootchart_enabled() {
+    [ -r /proc/cmdline ] || return 1
+
+    # -w: no match when the path continues with more word characters.
+    grep -qw 'init=/lib/systemd/systemd-bootchart' /proc/cmdline 2>/dev/null
+}
+
+# ---------------------------------------------------------------------------
+# Boot KPI helpers: systemd-analyze time parsing + UEFI loader times + networkd
+# ---------------------------------------------------------------------------
+
+# Convert a single duration token like "3.801s", "174ms", "2min" to seconds,
+# printed with three decimals. An empty token or an unrecognised unit yields
+# empty output.
+perf_time_token_to_sec() {
+  token="$1"
+  if [ -z "$token" ]; then
+    echo ""
+    return 0
+  fi
+
+  # Only the first whitespace-separated field of the token is examined.
+  printf '%s\n' "$token" | awk '
+    {
+      tok = $1
+      if (tok ~ /min/)      unit = "min"
+      else if (tok ~ /ms$/) unit = "ms"
+      else if (tok ~ /s$/)  unit = "s"
+      else { secs = 0; next }
+
+      gsub(/[^0-9.]/, "", tok)   # keep digits and dots only
+      if (tok == "") { print ""; exit }
+
+      if (unit == "min")     secs = tok * 60
+      else if (unit == "ms") secs = tok / 1000.0
+      else                   secs = tok
+    }
+    END {
+      if (secs > 0) printf("%.3f\n", secs)
+    }'
+}
+
+# Convert a segment like "2min 7.045s" or "187ms" to seconds (3 decimals).
+# Only whole tokens of the form <number><unit>, unit = h|min|s|ms|us, are
+# summed; any other word (e.g. a unit name such as "admin2.service" taken
+# from a blame line) is ignored instead of being mistaken for a duration.
+perf_time_segment_to_sec() {
+  seg="$1"
+  [ -n "$seg" ] || { echo ""; return 0; }
+
+  printf '%s\n' "$seg" | awk '
+    {
+      sec = 0
+      for (i = 1; i <= NF; i++) {
+        v = $i
+        if (v !~ /^[0-9.]+(h|min|s|ms|us)$/) continue
+        unit = v
+        sub(/^[0-9.]+/, "", unit)
+        sub(/[a-z]+$/, "", v)
+        if (unit == "h")        sec += v * 3600
+        else if (unit == "min") sec += v * 60
+        else if (unit == "s")   sec += v
+        else if (unit == "ms")  sec += v / 1000.0
+        else                    sec += v / 1000000.0
+      }
+    }
+    END {
+      if (sec > 0) printf("%.3f\n", sec)
+    }'
+}
+
+# Read UEFI loader times from efivars (if present).
+# systemd-boot stores each value as a decimal microsecond count in a
+# NUL-terminated UTF-16 string, and efivarfs prepends 4 attribute bytes, so
+# the payload is decoded as text instead of being read as a raw 64-bit integer.
+# PERF_EFIVARS_DIR may override the efivarfs directory.
+# Sets and exports (left empty when the variables are missing or unparsable):
+# PERF_UEFI_INIT_SEC, PERF_UEFI_EXEC_SEC, PERF_UEFI_TOTAL_SEC
+perf_read_uefi_loader_times() {
+  base="${PERF_EFIVARS_DIR:-/sys/firmware/efi/efivars}"
+  init_var="$base/LoaderTimeInitUSec-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f"
+  exec_var="$base/LoaderTimeExecUSec-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f"
+
+  PERF_UEFI_INIT_SEC=""
+  PERF_UEFI_EXEC_SEC=""
+  PERF_UEFI_TOTAL_SEC=""
+  export PERF_UEFI_INIT_SEC PERF_UEFI_EXEC_SEC PERF_UEFI_TOTAL_SEC
+
+  if [ ! -r "$init_var" ] || [ ! -r "$exec_var" ]; then return 0; fi
+
+  # Skip the attribute header, then keep only the ASCII digits.
+  init_us=$(tail -c +5 "$init_var" 2>/dev/null | tr -cd '0-9')
+  exec_us=$(tail -c +5 "$exec_var" 2>/dev/null | tr -cd '0-9')
+  if [ -z "$init_us" ] || [ -z "$exec_us" ]; then return 0; fi
+
+  PERF_UEFI_INIT_SEC=$(awk -v us="$init_us" 'BEGIN { printf("%.3f", us / 1000000) }')
+  PERF_UEFI_EXEC_SEC=$(awk -v us="$exec_us" 'BEGIN { printf("%.3f", us / 1000000) }')
+  PERF_UEFI_TOTAL_SEC=$(awk -v a="$init_us" -v b="$exec_us" \
+    'BEGIN { printf("%.3f", (a + b) / 1000000) }')
+}
+
+# Print the raw duration text of one phase of a "Startup finished in" line.
+# $1 = the line, $2 = phase label (firmware, loader, kernel, userspace, ...)
+# or "total" for the text after " = ". Prints nothing if the phase is absent.
+_perf_boot_phase_seg() {
+  printf '%s\n' "$1" | awk -v want="$2" '
+    {
+      body = $0
+      sub(/^.*Startup finished in /, "", body)
+      total = ""
+      if (match(body, / = /)) {
+        total = substr(body, RSTART + 3)
+        body = substr(body, 1, RSTART - 1)
+      }
+      if (want == "total") { print total; exit }
+
+      n = split(body, part, / \+ /)
+      for (i = 1; i <= n; i++) {
+        if (!match(part[i], / \([a-z]+\) *$/)) continue
+        label = substr(part[i], RSTART + 2)
+        sub(/\) *$/, "", label)
+        if (label == want) { print substr(part[i], 1, RSTART - 1); exit }
+      }
+    }'
+}
+
+# Print the time columns of the first blame line whose last field equals the
+# unit name literally (no regex). $1 = blame file, $2 = unit name.
+_perf_blame_seg() {
+  _perf_unit="$2" awk '$NF == ENVIRON["_perf_unit"] { $NF = ""; print; exit }' "$1" 2>/dev/null
+}
+
+# Parse systemd-analyze time output + blame, and optionally exclude
+# systemd-networkd-wait-online.service from userspace/total and an
+# arbitrary list of services given via CLI.
+#
+# perf_parse_boot_times <analyze_time.txt> <blame.txt> <exclude_networkd_flag> <exclude_services_list>
+#   analyze_time.txt       file holding the "Startup finished in ..." line
+#   blame.txt              file with "<time> <unit>" lines (may be missing)
+#   exclude_networkd_flag  "1" subtracts systemd-networkd-wait-online.service
+#   exclude_services_list  comma or space separated unit names to subtract
+#
+# The "Startup finished in ..." line is split on " + " and each phase is
+# looked up by its "(name)" label, so lines without firmware/loader (e.g.
+# "3.8s (kernel) + 12.3s (userspace) = 16.1s") and multi-token durations
+# such as "1min 2.3s (kernel)" are handled. A phase that is not on the line
+# leaves its variable empty.
+#
+# Sets (and exports):
+# The *_SEC values are seconds with three decimals; empty means not available.
+#   PERF_FIRMWARE_SEC
+#   PERF_LOADER_SEC
+#   PERF_KERNEL_SEC
+#   PERF_USERSPACE_SEC
+#   PERF_TOTAL_SEC
+#   PERF_NETWORKD_WAIT_ONLINE_SEC
+#   PERF_EXCLUDED_SERVICES_LIST
+#   PERF_EXCLUDED_SERVICES_SEC
+#   PERF_EXCLUDED_TOTAL_SEC          (networkd + other excluded services)
+#   PERF_USERSPACE_EFFECTIVE_SEC
+#   PERF_TOTAL_EFFECTIVE_SEC
+perf_parse_boot_times() {
+  at_file="$1"
+  blame_file="$2"
+  exclude_networkd="$3"
+  exclude_services_raw="$4"
+
+  PERF_FIRMWARE_SEC=""
+  PERF_LOADER_SEC=""
+  PERF_KERNEL_SEC=""
+  PERF_USERSPACE_SEC=""
+  PERF_TOTAL_SEC=""
+  PERF_NETWORKD_WAIT_ONLINE_SEC=""
+  PERF_EXCLUDED_SERVICES_LIST=""
+  PERF_EXCLUDED_SERVICES_SEC=""
+  PERF_EXCLUDED_TOTAL_SEC=""
+  PERF_USERSPACE_EFFECTIVE_SEC=""
+  PERF_TOTAL_EFFECTIVE_SEC=""
+  # Export once up front; later assignments keep the export attribute, so
+  # every early return below still leaves a consistent (empty) result set.
+  export PERF_FIRMWARE_SEC PERF_LOADER_SEC PERF_KERNEL_SEC PERF_USERSPACE_SEC PERF_TOTAL_SEC \
+         PERF_NETWORKD_WAIT_ONLINE_SEC PERF_EXCLUDED_SERVICES_LIST PERF_EXCLUDED_SERVICES_SEC \
+         PERF_EXCLUDED_TOTAL_SEC PERF_USERSPACE_EFFECTIVE_SEC PERF_TOTAL_EFFECTIVE_SEC
+
+  [ -f "$at_file" ] || return 0
+  line=$(grep -m1 'Startup finished in' "$at_file" 2>/dev/null || true)
+  [ -n "$line" ] || return 0
+
+  PERF_FIRMWARE_SEC=$(perf_time_segment_to_sec "$(_perf_boot_phase_seg "$line" firmware)")
+  PERF_LOADER_SEC=$(perf_time_segment_to_sec "$(_perf_boot_phase_seg "$line" loader)")
+  PERF_KERNEL_SEC=$(perf_time_segment_to_sec "$(_perf_boot_phase_seg "$line" kernel)")
+  PERF_USERSPACE_SEC=$(perf_time_segment_to_sec "$(_perf_boot_phase_seg "$line" userspace)")
+  PERF_TOTAL_SEC=$(perf_time_segment_to_sec "$(_perf_boot_phase_seg "$line" total)")
+
+  # --- systemd-networkd-wait-online.service contribution ---
+  if [ "$exclude_networkd" = "1" ] && [ -f "$blame_file" ]; then
+    PERF_NETWORKD_WAIT_ONLINE_SEC=$(perf_time_segment_to_sec \
+      "$(_perf_blame_seg "$blame_file" systemd-networkd-wait-online.service)")
+  fi
+
+  # --- Generic exclude-services list (comma or space separated) ---
+  # We only look in blame_file, summing first match for each service.
+  if [ -n "$exclude_services_raw" ] && [ -f "$blame_file" ]; then
+    services=$(printf '%s\n' "$exclude_services_raw" | tr ',' ' ')
+    for svc in $services; do
+      # Avoid double-counting networkd if user also passed it in the list.
+      if [ "$exclude_networkd" = "1" ] && [ "$svc" = "systemd-networkd-wait-online.service" ]; then
+        continue
+      fi
+
+      # Unit names are compared literally with the last blame column.
+      sec_svc=$(perf_time_segment_to_sec "$(_perf_blame_seg "$blame_file" "$svc")")
+      [ -n "$sec_svc" ] || continue
+
+      PERF_EXCLUDED_SERVICES_LIST="${PERF_EXCLUDED_SERVICES_LIST:+$PERF_EXCLUDED_SERVICES_LIST,}$svc"
+      PERF_EXCLUDED_SERVICES_SEC=$(printf '%s %s\n' "${PERF_EXCLUDED_SERVICES_SEC:-0}" "$sec_svc" \
+        | awk '{printf("%.3f", $1 + $2)}')
+    done
+  fi
+
+  # --- Aggregate excluded total (networkd + generic services) ---
+  if [ -n "$PERF_NETWORKD_WAIT_ONLINE_SEC" ] || [ -n "$PERF_EXCLUDED_SERVICES_SEC" ]; then
+    PERF_EXCLUDED_TOTAL_SEC=$(printf '%s %s\n' \
+      "${PERF_NETWORKD_WAIT_ONLINE_SEC:-0}" "${PERF_EXCLUDED_SERVICES_SEC:-0}" \
+      | awk '{printf("%.3f", $1 + $2)}')
+  fi
+
+  # Kept for callers that read the pre-existing helper globals.
+  EX_SVC_LIST="$PERF_EXCLUDED_SERVICES_LIST"
+  EX_SVC_TOTAL_SEC="$PERF_EXCLUDED_SERVICES_SEC"
+  EXCL_TOTAL="$PERF_EXCLUDED_TOTAL_SEC"
+
+  # Effective = measured minus excluded time, clamped at zero.
+  PERF_USERSPACE_EFFECTIVE_SEC="$PERF_USERSPACE_SEC"
+  PERF_TOTAL_EFFECTIVE_SEC="$PERF_TOTAL_SEC"
+  if [ -n "$EXCL_TOTAL" ] && [ -n "$PERF_USERSPACE_SEC" ] && [ -n "$PERF_TOTAL_SEC" ]; then
+    PERF_USERSPACE_EFFECTIVE_SEC=$(printf '%s %s\n' "$PERF_USERSPACE_SEC" "$EXCL_TOTAL" \
+      | awk '{d = $1 - $2; if (d < 0) d = 0; printf("%.3f\n", d)}')
+    PERF_TOTAL_EFFECTIVE_SEC=$(printf '%s %s\n' "$PERF_TOTAL_SEC" "$EXCL_TOTAL" \
+      | awk '{d = $1 - $2; if (d < 0) d = 0; printf("%.3f\n", d)}')
+  fi
+
+  return 0
+}
+
+# ---------------------------------------------------------------------------
+# Boot-complete detection (multi-user.target)
+# ---------------------------------------------------------------------------
+
+# Wait for multi-user.target up to <timeout> seconds, polling once a second.
+# Usage: wait_for_multi_user_target <timeout_seconds>
+# An empty or non-numeric timeout counts as 0 (one check, no waiting).
+wait_for_multi_user_target() {
+    timeout="$1"
+    # Sanitise first: "[ 0 -lt '' ]" would only print a shell error.
+    case "$timeout" in '' | *[!0-9]*) timeout=0 ;; esac
+
+    if ! command -v systemctl >/dev/null 2>&1; then
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "systemctl not found; cannot verify multi-user.target boot-complete state"
+        fi
+        return 0
+    fi
+
+    i=0
+    while [ "$i" -lt "$timeout" ]; do
+        if systemctl is-active --quiet multi-user.target; then
+            if command -v log_info >/dev/null 2>&1; then
+                log_info "Boot complete: multi-user.target is active"
+            fi
+            return 0
+        fi
+        sleep 1
+        i=$((i+1))
+    done
+    if systemctl is-active --quiet multi-user.target; then
+        if command -v log_info >/dev/null 2>&1; then
+            log_info "Boot complete: multi-user.target became active after timeout window"
+        fi
+    elif command -v log_warn >/dev/null 2>&1; then
+        log_warn "multi-user.target not active after ${timeout}s; continuing KPI collection anyway"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# Boot KPI loop helpers: state + systemd hook + KPI CSV / averages
+# ---------------------------------------------------------------------------
+
+# Escape \, ", $ and ` for safe use inside a double-quoted, sourced string.
+_perf_kpi_escape_dq() {
+    printf '%s' "$1" | sed 's/[\\"$`]/\\&/g'
+}
+
+# Write/refresh the KPI loop state file as shell KEY=value lines.
+# Usage:
+# perf_kpi_write_loop_state STATE_FILE ITER_TOTAL ITER_DONE BOOT_TYPE \
+# DISABLE_GETTY DISABLE_SSHD EXCLUDE_NETWORKD EXCLUDE_SERVICES \
+# KPI_SCRIPT KPI_OUT_DIR
+# Text fields are escaped and double-quoted; counters and flags are written bare.
+perf_kpi_write_loop_state() {
+    state_file=$1
+    iter_total=$2
+    iter_done=$3
+    esc_boot_type=$(_perf_kpi_escape_dq "$4")
+    getty_flag=$5
+    sshd_flag=$6
+    networkd_flag=$7
+    esc_services=$(_perf_kpi_escape_dq "$8")
+    esc_script=$(_perf_kpi_escape_dq "$9")
+    esc_out_dir=$(_perf_kpi_escape_dq "${10}")
+
+    mkdir -p "$(dirname "$state_file")" 2>/dev/null || true
+
+    {
+        echo "KPI_LOOP_ITERATIONS_TOTAL=$iter_total"
+        echo "KPI_LOOP_ITERATIONS_DONE=$iter_done"
+        echo "KPI_LOOP_BOOT_TYPE=\"$esc_boot_type\""
+        echo "KPI_LOOP_DISABLE_GETTY=$getty_flag"
+        echo "KPI_LOOP_DISABLE_SSHD=$sshd_flag"
+        echo "KPI_LOOP_EXCLUDE_NETWORKD=$networkd_flag"
+        echo "KPI_LOOP_EXCLUDE_SERVICES=\"$esc_services\""
+        echo "KPI_LOOP_KPI_SCRIPT=\"$esc_script\""
+        echo "KPI_LOOP_KPI_OUT_DIR=\"$esc_out_dir\""
+    } >"$state_file" 2>/dev/null || true
+
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "KPI loop state written to $state_file (done=$iter_done, total=$iter_total)"
+    fi
+}
+
+# Load KPI loop state by sourcing STATE_FILE; exports KPI_LOOP_* vars if present.
+# Returns 0 on success, 1 on missing file.
+perf_kpi_load_loop_state() {
+    state_file=$1
+    [ -f "$state_file" ] || return 1
+
+    # "." resolves a slash-less name via PATH (POSIX), not the current
+    # directory, so bare file names get an explicit ./ prefix.
+    case "$state_file" in */*) state_path=$state_file ;; *) state_path=./$state_file ;; esac
+    # shellcheck disable=SC1090
+    . "$state_path"
+
+    export KPI_LOOP_ITERATIONS_TOTAL KPI_LOOP_ITERATIONS_DONE KPI_LOOP_BOOT_TYPE \
+           KPI_LOOP_DISABLE_GETTY KPI_LOOP_DISABLE_SSHD KPI_LOOP_EXCLUDE_NETWORKD \
+           KPI_LOOP_EXCLUDE_SERVICES KPI_LOOP_KPI_SCRIPT KPI_LOOP_KPI_OUT_DIR
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Loaded KPI loop state from $state_file (done=${KPI_LOOP_ITERATIONS_DONE:-0}, total=${KPI_LOOP_ITERATIONS_TOTAL:-1})"
+    fi
+    return 0
+}
+
+# Install a systemd service + timer pair that runs the KPI loop script at each
+# boot; the timer (OnBootSec=30s) starts the oneshot service.
+# Usage:
+# perf_install_kpi_systemd_hook /full/path/to/run.sh service_name
+#   $1 = full path to KPI loop script, $2 = service base name (required)
+# Returns 1 when an argument is missing or a unit file cannot be written,
+# otherwise 0 (systemctl reload/enable failures are ignored).
+perf_install_kpi_systemd_hook() {
+    kpi_script=$1
+    svc_name=$2
+
+    if [ -z "$kpi_script" ] || [ -z "$svc_name" ]; then
+        log_error "perf_install_kpi_systemd_hook: missing script or service name"
+        return 1
+    fi
+
+    # Normalise service name (strip accidental .service/.timer)
+    case "$svc_name" in
+        *.service) svc_name=${svc_name%.service} ;;
+        *.timer)   svc_name=${svc_name%.timer} ;;
+    esac
+
+    # Resolve script dir; do NOT assume /var – use the real path
+    script_dir=$(dirname "$kpi_script")
+    unit_dir=/etc/systemd/system
+    service_unit="$unit_dir/$svc_name.service"
+    timer_unit="$unit_dir/$svc_name.timer"
+
+    log_info "Installing KPI loop systemd units: $service_unit + $timer_unit"
+
+    # Service: just runs the script once; not bound to WantedBy targets directly
+    cat >"$service_unit" <<EOF || { log_error "perf_install_kpi_systemd_hook: cannot write $service_unit"; return 1; }
+[Unit]
+Description=Perf KPI auto-reboot loop
+After=multi-user.target
+Wants=multi-user.target
+ 
+[Service]
+Type=oneshot
+ExecStart=$kpi_script
+WorkingDirectory=$script_dir
+User=root
+ 
+# The script's own state file controls:
+#  - whether AUTO_REBOOT is active
+#  - when to stop the loop and remove hooks
+EOF
+
+    # Timer: fires some seconds after boot, outside the boot transaction
+    cat >"$timer_unit" <<EOF || { log_error "perf_install_kpi_systemd_hook: cannot write $timer_unit"; return 1; }
+[Unit]
+Description=Run Perf KPI auto-reboot loop after boot has settled
+ 
+[Timer]
+OnBootSec=30s
+Unit=$svc_name.service
+Persistent=true
+ 
+[Install]
+WantedBy=timers.target
+EOF
+
+    # Reload + enable timer (not the service directly)
+    if command -v systemctl >/dev/null 2>&1; then
+        systemctl daemon-reload || true
+        systemctl enable --now "$svc_name.timer" || true
+    else
+        log_warn "systemctl not found, KPI loop units created but not enabled"
+    fi
+
+    return 0
+}
+
+# Remove the KPI loop systemd hook: disable timer and service, delete both
+# unit files under /etc/systemd/system, then reload the systemd daemon.
+# Usage:
+# perf_remove_kpi_systemd_hook service_name
+#   $1 = service base name (e.g. boot-systemd-kpi-loop or boot-systemd-kpi-loop.service)
+# Returns 1 when the name is missing, otherwise 0.
+perf_remove_kpi_systemd_hook() {
+    svc_name=$1
+    if [ -z "$svc_name" ]; then
+        log_error "perf_remove_kpi_systemd_hook: missing service name"
+        return 1
+    fi
+
+    # Accept "name", "name.service" or "name.timer".
+    case "$svc_name" in
+        *.service) svc_name=${svc_name%.service} ;;
+        *.timer)   svc_name=${svc_name%.timer} ;;
+    esac
+
+    service_unit="/etc/systemd/system/$svc_name.service"
+    timer_unit="/etc/systemd/system/$svc_name.timer"
+    log_info "Removing KPI loop systemd units: $service_unit + $timer_unit"
+
+    have_systemctl=0
+    command -v systemctl >/dev/null 2>&1 && have_systemctl=1
+
+    if [ "$have_systemctl" -eq 1 ]; then
+        systemctl disable --now "$svc_name.timer" 2>/dev/null || true
+        # Service is oneshot; usually inactive, but disable anyway in case it was enabled manually
+        systemctl disable "$svc_name.service" 2>/dev/null || true
+    fi
+    rm -f "$timer_unit" "$service_unit" 2>/dev/null || true
+
+    # Reload only after the unit files have been removed.
+    [ "$have_systemctl" -eq 1 ] && { systemctl daemon-reload || true; }
+    return 0
+}
+
+# Wait for systemd-analyze time to report a finished boot
+# $1 = analyze_time.txt path (receives stdout+stderr of every attempt)
+# $2 = list_jobs_when_boot_unfinished.txt path
+# $3 = max wait seconds (optional, default 180)
+# $4 = poll interval seconds (optional, default 5)
+# Returns 0 = boot finished, 1 = still unfinished after max wait,
+#         2 = systemd-analyze failed for another reason.
+# Empty, zero (interval only) or non-numeric values fall back to the defaults.
+wait_analyze_ready() {
+    out_file=$1
+    jobs_file=$2
+    max_wait=${3:-180}
+    interval=${4:-5}
+
+    # Non-numeric values would break the integer test/arithmetic below.
+    case "$max_wait" in *[!0-9]*) max_wait=180 ;; esac
+    case "$interval" in 0 | *[!0-9]*) interval=5 ;; esac
+
+    elapsed=0
+    while :; do
+        rc=0
+        systemd-analyze time >"$out_file" 2>&1 || rc=$?
+
+        # While boot is unfinished systemd-analyze prints this message and
+        # may exit non-zero, so check the text before trusting the status.
+        if grep -q "Bootup is not yet finished" "$out_file" 2>/dev/null; then
+            log_warn "systemd-analyze: boot not finished yet (elapsed=${elapsed}s); capturing systemctl list-jobs → $jobs_file"
+            systemctl list-jobs >"$jobs_file" 2>&1 || true
+            if [ "$elapsed" -ge "$max_wait" ]; then
+                log_warn "systemd-analyze: boot STILL not finished after ${elapsed}s; keeping analyze_time.txt as-is (KPI times may be 'unknown')."
+                # We return 1: caller treats KPI as degraded but does not fail test.
+                return 1
+            fi
+        elif [ "$rc" -eq 0 ]; then
+            log_info "systemd-analyze: boot finished; analyze_time.txt captured after ${elapsed}s."
+            return 0
+        else
+            log_warn "systemd-analyze time failed with rc=$rc; see $out_file for details."
+            # Do not retry endlessly on hard failure, just return special code
+            return 2
+        fi
+
+        # Boot is not done yet, but we still have budget; sleep and retry
+        sleep "$interval" || break
+        elapsed=$((elapsed + interval))
+    done
+
+    # If we somehow break the loop without a clear result, treat as not finished
+    log_warn "systemd-analyze: exited wait loop without finished-boot output; see $out_file / $jobs_file."
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# KPI file parsing + CSV append + averaging
+# ---------------------------------------------------------------------------
+
+# Print the value of the first " key : value" line in a KPI text file.
+# e.g. " boot_type : cold" → "cold". Prints nothing if the key or file is absent.
+# Note: key is spliced into a sed pattern, so regex metacharacters stay active.
+kpi_get_line_val() {
+    kpi_key=$1; kpi_file=$2
+    sed -n -e "s/^ ${kpi_key} : //p" "$kpi_file" 2>/dev/null | head -n 1
+}
+
+# Print the first whitespace-separated token of a KPI value (usually a number).
+# e.g. " boot_total_sec : 137.008" → "137.008"
+# " uefi_time_sec : 438093.283 (Init=..., Exec=...)" → "438093.283"
+# Usage: kpi_get_num_from_line KEY FILE
+# The value itself is looked up with kpi_get_line_val.
+kpi_get_num_from_line() {
+    kpi_val=$(kpi_get_line_val "$1" "$2")
+    printf '%s\n' "$kpi_val" | awk '{ print $1 }'
+}
+
+# Extract metrics from boot_kpi_this_run.txt into exported PERF_KPI_* vars.
+# Usage: perf_kpi_extract_from_file /path/to/boot_kpi_this_run.txt
+# Text fields keep the whole value; *_SEC fields keep only its first token.
+perf_kpi_extract_from_file() {
+    kpi_src=$1
+
+    export PERF_KPI_BOOT_TYPE PERF_KPI_ITERATIONS_HINT PERF_KPI_CLOCKSOURCE \
+           PERF_KPI_UEFI_TIME_SEC PERF_KPI_FIRMWARE_SEC PERF_KPI_BOOTLOADER_SEC \
+           PERF_KPI_KERNEL_SEC PERF_KPI_USERSPACE_SEC PERF_KPI_USERSPACE_EFFECTIVE_SEC \
+           PERF_KPI_BOOT_TOTAL_SEC PERF_KPI_BOOT_TOTAL_EFFECTIVE_SEC
+
+    PERF_KPI_BOOT_TYPE=$(kpi_get_line_val "boot_type" "$kpi_src")
+    PERF_KPI_ITERATIONS_HINT=$(kpi_get_line_val "iterations" "$kpi_src")
+    PERF_KPI_CLOCKSOURCE=$(kpi_get_line_val "clocksource" "$kpi_src")
+    PERF_KPI_UEFI_TIME_SEC=$(kpi_get_num_from_line "uefi_time_sec" "$kpi_src")
+    PERF_KPI_FIRMWARE_SEC=$(kpi_get_num_from_line "firmware_time_sec" "$kpi_src")
+    PERF_KPI_BOOTLOADER_SEC=$(kpi_get_num_from_line "bootloader_time_sec" "$kpi_src")
+    PERF_KPI_KERNEL_SEC=$(kpi_get_num_from_line "kernel_time_sec" "$kpi_src")
+    PERF_KPI_USERSPACE_SEC=$(kpi_get_num_from_line "userspace_time_sec" "$kpi_src")
+    PERF_KPI_USERSPACE_EFFECTIVE_SEC=$(kpi_get_num_from_line "userspace_effective_time_sec" "$kpi_src")
+    PERF_KPI_BOOT_TOTAL_SEC=$(kpi_get_num_from_line "boot_total_sec" "$kpi_src")
+    PERF_KPI_BOOT_TOTAL_EFFECTIVE_SEC=$(kpi_get_num_from_line "boot_total_effective_sec" "$kpi_src")
+}
+
+# Append one CSV row built from the PERF_KPI_* variables; the header line is
+# written first when CSV_PATH does not exist yet.
+# Usage:
+# perf_kpi_append_csv_row CSV_PATH [boot_type_override]
+# A non-empty override replaces PERF_KPI_BOOT_TYPE in the boot_type column.
+perf_kpi_append_csv_row() {
+    csv=$1
+    bt=${2:-$PERF_KPI_BOOT_TYPE}
+
+    [ -f "$csv" ] || echo "timestamp,boot_type,iterations_hint,clocksource,uefi_time_sec,firmware_time_sec,bootloader_time_sec,kernel_time_sec,userspace_time_sec,userspace_effective_time_sec,boot_total_sec,boot_total_effective_sec" >"$csv"
+
+    # Fields are written verbatim: no CSV quoting or escaping is applied.
+    row="$(nowstamp),$bt,$PERF_KPI_ITERATIONS_HINT,$PERF_KPI_CLOCKSOURCE"
+    row="$row,$PERF_KPI_UEFI_TIME_SEC,$PERF_KPI_FIRMWARE_SEC,$PERF_KPI_BOOTLOADER_SEC"
+    row="$row,$PERF_KPI_KERNEL_SEC,$PERF_KPI_USERSPACE_SEC,$PERF_KPI_USERSPACE_EFFECTIVE_SEC"
+    row="$row,$PERF_KPI_BOOT_TOTAL_SEC,$PERF_KPI_BOOT_TOTAL_EFFECTIVE_SEC"
+    echo "$row" >>"$csv" 2>/dev/null || true
+
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Appended KPI row to $csv (boot_type=$bt, total_sec=${PERF_KPI_BOOT_TOTAL_SEC:-unknown}, total_eff_sec=${PERF_KPI_BOOT_TOTAL_EFFECTIVE_SEC:-unknown})"
+    fi
+}
+
+# Compute averages for last N rows of a given boot_type into summary_file.
+# Usage:
+# perf_kpi_compute_average CSV_PATH BOOT_TYPE WINDOW SUMMARY_FILE
+#   CSV_PATH     - CSV with a header on line 1 and boot_type in column 2
+#   BOOT_TYPE    - rows are used when column 2 equals this value exactly
+#   WINDOW       - number of most recent matching rows to average
+#   SUMMARY_FILE - overwritten with " key : value" lines
+# Columns 5-12 are averaged independently: a cell counts only when it matches
+# ^[0-9.]+$, and a metric without any usable cell is left out of the summary.
+# Returns 1 when the CSV is missing, no row matches, or awk fails (for
+# example because SUMMARY_FILE cannot be written); 0 otherwise.
+perf_kpi_compute_average() {
+    csv=$1
+    bt=$2
+    window=$3
+    summary_file=$4
+
+    if [ ! -f "$csv" ]; then
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "perf_kpi_compute_average: CSV not found: $csv"
+        fi
+        return 1
+    fi
+
+    tmp_filtered="${csv}.filtered.$$"
+    tmp_last="${csv}.last.$$"
+
+    # Drop the header and keep only rows of the requested boot type ...
+    awk -F',' -v bt="$bt" 'NR > 1 && $2 == bt' "$csv" >"$tmp_filtered" 2>/dev/null || true
+    # ... then the newest WINDOW of them.
+    tail -n "$window" "$tmp_filtered" >"$tmp_last" 2>/dev/null || true
+    rm -f "$tmp_filtered" 2>/dev/null || true
+
+    if [ ! -s "$tmp_last" ]; then
+        rm -f "$tmp_last" 2>/dev/null || true
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "perf_kpi_compute_average: no entries for boot_type=$bt"
+        fi
+        return 1
+    fi
+
+    # All output goes through one ">" stream: awk opens the file once and
+    # keeps writing to it. Mixing ">" and ">>" on the same file name is not
+    # portable and may behave differently across awk implementations.
+    # The awk exit status is kept so a failed write is reported, not ignored.
+    avg_rc=0
+    awk -F',' -v bt="$bt" -v target="$window" '
+      BEGIN {
+        # Summary key stems, in CSV column order starting at column 5.
+        nmetrics = split("uefi_time firmware_time bootloader_time kernel_time userspace_time userspace_effective_time boot_total boot_total_effective", metric, " ")
+      }
+      {
+        n++
+        for (m = 1; m <= nmetrics; m++) {
+          cell = $(m + 4)
+          if (cell ~ /^[0-9.]+$/) { sum[m] += cell; cnt[m]++ }
+        }
+      }
+      END {
+        if (n == 0) { exit 0 }
+
+        out = summary_file
+        printf("Boot KPI summary (last %d %s boot(s))\n", n, bt) > out
+        printf(" entries_used : %d\n", n) > out
+        printf(" target_iterations : %d\n", target) > out
+        printf(" boot_type : %s\n", bt) > out
+
+        for (m = 1; m <= nmetrics; m++) {
+          if (cnt[m] > 0)
+            printf(" avg_%s_sec : %.3f\n", metric[m], sum[m] / cnt[m]) > out
+        }
+        close(out)
+      }
+    ' summary_file="$summary_file" "$tmp_last" || avg_rc=$?
+
+    rm -f "$tmp_last" 2>/dev/null || true
+
+    if [ "$avg_rc" -ne 0 ]; then
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "perf_kpi_compute_average: failed to write $summary_file"
+        fi
+        return 1
+    fi
+
+    if [ -f "$summary_file" ] && command -v log_info >/dev/null 2>&1; then
+        log_info "perf_kpi_compute_average: summary written to $summary_file"
+    fi
+    return 0
+}
+
+# ---------------------------------------------------------------------------
+# Boot identity + reboot tracking helpers for KPI loops
+# ---------------------------------------------------------------------------
+
+# Capture and export the identity of the current boot:
+# - PERF_KPI_BOOT_ID = kernel boot_id (or "unknown")
+# - PERF_KPI_UPTIME_SEC = uptime in seconds (float, or empty)
+perf_kpi_get_boot_identity() {
+    boot_id_node=/proc/sys/kernel/random/boot_id
+
+    PERF_KPI_BOOT_ID="unknown"
+    if [ -r "$boot_id_node" ]; then
+        PERF_KPI_BOOT_ID=$(cat "$boot_id_node" 2>/dev/null || echo "unknown")
+    fi
+
+    PERF_KPI_UPTIME_SEC=""
+    if [ -r /proc/uptime ]; then
+        PERF_KPI_UPTIME_SEC=$(awk '{printf("%.3f\n", $1)}' /proc/uptime 2>/dev/null || echo "")
+    fi
+    export PERF_KPI_BOOT_ID PERF_KPI_UPTIME_SEC
+}
+
+# State file layout:
+# boot_id=...
+# uptime_sec=...
+# pending_reboot=0|1
+# iterations_done=N
+perf_kpi_reboot_state_load() {
+    state_file=$1
+
+    PERF_KPI_STATE_BOOT_ID=""
+    PERF_KPI_STATE_UPTIME=""
+    PERF_KPI_STATE_PENDING="0"
+    PERF_KPI_STATE_ITER_DONE=""
+
+    if [ -f "$state_file" ]; then
+        while IFS='=' read -r k v; do
+            case "$k" in
+                boot_id) PERF_KPI_STATE_BOOT_ID=$v ;;
+                uptime_sec) PERF_KPI_STATE_UPTIME=$v ;;
+                pending_reboot) PERF_KPI_STATE_PENDING=$v ;;
+                iterations_done) PERF_KPI_STATE_ITER_DONE=$v ;;
+            esac
+        done <"$state_file"
+    fi
+
+    export PERF_KPI_STATE_BOOT_ID PERF_KPI_STATE_UPTIME \
+           PERF_KPI_STATE_PENDING PERF_KPI_STATE_ITER_DONE
+}
+
+perf_kpi_reboot_state_save() { # $1=state file, $2=boot_id, $3=uptime, $4=pending (0|1), $5=iterations done
+    state_file=$1
+    boot_id=$2
+    uptime=$3
+    pending=$4
+    iter_done=$5
+    state_tmp="$state_file.tmp.$$" # sibling of the state file, so the mv below is a same-directory rename
+    { # write everything to the temp file first; the real file is only ever replaced whole (best-effort, always returns 0)
+        echo "boot_id=$boot_id"
+        echo "uptime_sec=$uptime"
+        echo "pending_reboot=$pending"
+        echo "iterations_done=$iter_done"
+    } >"$state_tmp" 2>/dev/null && mv -f "$state_tmp" "$state_file" 2>/dev/null || rm -f "$state_tmp" 2>/dev/null || true
+}
+
+# Low-level "request reboot" helper.
+# Does *not* manage any state, just tries hard to reboot.
+perf_kpi_request_reboot() {
+    msg=$1
+
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Requesting reboot: $msg"
+    fi
+
+    # Flush filesystem buffers before asking the system to go down.
+    sync || true
+
+    # Two identical attempts, 5 seconds apart: if we are still alive after the
+    # first request, the second pass is the "try once more" retry.
+    for _perf_reboot_try in first retry; do
+        # Preference order: systemctl reboot -> reboot -> shutdown -r now.
+        # The trailing ':' keeps the chain's status at 0 even if all of them fail.
+        if command -v systemctl >/dev/null 2>&1; then
+            systemctl reboot || reboot || shutdown -r now || :
+        else
+            reboot || shutdown -r now || :
+        fi
+        if [ "$_perf_reboot_try" = first ]; then sleep 5; fi
+    done
+}
+
+# At the **start** of KPI loop:
+# - Detect whether a previous reboot request actually produced a new boot.
+# - If not, immediately re-issue reboot.
+# - Logs uptimes for debugging.
+perf_kpi_check_previous_reboot() {
+    state_file=$1
+
+    perf_kpi_reboot_state_load "$state_file"
+    perf_kpi_get_boot_identity
+
+    # Nothing pending → nothing to do.
+    if [ "$PERF_KPI_STATE_PENDING" != "1" ] || [ -z "$PERF_KPI_STATE_BOOT_ID" ]; then
+        return 0
+    fi
+
+    if [ "$PERF_KPI_STATE_BOOT_ID" = "$PERF_KPI_BOOT_ID" ]; then
+        # Same boot-id as when we asked to reboot → reboot clearly did not happen.
+        if command -v log_warn >/dev/null 2>&1; then
+            log_warn "Previous reboot request did NOT change boot-id; re-issuing reboot now."
+            log_warn "Previous boot_id=$PERF_KPI_STATE_BOOT_ID uptime=${PERF_KPI_STATE_UPTIME:-unknown}s; current uptime=${PERF_KPI_UPTIME_SEC:-unknown}s"
+        fi
+        perf_kpi_request_reboot "Retrying failed reboot for KPI loop"
+        # Should not return if reboot succeeds; if it does, caller will just exit.
+        return 0
+    fi
+
+    # Boot-id changed → reboot successful, just log it and clear pending flag in state.
+    if command -v log_info >/dev/null 2>&1; then
+        log_info "Detected new boot after KPI reboot: old_boot_id=$PERF_KPI_STATE_BOOT_ID, new_boot_id=$PERF_KPI_BOOT_ID"
+        log_info "Previous uptime at reboot request=${PERF_KPI_STATE_UPTIME:-unknown}s, current uptime=${PERF_KPI_UPTIME_SEC:-unknown}s"
+    fi
+
+    perf_kpi_reboot_state_save "$state_file" "$PERF_KPI_BOOT_ID" "$PERF_KPI_UPTIME_SEC" "0" "$PERF_KPI_STATE_ITER_DONE"
+}

From 0ca37fb2bfa1f56002e03bbbc623d5c5059daafd Mon Sep 17 00:00:00 2001
From: Srikanth Muppandam <smuppand@qti.qualcomm.com>
Date: Mon, 8 Dec 2025 11:31:23 +0530
Subject: [PATCH 2/3] Boot_Systemd_Validate: use wait_analyze_ready for robust
 timings

- Replace the ad-hoc systemd-analyze polling loop with the shared helper.
- Capture list-jobs when boot is unfinished and log this explicitly.
- Honor extended boot times on slow platforms before parsing KPIs.
- Preserve existing outputs while avoiding misleading 'boot not finished' results.

Signed-off-by: Srikanth Muppandam <smuppand@qti.qualcomm.com>
---
 .../Boot_Systemd_Validate.yaml                |  37 ++
 .../Systemd_Boot_KPI_Tests_Overview.md        | 514 ++++++++++++++++
 .../Performance/Boot_Systemd_Validate/run.sh  | 564 ++++++++++++++++++
 3 files changed, 1115 insertions(+)
 create mode 100755 Runner/suites/Performance/Boot_Systemd_Validate/Boot_Systemd_Validate.yaml
 create mode 100644 Runner/suites/Performance/Boot_Systemd_Validate/Systemd_Boot_KPI_Tests_Overview.md
 create mode 100755 Runner/suites/Performance/Boot_Systemd_Validate/run.sh

diff --git a/Runner/suites/Performance/Boot_Systemd_Validate/Boot_Systemd_Validate.yaml b/Runner/suites/Performance/Boot_Systemd_Validate/Boot_Systemd_Validate.yaml
new file mode 100755
index 00000000..40a9052e
--- /dev/null
+++ b/Runner/suites/Performance/Boot_Systemd_Validate/Boot_Systemd_Validate.yaml
@@ -0,0 +1,37 @@
+metadata:
+  name: boot-systemd-validate
+  format: "Lava-Test Test Definition 1.0"
+  description: "Systemd boot KPI capture + required unit gating + artifacts (critical-chain, blame, plot, unit states)."
+  os:
+    - linux
+  scope:
+    - performance
+    - functional
+
+params:
+  OUT_DIR: "./logs_Boot_Systemd_Validate"
+
+  # Either provide REQUIRED_UNITS_FILE (existing file path) OR REQUIRED_UNITS (list).
+  # REQUIRED_UNITS supports commas/spaces and will be written to OUT_DIR/required_units.txt automatically.
+  REQUIRED_UNITS_FILE: ""
+  REQUIRED_UNITS: ""
+
+  TIMEOUT_PER_UNIT: "30"
+  SVG: "yes" # yes|no
+  BOOT_TYPE: "unknown" # cold|warm|unknown etc
+  DISABLE_GETTY: "0" # 1|0
+  DISABLE_SSHD: "0" # 1|0
+  EXCLUDE_NETWORKD_WAIT_ONLINE: "0" # 1|0
+  EXCLUDE_SERVICES: "" # space-separated service names
+  BOOT_KPI_ITERATIONS: "1"
+  VERBOSE: "0"
+
+  # Maximum time (seconds) to wait for boot to complete before capturing KPIs
+  WAIT_FOR_BOOT_COMPLETE_TIMEOUT: "300"
+
+run:
+  steps:
+    - REPO_PATH=$PWD
+    - cd Runner/suites/Performance/Boot_Systemd_Validate/
+    - ./run.sh || true
+    - $REPO_PATH/Runner/utils/send-to-lava.sh Boot_Systemd_Validate.res || true
diff --git a/Runner/suites/Performance/Boot_Systemd_Validate/Systemd_Boot_KPI_Tests_Overview.md b/Runner/suites/Performance/Boot_Systemd_Validate/Systemd_Boot_KPI_Tests_Overview.md
new file mode 100644
index 00000000..01036f7d
--- /dev/null
+++ b/Runner/suites/Performance/Boot_Systemd_Validate/Systemd_Boot_KPI_Tests_Overview.md
@@ -0,0 +1,514 @@
+Systemd Boot KPI: How to Use the Two Tests
+==========================================
+
+We provide two complementary tests for measuring systemd boot KPIs:
+
+1. **Per-boot KPI collector**  
+   `Boot_Systemd_Validate/run.sh`
+2. **Reboot loop wrapper / KPI aggregator**  
+   `Boot_Systemd_KPI_Loop/run.sh`
+
+They are designed to work together but serve **different use-cases**.
+
+Typical paths in qcom-linux-testkit (relative to the repository root):
+
+```text
+Runner/suites/Performance/Boot_Systemd_Validate/run.sh
+Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh
+```
+
+---
+
+1. `Boot_Systemd_Validate` – Per-boot KPI collector
+---------------------------------------------------
+
+**Path (example):**
+
+```text
+Runner/suites/Performance/Boot_Systemd_Validate/run.sh
+```
+
+### Purpose
+
+Runs **once per boot** and collects detailed systemd boot KPIs:
+
+- `systemd-analyze time` (parsed into firmware/loader/kernel/userspace/total)
+- `systemd-analyze blame` (full + top-20)
+- `systemd-analyze critical-chain`
+- `systemd-analyze plot` → `boot_analysis.svg` (optional)
+- `systemd-analyze dot` → `boot.dot`
+- `systemctl` unit dependency trees and per-unit state CSV
+- Journals: full boot, warnings, errors (when `journalctl` is available)
+- Optional **gating on required units** (e.g. “all critical services must be active”)
+- **UEFI loader timings** from efivars (Init/Exec/Total) when EFI vars exist
+- **Exclusion of slow services** from userspace/total (e.g. `systemd-networkd-wait-online.service`)
+
+All logs are stored under a test-local directory:
+
+```text
+./logs_Boot_Systemd_Validate/
+```
+
+When `--iterations N` is passed, the script still runs **once**, but includes
+this hint in the KPI output so that the KPI loop wrapper knows the intended
+window size.
+
+---
+
+### Usage (CLI help)
+
+The script has a built-in help that matches the implementation:
+
+```text
+Usage: ./run.sh [OPTIONS]
+
+Options:
+  --out DIR           Output directory for logs (default: ./logs_Boot_Systemd_Validate)
+  --required FILE     File listing systemd units that must become active
+  --timeout S         Timeout per required unit (seconds, default: 30)
+  --no-svg            Skip systemd-analyze plot SVG generation
+  --boot-type TYPE    Tag boot type (e.g. cold, warm, unknown)
+  --disable-getty     Disable serial-getty@ttyS0.service for this KPI run
+  --disable-sshd      Disable sshd.service for this KPI run
+
+  --exclude-networkd-wait-online
+                      Exclude systemd-networkd-wait-online.service time
+                      from userspace/total based on systemd-analyze blame
+
+  --exclude-services "svc1 svc2 ..."
+                      Exclude one or more services (matching names in
+                      systemd-analyze blame) from userspace/total.
+                      The summed time is subtracted and reported as
+                      an effective KPI.
+
+  --iterations N      Hint for KPI iterations (wrapper/LAVA metadata; this
+                      script still runs once per invocation)
+
+  --verbose           Dump key .txt artifacts from OUT_DIR to console for
+                      LAVA debugging (skips large journal_*.txt files)
+
+  -h, --help          Show this help and exit
+```
+
+**Environment knobs (optional):**
+
+- `TIMEOUT_PER_UNIT` – default per-unit wait time for `--required`
+- `SVG=yes|no` – default for SVG generation (overridden by `--no-svg`)
+- `BOOT_TYPE` – default boot type tag (overridden by `--boot-type`)
+- `BOOT_KPI_ITERATIONS` – default for the `iterations` field in the KPI output
+
+---
+
+### Outputs / Artifacts
+
+All written under `OUT_DIR` (default: `./logs_Boot_Systemd_Validate`):
+
+- Platform + metadata  
+  - `platform.txt`, `platform.json`  
+  - `clocksource.txt` (current clocksource)  
+  - `boot_type.txt` (e.g. `cold`, `warm`, `unknown`)
+
+- Units & dependencies  
+  - `sysinit_deps.txt`, `basic_deps.txt`  
+  - `units.list`  
+  - `unit_states.csv` (per-unit state/export from `systemctl show`)
+
+- Systemd timing & graphs  
+  - `analyze_time.txt` (raw `systemd-analyze time` output)  
+  - `blame.txt`, `blame_top20.txt`  
+  - `critical_chain.txt`  
+  - `boot_analysis.svg` (unless `--no-svg`)  
+  - `boot.dot`
+
+- Journals  
+  - `journal_boot.txt` – full boot journal  
+  - `journal_warn.txt` – warnings and above  
+  - `journal_err.txt` – errors and above  
+
+- Bootchart (if enabled via `init=/lib/systemd/systemd-bootchart`)  
+  - `bootchart.tgz` (if present under `/run/log/...`)
+
+- Required units  
+  - `failed_units.txt` (from `systemctl --failed`)  
+
+- **KPI breakdown (this run)**  
+  - `boot_kpi_this_run.txt` – structured, human-readable KPI summary
+
+---
+
+### KPI breakdown: fields and exclusions
+
+At the end of the run, the script prints a KPI summary **to console** and
+writes the same content into `boot_kpi_this_run.txt`, for example:
+
+```text
+Boot KPI (this run)
+ boot_type : cold
+ iterations : 5
+ clocksource : arch_sys_counter
+ uefi_time_sec : 438093.283 (Init=214751.707, Exec=223341.576)
+ firmware_time_sec : 3.765
+ bootloader_time_sec : 0.176
+ kernel_time_sec : 6.124
+ userspace_time_sec : 126.942
+ userspace_effective_time_sec : 6.825
+ boot_total_sec : 137.008
+ boot_total_effective_sec : 16.891
+```
+
+Fields:
+
+- `uefi_time_sec`  
+  Sum of UEFI loader Init+Exec time in seconds, derived from EFI vars:
+
+  - `LoaderTimeInitUSec-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f`
+  - `LoaderTimeExecUSec-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f`
+
+  with individual Init/Exec components also printed.
+
+- `firmware_time_sec`, `bootloader_time_sec`, `kernel_time_sec`,
+  `userspace_time_sec`, `boot_total_sec`  
+  Parsed from `systemd-analyze time`:
+
+  ```text
+  Startup finished in 3.801s (firmware) + 174ms (loader) + 6.106s (kernel) + 2min 7.045s (userspace) = 2min 17.127s
+  ```
+
+- `userspace_effective_time_sec`, `boot_total_effective_sec`  
+
+  These are derived from the raw userspace/total time by subtracting:
+
+  1. `systemd-networkd-wait-online.service` time when
+     `--exclude-networkd-wait-online` is passed.
+  2. Any additional services given via `--exclude-services "svc1 svc2"`.
+
+The script logs exclusions clearly, for example:
+
+```text
+[INFO] ... Excluded systemd-networkd-wait-online.service=120.117s from userspace/total; boot_total_effective_sec=16.891
+[INFO] ... Excluded services from userspace/total (sum=2.500s): docker.service=0.966s; NetworkManager.service=1.534s;  boot_total_effective_sec=14.391
+```
+
+If `systemd-analyze time` reports:
+
+```text
+Bootup is not yet finished (org.freedesktop.systemd1.Manager.FinishTimestampMonotonic=0).
+```
+
+the script:
+
+- Marks the timing fields as `unknown`.
+- Logs the active jobs from `systemctl list-jobs` to **console** so that
+  blocking services (including our own KPI service if misconfigured) are
+  visible during LAVA debugging.
+
+This diagnostic logging happens **even without `--verbose`**.
+
+---
+
+### Verbose mode (`--verbose`)
+
+When `--verbose` is set, the script:
+
+- Prints all “reasonable” `.txt` artifacts from `OUT_DIR` to console
+  (excluding `journal_*.txt` for size reasons).
+- This is intended for LAVA and other CI where you cannot easily inspect the
+  filesystem but can scroll the job log.
+
+Example tail of the verbose section:
+
+```text
+[INFO] ... Verbose mode: dumping text artifacts from ./logs_Boot_Systemd_Validate (excluding journal_*.txt)
+===== analyze_time.txt =====
+Startup finished in ...
+...
+===== boot_kpi_this_run.txt =====
+Boot KPI (this run)
+ ...
+```
+
+---
+
+### Typical usage examples
+
+**1) Basic per-boot KPI with required units**
+
+```sh
+./run.sh   --timeout 60   --required required-units.txt
+```
+
+**2) Cold-boot KPI, excluding networkd-wait-online + Docker/Weston**
+
+```sh
+./run.sh   --boot-type cold   --disable-getty   --exclude-networkd-wait-online   --exclude-services "docker.service weston.service"
+```
+
+**3) LAVA-friendly verbose run**
+
+```sh
+./run.sh   --boot-type warm   --disable-getty   --exclude-networkd-wait-online   --iterations 5   --verbose
+```
+
+In all cases, the main KPI is in `logs_Boot_Systemd_Validate/boot_kpi_this_run.txt`
+and echoed to console.
+
+---
+
+2. `Boot_Systemd_KPI_Loop` – Reboot loop wrapper & KPI aggregator
+-----------------------------------------------------------------
+
+**Path (example):**
+
+```text
+Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh
+```
+
+### Purpose
+
+A **thin wrapper** that drives multiple KPI iterations across reboots and
+computes averages over the last **N boots** of a given `boot_type`.
+
+On each (re)boot it:
+
+1. Loads state from `Boot_Systemd_KPI_Loop.state` (if present) to determine:
+   - Total iterations requested
+   - Iterations already completed
+   - Boot type & options
+   - KPI script path + base out dir
+2. Computes **this iteration index**, and a per-iteration out dir:
+
+   ```text
+   <base_out_dir>/iter_<N>
+   ```
+
+3. Calls `Boot_Systemd_Validate/run.sh` once with:
+   - `--out <base_out_dir>/iter_N`
+   - `--boot-type <TYPE>`
+   - `--iterations <TOTAL>`
+   - Forwarded flags (`--disable-getty`, `--exclude-...`, `--verbose`, etc.)
+4. Parses `boot_kpi_this_run.txt` for this iteration, appends a row into:
+
+   ```text
+   Boot_Systemd_KPI_stats.csv
+   ```
+
+5. Computes averages over the last **N entries** for this `boot_type` and writes:
+
+   ```text
+   Boot_Systemd_KPI_summary.txt
+   ```
+
+6. In **auto-reboot mode**, if more iterations are pending:
+   - Updates `Boot_Systemd_KPI_Loop.state`
+   - Triggers a reboot
+   - A small systemd service (`boot-systemd-kpi-loop.service`) invokes this
+     script again on the next boot until all iterations complete.
+
+When all iterations finish, the wrapper:
+
+- Prints the KPI average summary to console.
+- Leaves `Boot_Systemd_KPI_stats.csv` and `Boot_Systemd_KPI_summary.txt` for further analysis.
+- Cleans up the systemd hook + state file in auto-reboot mode.
+
+---
+
+### Usage (CLI help)
+
+```text
+Usage: ./run.sh [OPTIONS]
+
+This wrapper:
+  * Runs Boot_Systemd_Validate once for the *current boot*
+  * Uses a per-iteration KPI out dir when --iterations > 1:
+      base: ../Boot_Systemd_Validate/logs_Boot_Systemd_Validate
+      iter: <base>/iter_<N>
+  * Parses boot_kpi_this_run.txt from that test
+  * Appends a row into Boot_Systemd_KPI_stats.csv
+  * Computes averages over the last N boots (per boot_type) and prints summary.
+
+Options:
+  --kpi-script PATH   Override Boot_Systemd_Validate script path
+                      (default: ../Boot_Systemd_Validate/run.sh)
+
+  --kpi-out-dir DIR   Override base KPI output dir
+                      (default: ../Boot_Systemd_Validate/logs_Boot_Systemd_Validate)
+
+  --iterations N      Number of boots to average over (default: 1)
+  --boot-type TYPE    Tag for this run (e.g. cold, warm, unknown)
+
+  # Options forwarded to Boot_Systemd_Validate:
+  --disable-getty     Disable serial-getty@ttyS0.service
+  --disable-sshd      Disable sshd.service
+  --exclude-networkd-wait-online
+                      Exclude systemd-networkd-wait-online.service
+  --exclude-services "A B"
+                      Exclude these services from userspace/total
+  --no-svg            Disable SVG plot generation
+  --verbose           Print KPI .txt artifacts to console for debug
+
+  # Auto-reboot orchestration:
+  --auto-reboot       Install systemd hook and auto-reboot until
+                      --iterations boots are collected. State is
+                      stored in: Boot_Systemd_KPI_Loop.state
+
+  -h, --help          Show this help and exit
+```
+
+---
+
+### Files written by the loop wrapper
+
+Under the same directory as `Boot_Systemd_KPI_Loop/run.sh`:
+
+- `Boot_Systemd_KPI_Loop.res`  
+  PASS/FAIL status for the wrapper itself.
+
+- `Boot_Systemd_KPI_Loop.state`  
+  Persistent state across reboots (total iterations, done so far, boot_type,
+  options, KPI script path/out dir). Removed automatically when all iterations
+  complete or on error.
+
+- `Boot_Systemd_KPI_stats.csv`  
+  Rolling KPI database across boots. Each row corresponds to the parsed
+  `boot_kpi_this_run.txt` of one boot (for a given `boot_type`).
+
+- `Boot_Systemd_KPI_summary.txt`  
+  Human-readable summary of averages over the last **N** entries of that
+  `boot_type`, e.g.:
+
+  ```text
+  Boot KPI summary (last 5 cold boot(s))
+   entries_used : 5
+   target_iterations : 5
+   boot_type : cold
+   avg_uefi_time_sec : ...
+   avg_firmware_time_sec : ...
+   avg_bootloader_time_sec : ...
+   avg_kernel_time_sec : ...
+   avg_userspace_time_sec : ...
+   avg_userspace_effective_time_sec : ...
+   avg_boot_total_sec : ...
+   avg_boot_total_effective_sec : ...
+  ```
+
+- `Boot_Systemd_KPI_Loop_stdout_<timestamp>.log`  
+  Stdout/stderr log(s) for the wrapper itself (if you preserve them).
+
+Per-iteration artifacts from `Boot_Systemd_Validate` live under:
+
+```text
+../Boot_Systemd_Validate/logs_Boot_Systemd_Validate/iter_1/
+../Boot_Systemd_Validate/logs_Boot_Systemd_Validate/iter_2/
+...
+```
+
+Each `iter_N` has its own `boot_kpi_this_run.txt`, `analyze_time.txt`, etc.
+
+---
+
+### Auto-reboot mode details
+
+When `--auto-reboot` is passed:
+
+- The wrapper installs a small systemd service (e.g. `boot-systemd-kpi-loop.service`)
+  that runs the wrapper at boot.
+- On each boot, the wrapper:
+  - Runs `Boot_Systemd_Validate` once.
+  - Updates the `.state` file with the new iteration count.
+  - If more iterations are required, it requests `reboot` again.
+- After the final iteration:
+  - KPI averages are computed and printed.
+  - The systemd hook is removed.
+  - The state file is deleted.
+
+The reboot logic is designed to:
+
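+- Ensure the reboot actually happens (the shared `perf_kpi_request_reboot` helper tries `systemctl reboot`, then `reboot`, then `shutdown -r now`, and repeats the attempt once after 5 seconds).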
+- Avoid blocking `systemd-analyze` permanently: the KPI scripts finish quickly,
+  and if any unit (including our own) prevents boot from completing, it will
+  show up in the “Bootup is not yet finished … list-jobs” diagnostics inside
+  each `iter_N/analyze_time.txt` and in the **console logs**.
+
+---
+
+### Typical usage examples
+
+**1) Manual KPI over last 5 cold boots (no auto-reboot)**
+
+You manually reboot the board between runs:
+
+```sh
+# Boot 1 (cold boot)
+./run.sh --iterations 5 --boot-type cold --disable-getty --exclude-networkd-wait-online
+
+# Reboot the board manually (power-cycle or reboot)
+
+# Boot 2..5 – re-run the same command each time
+./run.sh --iterations 5 --boot-type cold --disable-getty --exclude-networkd-wait-online
+...
+```
+
+After the 5th run, `Boot_Systemd_KPI_summary.txt` will contain the averages over
+the last 5 `cold` entries.
+
+**2) Fully automated cold-boot KPI campaign (auto-reboot)**
+
+```sh
+./run.sh   --iterations 5   --boot-type cold   --disable-getty   --exclude-networkd-wait-online   --auto-reboot
+```
+
+The wrapper will:
+
+- Run `Boot_Systemd_Validate` on this boot.
+- Reboot automatically until 5 iterations are captured.
+- Finally, print a KPI summary and clean up the systemd hook/state.
+
+**3) Warm-boot KPI with extra service exclusions and verbose logs**
+
+```sh
+./run.sh   --iterations 3   --boot-type warm   --disable-getty   --exclude-networkd-wait-online   --exclude-services "docker.service weston.service"   --auto-reboot   --verbose
+```
+
+This gives:
+
+- Per-iteration directories: `iter_1`, `iter_2`, `iter_3`.
+- Detailed logs printed to console from `Boot_Systemd_Validate` via `--verbose`.
+- Aggregated averages in `Boot_Systemd_KPI_summary.txt`.
+
+---
+
+3. Which one should I use?
+--------------------------
+
+| Scenario                                      | Recommended test                      | Notes                                                                 |
+|----------------------------------------------|---------------------------------------|-----------------------------------------------------------------------|
+| Standard CI pipeline (no reboot-resume)      | `Boot_Systemd_Validate`               | Run once per job; no reboot inside the script.                        |
+| Manual KPI measurement on a single boot      | `Boot_Systemd_Validate`               | E.g. after changing kernel/systemd configs.                          |
+| Quick health-check of systemd units          | `Boot_Systemd_Validate`               | Use `--required` to gate on critical services.                        |
+| Lab KPI across N cold/warm boots             | `Boot_Systemd_KPI_Loop`               | Wrapper handles per-boot dirs + CSV + averages; you may reboot manually. |
+| Automated multi-boot campaign in lab         | `Boot_Systemd_KPI_Loop` with `--auto-reboot` | State file + systemd hook handle the full loop.                 |
+| CI with explicit reboot-resume support       | `Boot_Systemd_KPI_Loop` (if allowed)  | CI must re-run the script after each reboot.                          |
+
+---
+
+4. Design principles
+--------------------
+
+- **Single responsibility**  
+  - `Boot_Systemd_Validate`: _measure one boot and emit KPIs_.  
+  - `Boot_Systemd_KPI_Loop`: _across boots: state, reboots, aggregation_.
+
+- **CI friendliness**  
+  - CI that cannot handle reboots should only use `Boot_Systemd_Validate`.  
+  - Reboot orchestration via `--auto-reboot` is explicitly opt-in.
+
+- **Robust & transparent**  
+  - Rolling CSV + summary for long-term trends.  
+  - Clear console logs for:
+    - service time exclusions,
+    - non-finished boots (`Bootup is not yet finished` + `systemctl list-jobs`),
+    - per-iteration KPI values.
+
+- **Local logs only**  
+  - All artifacts (CSV, SVG, journals, etc.) are stored under the test’s
+    working directory, making log collection and LAVA parsing straightforward.
diff --git a/Runner/suites/Performance/Boot_Systemd_Validate/run.sh b/Runner/suites/Performance/Boot_Systemd_Validate/run.sh
new file mode 100755
index 00000000..c7722a93
--- /dev/null
+++ b/Runner/suites/Performance/Boot_Systemd_Validate/run.sh
@@ -0,0 +1,564 @@
+#!/bin/sh
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause-Clear
+# Systemd boot KPI + validation (single run).
+
+SCRIPT_DIR="$(
+  cd "$(dirname "$0")" || exit 1
+  pwd
+)" # absolute directory containing this script
+
+TESTNAME="Boot_Systemd_Validate"
+RES_FILE="./${TESTNAME}.res" # relative path: resolved against the current working directory
+
+# Defaults (env may override; CLI parsing later overrides again)
+OUT_DIR="${OUT_DIR:-./logs_${TESTNAME}}"
+REQ_UNITS_FILE="${REQ_UNITS_FILE:-}" # REQUIRED_UNITS_FILE is folded in further down, after the libs are sourced
+REQUIRED_UNITS="${REQUIRED_UNITS:-}"
+TIMEOUT_PER_UNIT="${TIMEOUT_PER_UNIT:-30}"
+SVG="${SVG:-yes}"
+BOOT_TYPE="${BOOT_TYPE:-unknown}"
+DISABLE_GETTY="${DISABLE_GETTY:-0}"
+DISABLE_SSHD="${DISABLE_SSHD:-0}"
+EXCLUDE_NETWORKD_WAIT_ONLINE="${EXCLUDE_NETWORKD_WAIT_ONLINE:-0}"
+EXCLUDE_SERVICES="${EXCLUDE_SERVICES:-}"
+BOOT_KPI_ITERATIONS="${BOOT_KPI_ITERATIONS:-1}"
+VERBOSE="${VERBOSE:-0}"
+BOOT_NOT_FINISHED=0 # NOTE(review): not used in this part of the script; presumably set when boot is unfinished - confirm
+
+# Seconds passed to wait_for_boot_complete (when that helper is available)
+WAIT_FOR_BOOT_COMPLETE_TIMEOUT="${WAIT_FOR_BOOT_COMPLETE_TIMEOUT:-300}"
+
+usage() { # print CLI help on stdout; the unquoted heredoc expands $0, ${OUT_DIR}, ${TIMEOUT_PER_UNIT} at call time
+  cat <<EOF
+Usage: $0 [OPTIONS]
+
+Options:
+  --out DIR Output directory for logs (default: ${OUT_DIR})
+  --required FILE File listing systemd units that must become active
+  --timeout S Timeout per required unit (seconds, default: ${TIMEOUT_PER_UNIT})
+  --no-svg Skip systemd-analyze plot SVG generation
+  --boot-type TYPE Tag boot type (e.g. cold, warm, unknown)
+  --disable-getty Disable serial-getty@ttyS0.service for this KPI run
+  --disable-sshd Disable sshd.service for this KPI run
+  --exclude-networkd-wait-online
+                                Exclude systemd-networkd-wait-online.service time from userspace/total
+  --exclude-services "A B" Exclude one or more services (from systemd-analyze blame) from userspace/total
+  --iterations N Hint for KPI iterations (wrapper/LAVA metadata; this script still runs once)
+  --verbose Dump key .txt artifacts from OUT_DIR to console for LAVA debugging
+  -h, --help Show this help and exit
+
+Artifacts in OUT:
+  - platform.txt, platform.json
+  - boot_type.txt, clocksource.txt
+  - sysinit_deps.txt, basic_deps.txt
+  - units.list, unit_states.csv
+  - critical_chain.txt, blame.txt, blame_top20.txt, failed_units.txt
+  - analyze_time.txt
+  - journal_boot.txt, journal_warn.txt, journal_err.txt (if journalctl present)
+  - boot_analysis.svg (unless --no-svg)
+  - boot.dot
+  - boot_kpi_this_run.txt
+EOF
+}
+
+# EARLY help handling: do this BEFORE init_env/functestlib stdout capture (only the first argument is checked here)
+case "${1:-}" in
+  -h|--help)
+    usage >&2
+    exit 0
+    ;;
+esac
+
+# --- walk up from SCRIPT_DIR until a directory containing init_env is found ---
+INIT_ENV=""
+SEARCH="$SCRIPT_DIR"
+
+until [ "$SEARCH" = "/" ] || [ -n "$INIT_ENV" ]; do
+    if [ -f "$SEARCH/init_env" ]; then
+        INIT_ENV="$SEARCH/init_env"
+    else
+        SEARCH=$(dirname "$SEARCH")
+    fi
+done
+
+[ -n "$INIT_ENV" ] || {
+    echo "[ERROR] Could not find init_env (starting at $SCRIPT_DIR)" >&2
+    exit 1
+}
+
+# Source init_env once per shell (idempotent); it is expected to provide $TOOLS, so re-source if TOOLS is empty.
+# NOTE: We intentionally **do not export** any new vars. They stay local to this shell.
+if [ -z "${__INIT_ENV_LOADED:-}" ] || [ -z "${TOOLS:-}" ]; then
+    # shellcheck disable=SC1090
+    . "$INIT_ENV"
+    __INIT_ENV_LOADED=1
+fi
+
+# Helper libraries. init_env is deliberately NOT sourced a second time here:
+# an unconditional re-source would make the guard above pointless.
+# shellcheck disable=SC1091
+. "$TOOLS/functestlib.sh"
+# shellcheck disable=SC1091
+. "$TOOLS/lib_performance.sh"
+
+# --- allow LAVA params (env) to drive defaults cleanly ---
+# (CLI still overrides these later via parsing)
+OUT_DIR="${OUT_DIR:-./logs_${TESTNAME}}"
+
+# Support either REQUIRED_UNITS_FILE (file path) or REQUIRED_UNITS (list)
+REQ_UNITS_FILE="${REQUIRED_UNITS_FILE:-${REQ_UNITS_FILE:-}}"
+REQUIRED_UNITS="${REQUIRED_UNITS:-}"
+
+TIMEOUT_PER_UNIT="${TIMEOUT_PER_UNIT:-30}"
+SVG="${SVG:-yes}"
+BOOT_TYPE="${BOOT_TYPE:-unknown}"
+
+DISABLE_GETTY="${DISABLE_GETTY:-0}"
+DISABLE_SSHD="${DISABLE_SSHD:-0}"
+EXCLUDE_NETWORKD_WAIT_ONLINE="${EXCLUDE_NETWORKD_WAIT_ONLINE:-0}"
+EXCLUDE_SERVICES="${EXCLUDE_SERVICES:-}"
+BOOT_KPI_ITERATIONS="${BOOT_KPI_ITERATIONS:-1}"
+VERBOSE="${VERBOSE:-0}"
+
+# --- CLI parsing (a value-taking option given last fails cleanly instead of over-shifting) ---
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --out|--required|--timeout|--boot-type|--exclude-services|--iterations)
+      [ "$#" -ge 2 ] || { log_error "Option $1 requires a value"; echo "$TESTNAME FAIL" >"$RES_FILE"; exit 1; } ;;
+  esac
+  case "$1" in
+    --out) shift; OUT_DIR="$1" ;;
+    --required) shift; REQ_UNITS_FILE="$1" ;;
+    --timeout) shift; TIMEOUT_PER_UNIT="$1" ;;
+    --no-svg) SVG="no" ;;
+    --boot-type) shift; BOOT_TYPE="$1" ;;
+    --disable-getty) DISABLE_GETTY=1 ;;
+    --disable-sshd) DISABLE_SSHD=1 ;;
+    --exclude-networkd-wait-online) EXCLUDE_NETWORKD_WAIT_ONLINE=1 ;;
+    --exclude-services) shift; EXCLUDE_SERVICES="$1" ;;
+    --iterations) shift; BOOT_KPI_ITERATIONS="$1" ;;
+    --verbose) VERBOSE=1 ;;
+    -h|--help) usage >&2; exit 0 ;;
+    *)
+      log_warn "Unknown option: $1"
+      usage >&2
+      echo "$TESTNAME FAIL" >"$RES_FILE"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+mkdir -p "$OUT_DIR" || {
+  log_error "Cannot create $OUT_DIR"
+  echo "$TESTNAME FAIL" >"$RES_FILE"
+  exit 1
+}
+
+# REQUIRED_UNITS list (space/comma-separated) and no file given: materialize it under the final OUT_DIR (honors --out).
+if [ -z "$REQ_UNITS_FILE" ] && [ -n "$REQUIRED_UNITS" ]; then
+  REQ_UNITS_FILE="$OUT_DIR/required_units.txt"
+  printf '%s\n' "$REQUIRED_UNITS" | tr ',' ' ' | tr ' ' '\n' | sed '/^$/d' >"$REQ_UNITS_FILE" 2>/dev/null || true
+fi
+
+# Basic tools check (keep light; others are optional). On failure: SKIP result is written and the script exits 0.
+check_dependencies systemd-analyze uname sed awk grep find sort || {
+  log_skip "$TESTNAME SKIP - basic tools missing"
+  echo "$TESTNAME SKIP" >"$RES_FILE"
+  exit 0
+}
+
+# --- ensure CPU governors restored on exit ---
+cleanup() {
+  restore_governor # defined outside this script; presumably undoes set_performance_governor - verify in lib_performance.sh
+}
+trap cleanup EXIT # registered before the governor is changed below
+
+# --- Set performance governor for KPI run (the helper saves the previous governors for restore) ---
+set_performance_governor
+
+# --- Clocksource + boot type tagging (helpers defined outside this script; outputs go under OUT_DIR) ---
+capture_clocksource "$OUT_DIR/clocksource.txt"
+capture_boot_type "$BOOT_TYPE" "$OUT_DIR/boot_type.txt"
+
+# --- Optionally disable heavy services (getty/sshd); the 0|1 flags are passed through as-is ---
+disable_heavy_services_if_requested "$DISABLE_GETTY" "$DISABLE_SSHD"
+
+# --- Wait for boot complete (multi-user.target) if possible ---
+# Order of preference: the wait_for_boot_complete helper when it is defined,
+# otherwise a one-shot is-active probe of multi-user.target via systemctl.
+# Neither a missing systemctl nor an inactive target is fatal: both only log
+# a warning and KPI capture continues.
+if command -v wait_for_boot_complete >/dev/null 2>&1; then
+  wait_for_boot_complete "$WAIT_FOR_BOOT_COMPLETE_TIMEOUT"
+elif ! command -v systemctl >/dev/null 2>&1; then
+  log_warn "systemctl not found; cannot verify boot-complete target"
+elif systemctl is-active --quiet multi-user.target; then
+  log_info "Boot complete: multi-user.target is active"
+else
+  log_warn "multi-user.target not active; continuing KPI capture anyway"
+fi
+
+# ---------- Platform snapshot (same fields written as key=value text and as one JSON object) ----------
+detect_platform # defined outside this script; expected to populate the PLATFORM_* variables used below
+
+{
+  echo "timestamp=$(nowstamp)"
+  echo "kernel=$PLATFORM_KERNEL"
+  echo "arch=$PLATFORM_ARCH"
+  echo "uname_s=$PLATFORM_UNAME_S"
+  echo "hostname=$PLATFORM_HOSTNAME"
+  echo "soc_machine=$PLATFORM_SOC_MACHINE"
+  echo "soc_id=$PLATFORM_SOC_ID"
+  echo "soc_family=$PLATFORM_SOC_FAMILY"
+  echo "dt_model=$PLATFORM_DT_MODEL"
+  echo "dt_compatible=$PLATFORM_DT_COMPAT"
+  echo "os_like=$PLATFORM_OS_LIKE"
+  echo "os_name=$PLATFORM_OS_NAME"
+  echo "target=$PLATFORM_TARGET"
+  echo "machine=$PLATFORM_MACHINE"
+} >"$OUT_DIR/platform.txt"
+log_info "Platform info → $OUT_DIR/platform.txt"
+
+{
+  printf '{' # single-line JSON object; values go through esc() (escapes backslash and double-quote only)
+  printf '"timestamp":"%s",' "$(nowstamp)" # nowstamp is called again, so this may differ from platform.txt
+  printf '"kernel":"%s",' "$(esc "$PLATFORM_KERNEL")"
+  printf '"arch":"%s",' "$(esc "$PLATFORM_ARCH")"
+  printf '"uname_s":"%s",' "$(esc "$PLATFORM_UNAME_S")"
+  printf '"hostname":"%s",' "$(esc "$PLATFORM_HOSTNAME")"
+  printf '"soc_machine":"%s",' "$(esc "$PLATFORM_SOC_MACHINE")"
+  printf '"soc_id":"%s",' "$(esc "$PLATFORM_SOC_ID")"
+  printf '"soc_family":"%s",' "$(esc "$PLATFORM_SOC_FAMILY")"
+  printf '"dt_model":"%s",' "$(esc "$PLATFORM_DT_MODEL")"
+  printf '"dt_compatible":"%s",' "$(esc "$PLATFORM_DT_COMPAT")"
+  printf '"os_like":"%s",' "$(esc "$PLATFORM_OS_LIKE")"
+  printf '"os_name":"%s",' "$(esc "$PLATFORM_OS_NAME")"
+  printf '"target":"%s",' "$(esc "$PLATFORM_TARGET")"
+  printf '"machine":"%s"' "$(esc "$PLATFORM_MACHINE")" # last member: no trailing comma
+  printf '}\n'
+} >"$OUT_DIR/platform.json"
+log_info "Platform JSON → $OUT_DIR/platform.json"
+
+# ---------- systemd dependency trees (stdout+stderr captured into the files; failures ignored) ----------
+if command -v systemctl >/dev/null 2>&1; then
+  systemctl list-dependencies sysinit.target --plain --all >"$OUT_DIR/sysinit_deps.txt" 2>&1 || true
+  systemctl list-dependencies basic.target --plain --all >"$OUT_DIR/basic_deps.txt" 2>&1 || true
+else
+  log_warn "systemctl not found; skipping dependency trees"
+fi
+
+# ---------- units + states CSV ----------
+# units.list: sysinit/basic dependency closure plus active services, de-duplicated.
+# unit_states.csv: one double-quoted row per listed unit, from "systemctl show".
+units_file="$OUT_DIR/units.list"
+: >"$units_file"
+if command -v systemctl >/dev/null 2>&1; then
+  # Candidate units: both dependency trees (header line dropped) + active services.
+  systemctl list-dependencies sysinit.target --plain --all 2>/dev/null \
+    | sed '1d' | tr -d '●' | sed 's/^[[:space:]]*//' >>"$units_file" || true
+  systemctl list-dependencies basic.target --plain --all 2>/dev/null \
+    | sed '1d' | tr -d '●' | sed 's/^[[:space:]]*//' >>"$units_file" || true
+  systemctl list-units --type=service --state=active --no-legend 2>/dev/null \
+    | awk '{print $1}' >>"$units_file" || true
+  # De-duplicate and keep only the unit types reported in the CSV.
+  sort -u "$units_file" | grep -E '\.(service|target|mount|socket|path|timer)$' >"$units_file.tmp" 2>/dev/null || true
+  mv -f "$units_file.tmp" "$units_file" 2>/dev/null || true
+
+  csv="$OUT_DIR/unit_states.csv"
+  echo "unit,active_state,sub_state,load_state,enabled,start_usec,fragment_path,source_path,default_deps" >"$csv"
+  while IFS= read -r u; do
+    [ -n "$u" ] || continue
+    # "--" ends option parsing: names such as "-.mount" begin with a dash and
+    # would otherwise be rejected as an unknown option, leaving an empty row.
+    show_out="$(systemctl show \
+      -p Id -p ActiveState -p SubState -p LoadState -p UnitFileState \
+      -p ActiveEnterTimestampMonotonic -p FragmentPath -p SourcePath -p DefaultDependencies \
+      -- "$u" 2>/dev/null || true)"
+    # Fields are emitted inside double quotes, so map embedded " to ' once for
+    # the whole property dump rather than spawning one tr per field.
+    show_out=$(printf '%s\n' "$show_out" | tr '"' "'")
+    id=$(printf '%s\n' "$show_out" | sed -n 's/^Id=//p' | head -n 1)
+    act=$(printf '%s\n' "$show_out" | sed -n 's/^ActiveState=//p' | head -n 1)
+    sub=$(printf '%s\n' "$show_out" | sed -n 's/^SubState=//p' | head -n 1)
+    load=$(printf '%s\n' "$show_out" | sed -n 's/^LoadState=//p' | head -n 1)
+    en=$(printf '%s\n' "$show_out" | sed -n 's/^UnitFileState=//p' | head -n 1)
+    usec=$(printf '%s\n' "$show_out" | sed -n 's/^ActiveEnterTimestampMonotonic=//p' | head -n 1)
+    frag=$(printf '%s\n' "$show_out" | sed -n 's/^FragmentPath=//p' | head -n 1)
+    src=$(printf '%s\n' "$show_out" | sed -n 's/^SourcePath=//p' | head -n 1)
+    ddef=$(printf '%s\n' "$show_out" | sed -n 's/^DefaultDependencies=//p' | head -n 1)
+
+    printf '"%s","%s","%s","%s","%s","%s","%s","%s","%s"\n' \
+      "$id" "$act" "$sub" "$load" "$en" "$usec" "$frag" "$src" "$ddef" >>"$csv"
+  done <"$units_file"
+  log_info "Wrote unit states CSV → $csv"
+else
+  log_warn "systemctl not found; skipping unit state CSV"
+fi
+
+# ---------- systemd-analyze artifacts ----------
+# BOOT_NOT_FINISHED=1 tells the KPI section further down to leave timings as 'unknown'.
+an_time="$OUT_DIR/analyze_time.txt"
+an_blame="$OUT_DIR/blame.txt"
+an_blame_top="$OUT_DIR/blame_top20.txt"
+an_chain="$OUT_DIR/critical_chain.txt"
+jobs_unfinished="$OUT_DIR/list_jobs_when_boot_unfinished.txt"
+
+BOOT_NOT_FINISHED=0
+
+if command -v systemd-analyze >/dev/null 2>&1; then
+  : >"$jobs_unfinished"
+
+  if command -v wait_analyze_ready >/dev/null 2>&1; then
+    # Preferred path: shared helper from lib_performance.sh
+    max_wait="${WAIT_ANALYZE_FINISH_TIMEOUT:-240}" # bump default to 240s
+    interval="${WAIT_ANALYZE_FINISH_INTERVAL:-5}"
+
+    if ! wait_analyze_ready "$an_time" "$jobs_unfinished" \
+         "$max_wait" "$interval"; then
+      BOOT_NOT_FINISHED=1
+      log_warn "systemd-analyze did not report finished boot within ${max_wait}s, KPIs may stay 'unknown'. See $an_time and $jobs_unfinished."
+    fi
+  else
+    # Fallback: original inline loop, with larger default timeout
+    wait_analyze="${WAIT_ANALYZE_FINISH_TIMEOUT:-240}"
+    i=0
+    # Start pessimistic: only a "Startup finished in" line clears the flag, so a
+    # timeout or an unexpected systemd-analyze error is never parsed as timings.
+    BOOT_NOT_FINISHED=1
+
+    while [ "$i" -le "$wait_analyze" ]; do
+      systemd-analyze time >"$an_time" 2>&1 || true
+
+      if grep -q 'Startup finished in' "$an_time" 2>/dev/null; then
+        BOOT_NOT_FINISHED=0
+        break
+      fi
+
+      # Snapshot the pending jobs each time boot is reported as unfinished.
+      if grep -q 'Bootup is not yet finished' "$an_time" 2>/dev/null; then
+        systemctl list-jobs >"$jobs_unfinished" 2>&1 || true
+      fi
+
+      i=$((i+1))
+      sleep 1
+    done
+
+    if [ "$BOOT_NOT_FINISHED" -eq 0 ]; then
+      first_line=$(sed -n '1p' "$an_time" 2>/dev/null || true)
+      if [ -n "$first_line" ]; then
+        log_info "systemd-analyze time: $first_line"
+      else
+        log_info "systemd-analyze time written to $an_time"
+      fi
+    else
+      log_warn "systemd-analyze reports boot not finished even after ${wait_analyze}s KPI breakdown may remain 'unknown'. See $an_time and $jobs_unfinished."
+    fi
+  fi
+
+  systemd-analyze critical-chain >"$an_chain" 2>&1 || true
+  log_info "systemd-analyze critical-chain → $an_chain"
+
+  systemd-analyze blame >"$an_blame" 2>&1 || true
+  head -n 20 "$an_blame" >"$an_blame_top" 2>/dev/null \
+    || cp "$an_blame" "$an_blame_top" 2>/dev/null || true
+  log_info "Top 20 services by time (systemd-analyze blame) → $an_blame_top"
+
+  if [ "$SVG" = "yes" ]; then
+    systemd-analyze plot >"$OUT_DIR/boot_analysis.svg" 2>/dev/null || true
+    log_info "Boot SVG timeline → $OUT_DIR/boot_analysis.svg"
+  else
+    log_info "SVG plot disabled via --no-svg"
+  fi
+
+  systemd-analyze dot >"$OUT_DIR/boot.dot" 2>/dev/null || true
+  log_info "Boot dependency DOT graph → $OUT_DIR/boot.dot"
+else
+  log_warn "systemd-analyze not found, skipping timing/critical-chain/blame/plot"
+fi
+
+# ---------- Bootchart (optional) ----------
+# Copies the first archive found under /run/log; a failed copy is tolerated and not logged.
+if ! bootchart_enabled; then
+  log_skip "systemd-bootchart not enabled in cmdline; skipping bootchart-specific collection"
+else
+  for p in /run/log/bootchart.tgz /run/log/bootchart/bootchart.tgz; do
+    [ -f "$p" ] || continue
+    cp "$p" "$OUT_DIR/bootchart.tgz" 2>/dev/null || true
+    if [ -f "$OUT_DIR/bootchart.tgz" ]; then
+      log_info "Bootchart archive → $OUT_DIR/bootchart.tgz"
+    fi
+    break
+  done
+fi
+
+# ---------- Failed units + journal ----------
+if command -v systemctl >/dev/null 2>&1; then
+  systemctl --failed >"$OUT_DIR/failed_units.txt" 2>&1 || true
+fi
+# -p LEVEL selects that level and everything more severe; a warning..alert range would drop emerg.
+if command -v journalctl >/dev/null 2>&1; then
+  journalctl -b >"$OUT_DIR/journal_boot.txt" 2>&1 || true
+  journalctl -b -p warning >"$OUT_DIR/journal_warn.txt" 2>&1 || true
+  journalctl -b -p err >"$OUT_DIR/journal_err.txt" 2>&1 || true
+else
+  log_warn "journalctl not found; skipping boot journal capture"
+fi
+
+# ---------- required units gating ----------
+# Each unit listed in REQ_UNITS_FILE must be active within TIMEOUT_PER_UNIT seconds, else suite_rc=1.
+suite_rc=0
+if [ -z "$REQ_UNITS_FILE" ]; then
+  log_warn "No --required file provided; not gating PASS/FAIL on specific units"
+elif [ ! -r "$REQ_UNITS_FILE" ]; then
+  log_fail "Required units file not readable: $REQ_UNITS_FILE" # previously this passed silently
+  suite_rc=1
+elif ! command -v systemctl >/dev/null 2>&1; then
+  log_warn "systemctl not found; cannot verify required units"
+else
+  while IFS= read -r u || [ -n "$u" ]; do # "|| [ -n ]" keeps a last line lacking a newline
+    [ -n "$u" ] || continue
+    if ! systemctl is-active --quiet -- "$u"; then
+      log_info "Waiting for $u (up to ${TIMEOUT_PER_UNIT}s)..."
+      i=0
+      while [ "$i" -lt "$TIMEOUT_PER_UNIT" ]; do
+        systemctl is-active --quiet -- "$u" && break
+        sleep 1
+        i=$((i+1))
+      done
+    fi
+    if systemctl is-active --quiet -- "$u"; then
+      log_info "[ok] $u is active"
+    else
+      log_fail "[fail] $u not active after ${TIMEOUT_PER_UNIT}s"
+      suite_rc=1
+    fi
+  done <"$REQ_UNITS_FILE"
+fi
+
+# ---------- KPI breakdown (this run) ----------
+# Clocksource tag: value of the first "clocksource=" line in clocksource.txt,
+# or "unknown" when the file or the entry is missing.
+CLOCKSOURCE=""
+if [ -f "$OUT_DIR/clocksource.txt" ]; then
+  CLOCKSOURCE=$(
+    sed -n 's/^clocksource=//p' "$OUT_DIR/clocksource.txt" 2>/dev/null | head -n 1
+  )
+fi
+: "${CLOCKSOURCE:=unknown}"
+
+# Read UEFI loader times (efivars)
+perf_read_uefi_loader_times
+UEFI_INITs="${PERF_UEFI_INIT_SEC:-unknown}"
+UEFI_EXECs="${PERF_UEFI_EXEC_SEC:-unknown}"
+UEFI_TOTAL="${PERF_UEFI_TOTAL_SEC:-unknown}"
+
+# Parse systemd-analyze time/blame. Every timing starts out empty (printed later
+# as 'unknown') and is filled in only when the boot was reported as finished.
+FIRMWARE_SEC="" LOADER_SEC="" KERNEL_SEC="" USERSPACE_SEC="" TOTAL_SEC=""
+USERSPACE_EFF="" TOTAL_EFF=""
+
+# The PERF_* values read below are the ones left behind by perf_parse_boot_times.
+if [ "$BOOT_NOT_FINISHED" -eq 0 ]; then
+  perf_parse_boot_times "$an_time" "$an_blame" "$EXCLUDE_NETWORKD_WAIT_ONLINE"
+
+  FIRMWARE_SEC="${PERF_FIRMWARE_SEC:-}"
+  LOADER_SEC="${PERF_LOADER_SEC:-}"
+  KERNEL_SEC="${PERF_KERNEL_SEC:-}"
+  USERSPACE_SEC="${PERF_USERSPACE_SEC:-}"
+  TOTAL_SEC="${PERF_TOTAL_SEC:-}"
+
+  # Effective figures fall back to the raw ones when no *_EFFECTIVE_SEC value
+  # is set (or it is empty).
+  USERSPACE_EFF="${PERF_USERSPACE_EFFECTIVE_SEC:-$USERSPACE_SEC}"
+  TOTAL_EFF="${PERF_TOTAL_EFFECTIVE_SEC:-$TOTAL_SEC}"
+else
+  log_warn "Boot not finished according to systemd-analyze; leaving KPI time fields as 'unknown'. See $an_time and $jobs_unfinished."
+fi
+
+# Extra service exclusions (beyond networkd-wait-online): subtract each EXCLUDE_SERVICES blame time
+EXCL_SVC_SEC="" EXCL_SVC_DETAIL=""
+if [ -n "$EXCLUDE_SERVICES" ] && [ -f "$an_blame" ]; then
+  sum="0"
+  detail=""
+  for svc in $EXCLUDE_SERVICES; do
+    # Exact match on the unit column (last field). The name travels through the
+    # environment so neither regex metacharacters ('.') nor backslashes are interpreted.
+    seg=$(SVC="$svc" awk '$NF == ENVIRON["SVC"] { $NF = ""; sub(/ +$/, ""); print; exit }' "$an_blame" 2>/dev/null || true)
+    [ -n "$seg" ] || continue
+    t=$(perf_time_segment_to_sec "$seg")
+    [ -n "$t" ] || continue
+    detail="${detail}${svc}=${t}s; "
+    sum=$(printf '%s %s\n' "$sum" "$t" | awk '{printf("%.3f\n", $1+$2)}')
+  done
+  if [ "$sum" != "0" ]; then
+    EXCL_SVC_SEC="$sum"
+    EXCL_SVC_DETAIL="$detail"
+    if [ -n "$USERSPACE_EFF" ]; then
+      USERSPACE_EFF=$(printf '%s %s\n' "$USERSPACE_EFF" "$sum" \
+        | awk '{d=$1-$2; if (d<0) d=0; printf("%.3f\n", d)}')
+    fi
+    if [ -n "$TOTAL_EFF" ]; then
+      TOTAL_EFF=$(printf '%s %s\n' "$TOTAL_EFF" "$sum" \
+        | awk '{d=$1-$2; if (d<0) d=0; printf("%.3f\n", d)}')
+    fi
+  fi
+fi
+
+# Log exclusions clearly
+if [ "${EXCLUDE_NETWORKD_WAIT_ONLINE:-0}" -eq 1 ] && [ -n "${PERF_NETWORKD_WAIT_ONLINE_SEC:-}" ]; then
+  log_info "Excluded systemd-networkd-wait-online.service=${PERF_NETWORKD_WAIT_ONLINE_SEC}s from userspace/total; boot_total_effective_sec=$TOTAL_EFF"
+fi
+if [ -n "$EXCL_SVC_SEC" ]; then
+  log_info "Excluded services from userspace/total (sum=${EXCL_SVC_SEC}s): $EXCL_SVC_DETAIL boot_total_effective_sec=$TOTAL_EFF"
+fi
+
+# KPI printout: the same summary goes to the console and to boot_kpi_this_run.txt
+kpi_file="$OUT_DIR/boot_kpi_this_run.txt"
+tee "$kpi_file" <<EOF
+Boot KPI (this run)
+ boot_type : $BOOT_TYPE
+ iterations : $BOOT_KPI_ITERATIONS
+ clocksource : $CLOCKSOURCE
+ uefi_time_sec : $UEFI_TOTAL (Init=$UEFI_INITs, Exec=$UEFI_EXECs)
+ firmware_time_sec : ${FIRMWARE_SEC:-unknown}
+ bootloader_time_sec : ${LOADER_SEC:-unknown}
+ kernel_time_sec : ${KERNEL_SEC:-unknown}
+ userspace_time_sec : ${USERSPACE_SEC:-unknown}
+ userspace_effective_time_sec : ${USERSPACE_EFF:-unknown}
+ boot_total_sec : ${TOTAL_SEC:-unknown}
+ boot_total_effective_sec : ${TOTAL_EFF:-unknown}
+EOF
+
+log_info "Boot KPI breakdown (this run) → $kpi_file"
+
+# ---------- VERBOSE: dump key .txt artifacts to console ----------
+# Every regular *.txt file directly under OUT_DIR is printed below a
+# "===== name =====" banner; journal_*.txt captures are left out because they
+# are huge.
+if [ "$VERBOSE" -eq 1 ]; then
+  log_info "Verbose mode: dumping text artifacts from $OUT_DIR (excluding journal_*.txt)"
+  for f in "$OUT_DIR"/*.txt; do
+    [ -f "$f" ] || continue
+    base=${f##*/}
+    case "$base" in
+      journal_*.txt) continue ;;
+    esac
+    echo "===== $base ====="
+    cat "$f"
+    echo
+  done
+fi
+
+# ---------- final PASS/FAIL ----------
+if [ "$suite_rc" -eq 0 ]; then
+  log_pass "$TESTNAME: PASS"
+  echo "$TESTNAME PASS" >"$RES_FILE"
+else
+  log_fail "$TESTNAME: FAIL"
+  echo "$TESTNAME FAIL" >"$RES_FILE"
+fi
+
+# restore_governor via trap
+exit "$suite_rc"

From 99624c73944ec15bad6c8c6e2bd62b58c19fd419 Mon Sep 17 00:00:00 2001
From: Srikanth Muppandam <smuppand@qti.qualcomm.com>
Date: Mon, 8 Dec 2025 11:31:56 +0530
Subject: [PATCH 3/3] Boot_Systemd_KPI_Loop: decouple KPI loop from boot
 transaction

Switch auto-reboot orchestration to a oneshot systemd service + timer.
Let the loop manage iterations/state while keeping the boot path clean.
Ensure the KPI service exits quickly so FinishTimestampMonotonic is not
blocked. Document usage while preserving behavior for manual single-shot
runs.

Signed-off-by: Srikanth Muppandam <smuppand@qti.qualcomm.com>
---
 .../Boot_Systemd_KPI_Loop.yaml                |  37 ++
 .../Systemd_Boot_KPI_Tests_Overview.md        | 514 ++++++++++++++++++
 .../Performance/Boot_Systemd_KPI_Loop/run.sh  | 388 +++++++++++++
 3 files changed, 939 insertions(+)
 create mode 100755 Runner/suites/Performance/Boot_Systemd_KPI_Loop/Boot_Systemd_KPI_Loop.yaml
 create mode 100644 Runner/suites/Performance/Boot_Systemd_KPI_Loop/Systemd_Boot_KPI_Tests_Overview.md
 create mode 100755 Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh

diff --git a/Runner/suites/Performance/Boot_Systemd_KPI_Loop/Boot_Systemd_KPI_Loop.yaml b/Runner/suites/Performance/Boot_Systemd_KPI_Loop/Boot_Systemd_KPI_Loop.yaml
new file mode 100755
index 00000000..aaac3e14
--- /dev/null
+++ b/Runner/suites/Performance/Boot_Systemd_KPI_Loop/Boot_Systemd_KPI_Loop.yaml
@@ -0,0 +1,37 @@
+metadata:
+  name: boot-systemd-kpi-loop
+  format: "Lava-Test Test Definition 1.0"
+  description: "Multi-boot KPI aggregator wrapper for Boot_Systemd_Validate with optional auto-reboot orchestration."
+  os:
+    - linux
+  scope:
+    - performance
+    - functional
+
+params:
+  # Child KPI script (Boot_Systemd_Validate) and its base log directory
+  KPI_SCRIPT: "./../Boot_Systemd_Validate/run.sh"
+  KPI_OUT_DIR: "./../Boot_Systemd_Validate/logs_Boot_Systemd_Validate"
+
+  # Averaging window / iteration collection
+  ITERATIONS: "5"
+  BOOT_TYPE: "cold" # cold|warm|unknown etc
+
+  # Forwarded knobs to Boot_Systemd_Validate
+  DISABLE_GETTY: "1" # 1|0
+  DISABLE_SSHD: "0" # 1|0
+  EXCLUDE_NETWORKD_WAIT_ONLINE: "1" # 1|0
+  EXCLUDE_SERVICES: "" # space-separated service names
+  NO_SVG: "1" # 1 disables svg
+  VERBOSE: "0" # 1 dumps key artifacts
+
+  # Orchestration
+  AUTO_REBOOT: "0" # 1 enables loop orchestration
+  REBOOT_RESULT_MODE: "PASS" # PASS (default) or SKIP when reboot requested mid-loop
+
+run:
+  steps:
+    - REPO_PATH=$PWD
+    - cd Runner/suites/Performance/Boot_Systemd_KPI_Loop/
+    - ./run.sh || true
+    - $REPO_PATH/Runner/utils/send-to-lava.sh Boot_Systemd_KPI_Loop.res || true
diff --git a/Runner/suites/Performance/Boot_Systemd_KPI_Loop/Systemd_Boot_KPI_Tests_Overview.md b/Runner/suites/Performance/Boot_Systemd_KPI_Loop/Systemd_Boot_KPI_Tests_Overview.md
new file mode 100644
index 00000000..01036f7d
--- /dev/null
+++ b/Runner/suites/Performance/Boot_Systemd_KPI_Loop/Systemd_Boot_KPI_Tests_Overview.md
@@ -0,0 +1,514 @@
+Systemd Boot KPI: How to Use the Two Tests
+==========================================
+
+We provide two complementary tests for measuring systemd boot KPIs:
+
+1. **Per-boot KPI collector**  
+   `Boot_Systemd_Validate/run.sh`
+2. **Reboot loop wrapper / KPI aggregator**  
+   `Boot_Systemd_KPI_Loop/run.sh`
+
+They are designed to work together but serve **different use-cases**.
+
+Typical paths in qcom-linux-testkit:
+
+```text
+Runner/suites/Performance/Boot_Systemd_Validate/run.sh
+Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh
+```
+
+---
+
+1. `Boot_Systemd_Validate` – Per-boot KPI collector
+---------------------------------------------------
+
+**Path (example):**
+
+```text
+Runner/suites/Performance/Boot_Systemd_Validate/run.sh
+```
+
+### Purpose
+
+Runs **once per boot** and collects detailed systemd boot KPIs:
+
+- `systemd-analyze time` (parsed into firmware/loader/kernel/userspace/total)
+- `systemd-analyze blame` (full + top-20)
+- `systemd-analyze critical-chain`
+- `systemd-analyze plot` → `boot_analysis.svg` (optional)
+- `systemd-analyze dot` → `boot.dot`
+- `systemctl` unit dependency trees and per-unit state CSV
+- Journals: full boot, warnings, errors (when `journalctl` is available)
+- Optional **gating on required units** (e.g. “all critical services must be active”)
+- **UEFI loader timings** from efivars (Init/Exec/Total) when EFI vars exist
+- **Exclusion of slow services** from userspace/total (e.g. `systemd-networkd-wait-online.service`)
+
+All logs are stored under a test-local directory:
+
+```text
+./logs_Boot_Systemd_Validate/
+```
+
+When `--iterations N` is passed, the script still runs **once**, but includes
+this hint in the KPI output so that the KPI loop wrapper knows the intended
+window size.
+
+---
+
+### Usage (CLI help)
+
+The script has a built-in help that matches the implementation:
+
+```text
+Usage: ./run.sh [OPTIONS]
+
+Options:
+  --out DIR           Output directory for logs (default: ./logs_Boot_Systemd_Validate)
+  --required FILE     File listing systemd units that must become active
+  --timeout S         Timeout per required unit (seconds, default: $TIMEOUT_PER_UNIT)
+  --no-svg            Skip systemd-analyze plot SVG generation
+  --boot-type TYPE    Tag boot type (e.g. cold, warm, unknown)
+  --disable-getty     Disable serial-getty@ttyS0.service for this KPI run
+  --disable-sshd      Disable sshd.service for this KPI run
+
+  --exclude-networkd-wait-online
+                      Exclude systemd-networkd-wait-online.service time
+                      from userspace/total based on systemd-analyze blame
+
+  --exclude-services "svc1 svc2 ..."
+                      Exclude one or more services (matching names in
+                      systemd-analyze blame) from userspace/total.
+                      The summed time is subtracted and reported as
+                      an effective KPI.
+
+  --iterations N      Hint for KPI iterations (wrapper/LAVA metadata; this
+                      script still runs once per invocation)
+
+  --verbose           Dump key .txt artifacts from OUT_DIR to console for
+                      LAVA debugging (skips large journal_*.txt files)
+
+  -h, --help          Show this help and exit
+```
+
+**Environment knobs (optional):**
+
+- `TIMEOUT_PER_UNIT` – default per-unit wait time for `--required`
+- `SVG=yes|no` – default for SVG generation (overridden by `--no-svg`)
+- `BOOT_TYPE` – default boot type tag (overridden by `--boot-type`)
+- `BOOT_KPI_ITERATIONS` – default for the `iterations` field in the KPI output
+
+---
+
+### Outputs / Artifacts
+
+All written under `OUT_DIR` (default: `./logs_Boot_Systemd_Validate`):
+
+- Platform + metadata  
+  - `platform.txt`, `platform.json`  
+  - `clocksource.txt` (current clocksource)  
+  - `boot_type.txt` (e.g. `cold`, `warm`, `unknown`)
+
+- Units & dependencies  
+  - `sysinit_deps.txt`, `basic_deps.txt`  
+  - `units.list`  
+  - `unit_states.csv` (per-unit state/export from `systemctl show`)
+
+- Systemd timing & graphs  
+  - `analyze_time.txt` (raw `systemd-analyze time` output)  
+  - `blame.txt`, `blame_top20.txt`  
+  - `critical_chain.txt`  
+  - `boot_analysis.svg` (unless `--no-svg`)  
+  - `boot.dot`
+
+- Journals  
+  - `journal_boot.txt` – full boot journal  
+  - `journal_warn.txt` – warnings and above  
+  - `journal_err.txt` – errors and above  
+
+- Bootchart (if enabled via `init=/lib/systemd/systemd-bootchart`)  
+  - `bootchart.tgz` (if present under `/run/log/...`)
+
+- Failed units  
+  - `failed_units.txt` (from `systemctl --failed`)  
+
+- **KPI breakdown (this run)**  
+  - `boot_kpi_this_run.txt` – structured, human-readable KPI summary
+
+---
+
+### KPI breakdown: fields and exclusions
+
+At the end of the run, the script prints a KPI summary **to console** and
+writes the same content into `boot_kpi_this_run.txt`, for example:
+
+```text
+Boot KPI (this run)
+ boot_type : cold
+ iterations : 5
+ clocksource : arch_sys_counter
+ uefi_time_sec : 438093.283 (Init=214751.707, Exec=223341.576)
+ firmware_time_sec : 3.765
+ bootloader_time_sec : 0.176
+ kernel_time_sec : 6.124
+ userspace_time_sec : 126.942
+ userspace_effective_time_sec : 6.825
+ boot_total_sec : 137.008
+ boot_total_effective_sec : 16.891
+```
+
+Fields:
+
+- `uefi_time_sec`  
+  Sum of UEFI loader Init+Exec time in seconds, derived from EFI vars:
+
+  - `LoaderTimeInitUSec-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f`
+  - `LoaderTimeExecUSec-4a67b082-0a4c-41cf-b6c7-440b29bb8c4f`
+
+  with individual Init/Exec components also printed.
+
+- `firmware_time_sec`, `bootloader_time_sec`, `kernel_time_sec`,
+  `userspace_time_sec`, `boot_total_sec`  
+  Parsed from `systemd-analyze time`:
+
+  ```text
+  Startup finished in 3.801s (firmware) + 174ms (loader) + 6.106s (kernel) + 2min 7.045s (userspace) = 2min 17.127s
+  ```
+
+- `userspace_effective_time_sec`, `boot_total_effective_sec`  
+
+  These are derived from the raw userspace/total time by subtracting:
+
+  1. `systemd-networkd-wait-online.service` time when
+     `--exclude-networkd-wait-online` is passed.
+  2. Any additional services given via `--exclude-services "svc1 svc2"`.
+
+The script logs exclusions clearly, for example:
+
+```text
+[INFO] ... Excluded systemd-networkd-wait-online.service=120.117s from userspace/total; boot_total_effective_sec=16.891
+[INFO] ... Excluded services from userspace/total (sum=2.500s): docker.service=0.966s; NetworkManager.service=1.534s;  boot_total_effective_sec=14.391
+```
+
+If `systemd-analyze time` reports:
+
+```text
+Bootup is not yet finished (org.freedesktop.systemd1.Manager.FinishTimestampMonotonic=0).
+```
+
+the script:
+
+- Marks the timing fields as `unknown`.
+- Logs the active jobs from `systemctl list-jobs` to **console** (they are also
+  saved as `list_jobs_when_boot_unfinished.txt`) so that blocking services
+  (including our own KPI service if misconfigured) are visible during LAVA debugging.
+
+This diagnostic logging happens **even without `--verbose`**.
+
+---
+
+### Verbose mode (`--verbose`)
+
+When `--verbose` is set, the script:
+
+- Prints all “reasonable” `.txt` artifacts from `OUT_DIR` to console
+  (excluding `journal_*.txt` for size reasons).
+- This is intended for LAVA and other CI where you cannot easily inspect the
+  filesystem but can scroll the job log.
+
+Example tail of the verbose section:
+
+```text
+[INFO] ... Verbose mode: dumping text artifacts from ./logs_Boot_Systemd_Validate (excluding journal_*.txt)
+===== analyze_time.txt =====
+Startup finished in ...
+...
+===== boot_kpi_this_run.txt =====
+Boot KPI (this run)
+ ...
+```
+
+---
+
+### Typical usage examples
+
+**1) Basic per-boot KPI with required units**
+
+```sh
+./run.sh --timeout 60 --required required-units.txt
+```
+
+**2) Cold-boot KPI, excluding networkd-wait-online + Docker/Weston**
+
+```sh
+./run.sh --boot-type cold --disable-getty --exclude-networkd-wait-online --exclude-services "docker.service weston.service"
+```
+
+**3) LAVA-friendly verbose run**
+
+```sh
+./run.sh --boot-type warm --disable-getty --exclude-networkd-wait-online --iterations 5 --verbose
+```
+
+In all cases, the main KPI is in `logs_Boot_Systemd_Validate/boot_kpi_this_run.txt`
+and echoed to console.
+
+---
+
+2. `Boot_Systemd_KPI_Loop` – Reboot loop wrapper & KPI aggregator
+-----------------------------------------------------------------
+
+**Path (example):**
+
+```text
+Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh
+```
+
+### Purpose
+
+A **thin wrapper** that drives multiple KPI iterations across reboots and
+computes averages over the last **N boots** of a given `boot_type`.
+
+On each (re)boot it:
+
+1. Loads state from `Boot_Systemd_KPI_Loop.state` (if present) to determine:
+   - Total iterations requested
+   - Iterations already completed
+   - Boot type & options
+   - KPI script path + base out dir
+2. Computes **this iteration index**, and a per-iteration out dir:
+
+   ```text
+   <base_out_dir>/iter_<N>
+   ```
+
+3. Calls `Boot_Systemd_Validate/run.sh` once with:
+   - `--out <base_out_dir>/iter_N`
+   - `--boot-type <TYPE>`
+   - `--iterations <TOTAL>`
+   - Forwarded flags (`--disable-getty`, `--exclude-...`, `--verbose`, etc.)
+4. Parses `boot_kpi_this_run.txt` for this iteration, appends a row into:
+
+   ```text
+   Boot_Systemd_KPI_stats.csv
+   ```
+
+5. Computes averages over the last **N entries** for this `boot_type` and writes:
+
+   ```text
+   Boot_Systemd_KPI_summary.txt
+   ```
+
+6. In **auto-reboot mode**, if more iterations are pending:
+   - Updates `Boot_Systemd_KPI_Loop.state`
+   - Triggers a reboot
+   - A small systemd service (`boot-systemd-kpi-loop.service`) invokes this
+     script again on the next boot until all iterations complete.
+
+When all iterations finish, the wrapper:
+
+- Prints the KPI average summary to console.
+- Leaves `Boot_Systemd_KPI_stats.csv` and `Boot_Systemd_KPI_summary.txt` for further analysis.
+- Cleans up the systemd hook + state file in auto-reboot mode.
+
+---
+
+### Usage (CLI help)
+
+```text
+Usage: ./run.sh [OPTIONS]
+
+This wrapper:
+  * Runs Boot_Systemd_Validate once for the *current boot*
+  * Uses a per-iteration KPI out dir when --iterations > 1:
+      base: ../Boot_Systemd_Validate/logs_Boot_Systemd_Validate
+      iter: <base>/iter_<N>
+  * Parses boot_kpi_this_run.txt from that test
+  * Appends a row into Boot_Systemd_KPI_stats.csv
+  * Computes averages over the last N boots (per boot_type) and prints summary.
+
+Options:
+  --kpi-script PATH   Override Boot_Systemd_Validate script path
+                      (default: ../Boot_Systemd_Validate/run.sh)
+
+  --kpi-out-dir DIR   Override base KPI output dir
+                      (default: ../Boot_Systemd_Validate/logs_Boot_Systemd_Validate)
+
+  --iterations N      Number of boots to average over (default: 1)
+  --boot-type TYPE    Tag for this run (e.g. cold, warm, unknown)
+
+  # Options forwarded to Boot_Systemd_Validate:
+  --disable-getty     Disable serial-getty@ttyS0.service
+  --disable-sshd      Disable sshd.service
+  --exclude-networkd-wait-online
+                      Exclude systemd-networkd-wait-online.service
+  --exclude-services "A B"
+                      Exclude these services from userspace/total
+  --no-svg            Disable SVG plot generation
+  --verbose           Print KPI .txt artifacts to console for debug
+
+  # Auto-reboot orchestration:
+  --auto-reboot       Install systemd hook and auto-reboot until
+                      --iterations boots are collected. State is
+                      stored in: Boot_Systemd_KPI_Loop.state
+
+  -h, --help          Show this help and exit
+```
+
+---
+
+### Files written by the loop wrapper
+
+Under the same directory as `Boot_Systemd_KPI_Loop/run.sh`:
+
+- `Boot_Systemd_KPI_Loop.res`  
+  PASS/FAIL status for the wrapper itself.
+
+- `Boot_Systemd_KPI_Loop.state`  
+  Persistent state across reboots (total iterations, done so far, boot_type,
+  options, KPI script path/out dir). Removed automatically when all iterations
+  complete or on error.
+
+- `Boot_Systemd_KPI_stats.csv`  
+  Rolling KPI database across boots. Each row corresponds to the parsed
+  `boot_kpi_this_run.txt` of one boot (for a given `boot_type`).
+
+- `Boot_Systemd_KPI_summary.txt`  
+  Human-readable summary of averages over the last **N** entries of that
+  `boot_type`, e.g.:
+
+  ```text
+  Boot KPI summary (last 5 cold boot(s))
+   entries_used : 5
+   target_iterations : 5
+   boot_type : cold
+   avg_uefi_time_sec : ...
+   avg_firmware_time_sec : ...
+   avg_bootloader_time_sec : ...
+   avg_kernel_time_sec : ...
+   avg_userspace_time_sec : ...
+   avg_userspace_effective_time_sec : ...
+   avg_boot_total_sec : ...
+   avg_boot_total_effective_sec : ...
+  ```
+
+- `Boot_Systemd_KPI_Loop_stdout_<timestamp>.log`  
+  Stdout/stderr log(s) for the wrapper itself (if you preserve them).
+
+Per-iteration artifacts from `Boot_Systemd_Validate` live under:
+
+```text
+../Boot_Systemd_Validate/logs_Boot_Systemd_Validate/iter_1/
+../Boot_Systemd_Validate/logs_Boot_Systemd_Validate/iter_2/
+...
+```
+
+Each `iter_N` has its own `boot_kpi_this_run.txt`, `analyze_time.txt`, etc.
+
+---
+
+### Auto-reboot mode details
+
+When `--auto-reboot` is passed:
+
+- The wrapper installs a small systemd service (e.g. `boot-systemd-kpi-loop.service`)
+  that runs the wrapper at boot.
+- On each boot, the wrapper:
+  - Runs `Boot_Systemd_Validate` once.
+  - Updates the `.state` file with the new iteration count.
+  - If more iterations are required, it requests `reboot` again.
+- After the final iteration:
+  - KPI averages are computed and printed.
+  - The systemd hook is removed.
+  - The state file is deleted.
+
+The reboot logic is designed to:
+
+- Ensure the reboot actually happens (falling back between `reboot` and `/sbin/reboot`).
+- Avoid blocking `systemd-analyze` permanently: the KPI scripts finish quickly,
+  and if any unit (including our own) prevents boot from completing, it will
+  show up in the “Bootup is not yet finished” diagnostics of each iteration
+  (`iter_N/analyze_time.txt`, `iter_N/list_jobs_when_boot_unfinished.txt`) and in the **console logs**.
+
+---
+
+### Typical usage examples
+
+**1) Manual KPI over last 5 cold boots (no auto-reboot)**
+
+You manually reboot the board between runs:
+
+```sh
+# Boot 1 (cold boot)
+./run.sh --iterations 5 --boot-type cold --disable-getty --exclude-networkd-wait-online
+
+# Reboot the board manually (power-cycle or reboot)
+
+# Boot 2..5 – re-run the same command each time
+./run.sh --iterations 5 --boot-type cold --disable-getty --exclude-networkd-wait-online
+...
+```
+
+After the 5th run, `Boot_Systemd_KPI_summary.txt` will contain the averages over
+the last 5 `cold` entries.
+
+**2) Fully automated cold-boot KPI campaign (auto-reboot)**
+
+```sh
+./run.sh --iterations 5 --boot-type cold --disable-getty --exclude-networkd-wait-online --auto-reboot
+```
+
+The wrapper will:
+
+- Run `Boot_Systemd_Validate` on this boot.
+- Reboot automatically until 5 iterations are captured.
+- Finally, print a KPI summary and clean up the systemd hook/state.
+
+**3) Warm-boot KPI with extra service exclusions and verbose logs**
+
+```sh
+./run.sh --iterations 3 --boot-type warm --disable-getty --exclude-networkd-wait-online --exclude-services "docker.service weston.service" --auto-reboot --verbose
+```
+
+This gives:
+
+- Per-iteration directories: `iter_1`, `iter_2`, `iter_3`.
+- Detailed logs printed to console from `Boot_Systemd_Validate` via `--verbose`.
+- Aggregated averages in `Boot_Systemd_KPI_summary.txt`.
+
+---
+
+3. Which one should I use?
+--------------------------
+
+| Scenario                                      | Recommended test                      | Notes                                                                 |
+|----------------------------------------------|---------------------------------------|-----------------------------------------------------------------------|
+| Standard CI pipeline (no reboot-resume)      | `Boot_Systemd_Validate`               | Run once per job; no reboot inside the script.                        |
+| Manual KPI measurement on a single boot      | `Boot_Systemd_Validate`               | E.g. after changing kernel/systemd configs.                          |
+| Quick health-check of systemd units          | `Boot_Systemd_Validate`               | Use `--required` to gate on critical services.                        |
+| Lab KPI across N cold/warm boots             | `Boot_Systemd_KPI_Loop`               | Wrapper handles per-boot dirs + CSV + averages; you may reboot manually. |
+| Automated multi-boot campaign in lab         | `Boot_Systemd_KPI_Loop` with `--auto-reboot` | State file + systemd hook handle the full loop.                 |
+| CI with explicit reboot-resume support       | `Boot_Systemd_KPI_Loop` (if allowed)  | CI must re-run the script after each reboot.                          |
+
+---
+
+4. Design principles
+--------------------
+
+- **Single responsibility**  
+  - `Boot_Systemd_Validate`: _measure one boot and emit KPIs_.  
+  - `Boot_Systemd_KPI_Loop`: _across boots: state, reboots, aggregation_.
+
+- **CI friendliness**  
+  - CI that cannot handle reboots should only use `Boot_Systemd_Validate`.  
+  - Reboot orchestration via `--auto-reboot` is explicitly opt-in.
+
+- **Robust & transparent**  
+  - Rolling CSV + summary for long-term trends.  
+  - Clear console logs for:
+    - service time exclusions,
+    - non-finished boots (`Bootup is not yet finished` + `systemctl list-jobs`),
+    - per-iteration KPI values.
+
+- **Local logs only**  
+  - All artifacts (CSV, SVG, journals, etc.) are stored under the test’s
+    working directory, making log collection and LAVA parsing straightforward.
diff --git a/Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh b/Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh
new file mode 100755
index 00000000..0c836166
--- /dev/null
+++ b/Runner/suites/Performance/Boot_Systemd_KPI_Loop/run.sh
@@ -0,0 +1,388 @@
+#!/bin/sh
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause-Clear
+# Boot KPI multi-boot aggregator / auto-reboot wrapper around Boot_Systemd_Validate.
+
+# Absolute directory of this script. '|| exit 1' is placed outside $(...): an
+# 'exit' inside the substitution only ends that subshell, leaving SCRIPT_DIR
+# empty while the script carries on with paths rooted at "/".
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) || exit 1
+
+TESTNAME=Boot_Systemd_KPI_Loop
+RES_FILE="./$TESTNAME.res"
+
+# Built-in locations of the wrapped KPI script and of its base output directory
+KPI_SCRIPT_DEFAULT=$SCRIPT_DIR/../Boot_Systemd_Validate/run.sh
+KPI_OUT_DIR_DEFAULT=$SCRIPT_DIR/../Boot_Systemd_Validate/logs_Boot_Systemd_Validate
+
+# Tunables: a non-empty environment value wins here; CLI parsing may override it
+: "${KPI_SCRIPT:=$KPI_SCRIPT_DEFAULT}"
+: "${KPI_OUT_DIR:=$KPI_OUT_DIR_DEFAULT}"
+: "${ITERATIONS:=1}"
+: "${BOOT_TYPE:=unknown}"
+
+: "${DISABLE_GETTY:=0}"
+: "${DISABLE_SSHD:=0}"
+: "${EXCLUDE_NETWORKD_WAIT_ONLINE:=0}"
+: "${EXCLUDE_SERVICES:=}"
+: "${NO_SVG:=0}"
+: "${AUTO_REBOOT:=0}"
+: "${VERBOSE:=0}"
+
+STATE_FILE=$SCRIPT_DIR/Boot_Systemd_KPI_Loop.state
+KPI_REBOOT_STATE_FILE=$SCRIPT_DIR/Boot_Systemd_KPI_reboot.state
+: "${SERVICE_NAME:=boot-systemd-kpi-loop}"
+STATS_CSV=$SCRIPT_DIR/Boot_Systemd_KPI_stats.csv
+SUMMARY_FILE=$SCRIPT_DIR/Boot_Systemd_KPI_summary.txt
+
+# Result reported when the script exits right after requesting a reboot:
+# PASS unless the caller sets SKIP (PASS keeps LAVA from failing the run).
+: "${REBOOT_RESULT_MODE:=PASS}"
+
+# usage: write the CLI help text to stdout; every call site in this script
+# redirects it to stderr. The heredoc delimiter is unquoted, so $0, the
+# default paths and ${STATS_CSV##*/} are expanded when the text is printed.
+usage() {
+  cat <<EOF
+Usage: $0 [OPTIONS]
+
+This wrapper:
+  * Runs Boot_Systemd_Validate once for the *current boot*
+  * Uses a per-iteration KPI out dir when --iterations > 1:
+      base: $KPI_OUT_DIR_DEFAULT
+      iter: <base>/iter_<N>
+  * Parses boot_kpi_this_run.txt from that test
+  * Appends a row into ${STATS_CSV##*/}
+  * Computes averages over the last N boots (per boot_type) and prints summary.
+
+Options:
+  --kpi-script PATH Override Boot_Systemd_Validate script path
+                                 (default: $KPI_SCRIPT_DEFAULT)
+  --kpi-out-dir DIR Override base KPI output dir
+                                 (default: $KPI_OUT_DIR_DEFAULT)
+  --iterations N Number of boots to average over (default: 1)
+  --boot-type TYPE Tag for this run (e.g. cold, warm, unknown)
+
+  # Options forwarded to Boot_Systemd_Validate:
+  --disable-getty Disable serial-getty@ttyS0.service
+  --disable-sshd Disable sshd.service
+  --exclude-networkd-wait-online Exclude systemd-networkd-wait-online.service
+  --exclude-services "A B" Exclude these services from userspace/total
+  --no-svg Disable SVG plot generation
+  --verbose Print KPI .txt artifacts to console for debug
+
+  # Auto-reboot orchestration:
+  --auto-reboot Install systemd hook and auto-reboot until
+                                 --iterations boots are collected. State is
+                                 stored in: $STATE_FILE
+
+  -h, --help Show this help and exit
+
+Example (single run, average over last 5 boots of this type):
+  ./run.sh --iterations 5 --boot-type cold --disable-getty --exclude-networkd-wait-online
+
+Auto-reboot mode (script installs systemd hook + reboots until N boots done):
+  ./run.sh --iterations 5 --boot-type cold --disable-getty \\
+           --exclude-networkd-wait-online --auto-reboot
+EOF
+}
+
+# Serve -h/--help before init_env/functestlib are sourced (stdout capture).
+if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
+  usage >&2; exit 0
+fi
+
+# --- locate and source init_env → functestlib.sh + lib_performance.sh ---
+# Walk upward from the script directory looking for init_env. "." also stops
+# the walk: dirname of an empty or relative path settles on "." forever.
+INIT_ENV=""
+SEARCH="$SCRIPT_DIR"
+while [ "$SEARCH" != "/" ] && [ "$SEARCH" != "." ]; do
+    if [ -f "$SEARCH/init_env" ]; then
+        INIT_ENV="$SEARCH/init_env"
+        break
+    fi
+    SEARCH=$(dirname "$SEARCH")
+done
+
+if [ -z "$INIT_ENV" ]; then
+    echo "[ERROR] Could not find init_env (starting at $SCRIPT_DIR)" >&2
+    exit 1
+fi
+
+# Source init_env only when this shell has not loaded it yet, or when TOOLS is
+# unset (the helper libraries below are located through TOOLS).
+# NOTE: We intentionally **do not export** any new vars. They stay local to this shell.
+if [ -z "${__INIT_ENV_LOADED:-}" ] || [ -z "${TOOLS:-}" ]; then
+    # shellcheck disable=SC1090
+    . "$INIT_ENV"
+    __INIT_ENV_LOADED=1
+fi
+
+# shellcheck disable=SC1091
+. "$TOOLS/functestlib.sh"
+# shellcheck disable=SC1091
+. "$TOOLS/lib_performance.sh"
+
+# --- CLI parsing ---
+# Options that take a value are checked for a following argument first.
+# Without it the second 'shift' would run on an empty argument list, which
+# POSIX allows a non-interactive shell to treat as fatal, and the script
+# would stop without writing a result file.
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --kpi-script|--kpi-out-dir|--iterations|--boot-type|--exclude-services)
+      if [ "$#" -lt 2 ]; then
+        log_warn "Option $1 requires an argument"
+        usage >&2
+        echo "$TESTNAME FAIL" >"$RES_FILE"
+        exit 1
+      fi
+      case "$1" in
+        --kpi-script) KPI_SCRIPT=$2 ;;
+        --kpi-out-dir) KPI_OUT_DIR=$2 ;;
+        --iterations) ITERATIONS=$2 ;;
+        --boot-type) BOOT_TYPE=$2 ;;
+        --exclude-services) EXCLUDE_SERVICES=$2 ;;
+      esac
+      shift
+      ;;
+    --disable-getty)
+      DISABLE_GETTY=1
+      ;;
+    --disable-sshd)
+      DISABLE_SSHD=1
+      ;;
+    --exclude-networkd-wait-online)
+      EXCLUDE_NETWORKD_WAIT_ONLINE=1
+      ;;
+    --no-svg)
+      NO_SVG=1
+      ;;
+    --auto-reboot)
+      AUTO_REBOOT=1
+      ;;
+    --verbose)
+      VERBOSE=1
+      ;;
+    -h|--help)
+      usage >&2
+      exit 0
+      ;;
+    *)
+      log_warn "Unknown option: $1"
+      usage >&2
+      echo "$TESTNAME FAIL" >"$RES_FILE"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+# Validate iterations: a value that is empty or contains a non-digit is
+# replaced by 1 with a warning.
+case "$ITERATIONS" in
+  *[!0-9]* | '')
+    log_warn "Non-numeric --iterations; defaulting to 1"
+    ITERATIONS=1 ;;
+esac
+# An all-digit value below 1 (i.e. zero) is raised to 1 without a warning;
+# stderr of the numeric test is discarded.
+[ "$ITERATIONS" -lt 1 ] 2>/dev/null && ITERATIONS=1
+
+# NEW: auto-enable auto-reboot mode when state exists
+if [ "$AUTO_REBOOT" -eq 0 ] && [ -f "$STATE_FILE" ]; then
+  AUTO_REBOOT=1
+fi
+
+# If we are in auto-reboot mode, first verify whether a previous reboot actually happened.
+if [ "$AUTO_REBOOT" -eq 1 ]; then
+  perf_kpi_check_previous_reboot "$KPI_REBOOT_STATE_FILE"
+fi
+
+# Always log current boot identity for debugging / LAVA traces
+perf_kpi_get_boot_identity
+log_info "$TESTNAME: boot identity → boot_id=${PERF_KPI_BOOT_ID:-unknown} uptime=${PERF_KPI_UPTIME_SEC:-unknown}s"
+
+# Validate KPI script
+if [ ! -x "$KPI_SCRIPT" ]; then
+  log_error "KPI script not executable or missing: $KPI_SCRIPT"
+  echo "$TESTNAME FAIL" >"$RES_FILE"
+  exit 1
+fi
+
+mkdir -p "$KPI_OUT_DIR" 2>/dev/null || true
+
+CURRENT_DONE=0
+
+# --- Auto-reboot: load or initialise state ---
+if [ "$AUTO_REBOOT" -eq 1 ]; then
+  if perf_kpi_load_loop_state "$STATE_FILE"; then
+    # Reuse knobs from state
+    if [ -n "${KPI_LOOP_ITERATIONS_TOTAL:-}" ]; then
+      ITERATIONS=$KPI_LOOP_ITERATIONS_TOTAL
+    fi
+    if [ -n "${KPI_LOOP_BOOT_TYPE:-}" ]; then
+      BOOT_TYPE=$KPI_LOOP_BOOT_TYPE
+    fi
+    if [ -n "${KPI_LOOP_KPI_SCRIPT:-}" ]; then
+      KPI_SCRIPT=$KPI_LOOP_KPI_SCRIPT
+    fi
+    if [ -n "${KPI_LOOP_KPI_OUT_DIR:-}" ]; then
+      KPI_OUT_DIR=$KPI_LOOP_KPI_OUT_DIR
+    fi
+    DISABLE_GETTY=${KPI_LOOP_DISABLE_GETTY:-0}
+    DISABLE_SSHD=${KPI_LOOP_DISABLE_SSHD:-0}
+    EXCLUDE_NETWORKD_WAIT_ONLINE=${KPI_LOOP_EXCLUDE_NETWORKD:-0}
+    EXCLUDE_SERVICES=${KPI_LOOP_EXCLUDE_SERVICES:-}
+    CURRENT_DONE=${KPI_LOOP_ITERATIONS_DONE:-0}
+  else
+    # First time in auto-reboot mode
+    CURRENT_DONE=0
+    perf_kpi_write_loop_state "$STATE_FILE" "$ITERATIONS" "$CURRENT_DONE" \
+      "$BOOT_TYPE" "$DISABLE_GETTY" "$DISABLE_SSHD" \
+      "$EXCLUDE_NETWORKD_WAIT_ONLINE" "$EXCLUDE_SERVICES" \
+      "$KPI_SCRIPT" "$KPI_OUT_DIR"
+    perf_install_kpi_systemd_hook "$SCRIPT_DIR/run.sh" "$SERVICE_NAME"
+  fi
+fi
+
+log_info "$TESTNAME: starting KPI aggregation (boot_type=$BOOT_TYPE, iterations_window=$ITERATIONS, auto_reboot=$AUTO_REBOOT, verbose=$VERBOSE)"
+log_info "$TESTNAME: KPI script → $KPI_SCRIPT"
+log_info "$TESTNAME: KPI base out dir → $KPI_OUT_DIR"
+log_info "$TESTNAME: iterations already done (from state) = $CURRENT_DONE"
+
+# --- Determine this iteration index and concrete out-dir ---
+# The index stays 1 unless auto-reboot mode is active, in which case it
+# continues from the number of iterations already recorded.
+THIS_ITER=1
+[ "$AUTO_REBOOT" -eq 1 ] && THIS_ITER=$((CURRENT_DONE + 1))
+
+# A per-iteration subfolder is used only when more than one boot is averaged.
+RUN_OUT_DIR=$KPI_OUT_DIR
+[ "$ITERATIONS" -gt 1 ] 2>/dev/null && RUN_OUT_DIR=$KPI_OUT_DIR/iter_$THIS_ITER
+# Best effort: a failure to create the directory is ignored at this point.
+mkdir -p "$RUN_OUT_DIR" 2>/dev/null || true
+log_info "$TESTNAME: this iteration=$THIS_ITER, KPI out dir for this run → $RUN_OUT_DIR"
+
+# --- Build argv for Boot_Systemd_Validate ---
+KPI_ARGS="--out $RUN_OUT_DIR --boot-type $BOOT_TYPE --iterations $ITERATIONS"
+if [ "$DISABLE_GETTY" -eq 1 ]; then
+  KPI_ARGS="$KPI_ARGS --disable-getty"
+fi
+if [ "$DISABLE_SSHD" -eq 1 ]; then
+  KPI_ARGS="$KPI_ARGS --disable-sshd"
+fi
+if [ "$EXCLUDE_NETWORKD_WAIT_ONLINE" -eq 1 ]; then
+  KPI_ARGS="$KPI_ARGS --exclude-networkd-wait-online"
+fi
+if [ -n "$EXCLUDE_SERVICES" ]; then
+  KPI_ARGS="$KPI_ARGS --exclude-services \"$EXCLUDE_SERVICES\""
+fi
+if [ "$NO_SVG" -eq 1 ]; then
+  KPI_ARGS="$KPI_ARGS --no-svg"
+fi
+if [ "$VERBOSE" -eq 1 ]; then
+  KPI_ARGS="$KPI_ARGS --verbose"
+fi
+
+# --- Invoke Boot_Systemd_Validate for this boot ---
+log_info "$TESTNAME: invoking KPI script: $KPI_SCRIPT $KPI_ARGS"
+
+# We use 'sh -c' to keep quoting of EXCLUDE_SERVICES intact if present.
+# shellcheck disable=SC2086
+sh -c "\"$KPI_SCRIPT\" $KPI_ARGS"
+rc=$?
+
+if [ "$rc" -ne 0 ]; then
+  log_fail "$TESTNAME: KPI script failed with rc=$rc"
+  echo "$TESTNAME FAIL" >"$RES_FILE"
+  exit "$rc"
+fi
+
+# --- Parse this-run KPI file from this iteration OUT dir ---
+KPI_FILE="$RUN_OUT_DIR/boot_kpi_this_run.txt"
+if [ ! -f "$KPI_FILE" ]; then
+  log_fail "$TESTNAME: KPI file not found for this iteration: $KPI_FILE"
+  echo "$TESTNAME FAIL" >"$RES_FILE"
+  exit 1
+fi
+
+perf_kpi_extract_from_file "$KPI_FILE"
+
+# If Boot_Systemd_Validate wrote empty boot_type, fall back to CLI boot_type
+if [ -z "${PERF_KPI_BOOT_TYPE:-}" ]; then
+  PERF_KPI_BOOT_TYPE="$BOOT_TYPE"
+fi
+
+log_info "$TESTNAME: parsed KPI for this boot (iter=$THIS_ITER, boot_type=$PERF_KPI_BOOT_TYPE, total_sec=${PERF_KPI_BOOT_TOTAL_SEC:-unknown}, total_eff_sec=${PERF_KPI_BOOT_TOTAL_EFFECTIVE_SEC:-unknown})"
+
+if [ "$VERBOSE" -eq 1 ]; then
+  echo "================ boot_kpi_this_run.txt (from $KPI_FILE) ================"
+  cat "$KPI_FILE"
+  echo "======================================================================="
+fi
+
+# --- Append CSV row (global stats CSV under Boot_Systemd_KPI_Loop) ---
+perf_kpi_append_csv_row "$STATS_CSV" "$PERF_KPI_BOOT_TYPE"
+
+# --- Compute averages over last N boots for this boot_type ---
+# A failing helper only produces a warning; on success the summary file is
+# printed between banner lines when it exists.
+if ! perf_kpi_compute_average "$STATS_CSV" "$PERF_KPI_BOOT_TYPE" "$ITERATIONS" "$SUMMARY_FILE"; then
+  log_warn "$TESTNAME: could not compute KPI averages (maybe not enough entries yet)."
+elif [ -f "$SUMMARY_FILE" ]; then
+  echo "================ KPI AVERAGE SUMMARY ================"
+  cat "$SUMMARY_FILE"
+  echo "====================================================="
+fi
+
+# Verbose mode: show the most recent rows of the stats CSV (the whole file
+# when 'tail' fails).
+if [ "$VERBOSE" -eq 1 ] && [ -f "$STATS_CSV" ]; then
+  echo "================ Last KPI CSV rows ($STATS_CSV) ======================="
+  tail -n 5 "$STATS_CSV" 2>/dev/null || cat "$STATS_CSV"
+  echo "======================================================================="
+fi
+
+# --- Auto-reboot decision & cleanup ---
+if [ "$AUTO_REBOOT" -eq 1 ]; then
+  # Count this boot as done and persist the progress before deciding.
+  NEW_DONE=$((CURRENT_DONE + 1))
+  perf_kpi_write_loop_state "$STATE_FILE" "$ITERATIONS" "$NEW_DONE" \
+    "$BOOT_TYPE" "$DISABLE_GETTY" "$DISABLE_SSHD" \
+    "$EXCLUDE_NETWORKD_WAIT_ONLINE" "$EXCLUDE_SERVICES" \
+    "$KPI_SCRIPT" "$KPI_OUT_DIR"
+
+  if ! [ "$NEW_DONE" -lt "$ITERATIONS" ]; then
+    # Campaign finished: drop the hook and both state files, then fall through
+    # to the final PASS at the end of the script.
+    log_info "$TESTNAME: all iterations completed ($NEW_DONE/$ITERATIONS); cleaning up auto-reboot hook."
+    perf_remove_kpi_systemd_hook "$SERVICE_NAME"
+    rm -f "$STATE_FILE" "$KPI_REBOOT_STATE_FILE" 2>/dev/null || true
+  else
+    # More boots needed: prepare reboot tracking state so the next boot can
+    # verify that the reboot succeeded.
+    perf_kpi_get_boot_identity
+    perf_kpi_reboot_state_save "$KPI_REBOOT_STATE_FILE" "$PERF_KPI_BOOT_ID" \
+      "${PERF_KPI_UPTIME_SEC:-}" "1" "$NEW_DONE"
+    log_info "$TESTNAME: completed iteration $NEW_DONE/$ITERATIONS; requesting reboot for next KPI iteration."
+    log_info "$TESTNAME: current boot_id=$PERF_KPI_BOOT_ID uptime=${PERF_KPI_UPTIME_SEC:-unknown}s"
+    perf_kpi_request_reboot "Boot_Systemd_KPI_Loop auto-reboot for next KPI iteration"
+
+    # Still alive here, so the reboot did not occur immediately: record the
+    # interim result, then exit and let systemd/LAVA retry.
+    case "$REBOOT_RESULT_MODE" in
+      SKIP)
+        log_skip "$TESTNAME: reboot requested for next iteration ($NEW_DONE/$ITERATIONS)"
+        echo "$TESTNAME SKIP" >"$RES_FILE"
+        ;;
+      *) echo "$TESTNAME PASS" >"$RES_FILE" ;;
+    esac
+    exit 0
+  fi
+fi
+
+log_pass "$TESTNAME: PASS"
+echo "$TESTNAME PASS" >"$RES_FILE"
+exit 0