diff --git a/test/extended/kernel/common.go b/test/extended/kernel/common.go index 96ed3b6b737d..32b71e07f21a 100644 --- a/test/extended/kernel/common.go +++ b/test/extended/kernel/common.go @@ -2,9 +2,12 @@ package kernel import ( "context" + "encoding/json" + "fmt" "os" "path/filepath" "regexp" + "strings" "time" g "github.com/onsi/ginkgo/v2" @@ -14,6 +17,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/wait" + e2e "k8s.io/kubernetes/test/e2e/framework" ) var ( @@ -37,17 +41,35 @@ func failIfNotRT(oc *exutil.CLI) { o.Expect(len(rtNodes)).NotTo(o.BeZero(), "no realtime nodes are configured") } -func getRealTimeWorkerNodes(oc *exutil.CLI) (nodes map[string]int, err error) { - nodes = make(map[string]int) - +func getRealTimeWorkerNodes(oc *exutil.CLI) (nodes []string, err error) { kubeNodes, err := oc.AsAdmin().KubeClient().CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: "node-role.kubernetes.io/worker="}) if err != nil { return nodes, err } + nodes = make([]string, 0, len(kubeNodes.Items)) + + nodesAreMetal := true + for _, node := range kubeNodes.Items { if realTimeKernelRE.MatchString(node.Status.NodeInfo.KernelVersion) { - nodes[node.Name] = 0 + nodes = append(nodes, node.Name) + + nodeLabels := node.GetLabels() + if !strings.Contains(nodeLabels["node.kubernetes.io/instance-type"], "metal") { + nodesAreMetal = false + } + } + } + + // Pad the latencies for non-metal instances + if !nodesAreMetal { + e2e.Logf("One or more nodes are not a metal instance, setting all real-time test thresholds to 7500 usec") + for test := range rtTestThresholds { + rtTestThresholds[test] = rtThresholdConfig{ + SoftThreshold: 7500, + HardThreshold: 10000, + } } } @@ -101,6 +123,19 @@ func cleanupRtTestPod(oc *exutil.CLI) { o.Expect(err).NotTo(o.HaveOccurred(), "test pod for rt-tests never became ready") } +// Write out a structured latency analysis artifact +func 
writeAnalysisArtifact(testName string, analysis *rtLatencyAnalysis) { + analysisJSON, err := json.MarshalIndent(analysis, "", " ") + if err != nil { + e2e.Logf("WARNING: unable to marshal analysis for %s: %v", testName, err) + return + } + writeTestArtifacts( + fmt.Sprintf("%s_%s_analysis.json", testName, e2e.TimeNow().Format(time.RFC3339)), + string(analysisJSON), + ) +} + // Write out test artifacts func writeTestArtifacts(fname string, content string) { // Create the artifact dir for rt-tests if it does not exist diff --git a/test/extended/kernel/tools.go b/test/extended/kernel/tools.go index 7b0e6cee1966..48287f8e20ef 100644 --- a/test/extended/kernel/tools.go +++ b/test/extended/kernel/tools.go @@ -4,8 +4,11 @@ import ( "context" "encoding/json" "fmt" + "math" + "sort" "strconv" "strings" + "time" exutil "github.com/openshift/origin/test/extended/util" "github.com/pkg/errors" @@ -16,45 +19,109 @@ import ( "k8s.io/utils/cpuset" ) +type rtTestResult string + const ( - hwlatdetectThresholdusec = 7500 - oslatThresholdusec = 7500 - cyclictestThresholdusec = 7500 + rtResultPass rtTestResult = "PASS" + rtResultWarn rtTestResult = "WARN" + rtResultFail rtTestResult = "FAIL" ) +type rtThresholdConfig struct { + SoftThreshold int // Expected max — warn if exceeded + HardThreshold int // Absolute max — fail if exceeded +} + +type cpuLatencyResult struct { + CPU int `json:"cpu"` + MaxLatency int `json:"max_latency_usec"` +} + +type rtLatencyAnalysis struct { + TestName string `json:"test_name"` + SoftThreshold int `json:"soft_threshold_usec"` + HardThreshold int `json:"hard_threshold_usec"` + TotalCPUs int `json:"total_cpus"` + MaxLatency int `json:"max_latency_usec"` + AvgMaxLatency float64 `json:"avg_max_latency_usec"` + P99MaxLatency int `json:"p99_max_latency_usec"` + CPUsOverSoft int `json:"cpus_over_soft_threshold"` + CPUsOverHard int `json:"cpus_over_hard_threshold"` + PercentOverSoft float64 `json:"percent_over_soft_threshold"` + FailedCPUs []cpuLatencyResult 
`json:"failed_cpus,omitempty"` +	Result          rtTestResult `json:"result"` +	FailureReason   string       `json:"failure_reason,omitempty"` +} + +const maxSoftThresholdViolationPercent = 5.0 + +var rtTestThresholds = map[string]rtThresholdConfig{ +	"deadline_test": {SoftThreshold: 100, HardThreshold: 500}, +	"oslat":         {SoftThreshold: 150, HardThreshold: 500}, +	"cyclictest":    {SoftThreshold: 150, HardThreshold: 500}, +	"hwlatdetect":   {SoftThreshold: 100, HardThreshold: 500}, +} + func runPiStressFifo(oc *exutil.CLI) error { args := []string{rtPodName, "--", "pi_stress", "--duration=600", "--groups=1"} - _, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + res, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + if err != nil { + // An error here indicates thresholds were exceeded or an issue with the test + return errors.Wrap(err, "error running pi_stress with the standard algorithm") + } - return err + writeTestArtifacts(fmt.Sprintf("%s_%s.log", "pi_stress_standard", e2e.TimeNow().Format(time.RFC3339)), res) + + return nil } func runPiStressRR(oc *exutil.CLI) error { args := []string{rtPodName, "--", "pi_stress", "--duration=600", "--groups=1", "--rr"} - _, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + res, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + if err != nil { + // An error here indicates thresholds were exceeded or an issue with the test + return errors.Wrap(err, "error running pi_stress with the round-robin algorithm") + } - return err + writeTestArtifacts(fmt.Sprintf("%s_%s.log", "pi_stress_rr", e2e.TimeNow().Format(time.RFC3339)), res) + + return nil } func runDeadlineTest(oc *exutil.CLI) error { - args := []string{rtPodName, "--", "deadline_test"} - _, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + testName := "deadline_test" + + args := []string{rtPodName, "--", + testName, + "-i", fmt.Sprintf("%d", rtTestThresholds[testName].HardThreshold), + 
} + res, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + if err != nil { + // An error here indicates thresholds were exceeded or an issue with the test + return errors.Wrap(err, "error running deadline_test") + } + + writeTestArtifacts(fmt.Sprintf("%s_%s.log", testName, e2e.TimeNow().Format(time.RFC3339)), res) - return err + return nil } func runHwlatdetect(oc *exutil.CLI) error { - args := []string{rtPodName, "--", "hwlatdetect", "--duration=600s", "--window=1s", "--width=500ms", fmt.Sprintf("--threshold=%dus", hwlatdetectThresholdusec)} - _, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() + testName := "hwlatdetect" + args := []string{rtPodName, "--", testName, "--duration=600s", "--window=1s", "--width=500ms", "--debug", fmt.Sprintf("--threshold=%dus", rtTestThresholds[testName].HardThreshold)} + res, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() if err != nil { // An error here indicates thresholds were exceeded or an issue with the test return errors.Wrap(err, "error running hwlatdetect") } + writeTestArtifacts(fmt.Sprintf("%s_%s.log", testName, e2e.TimeNow().Format(time.RFC3339)), res) + return nil } func runOslat(cpuCount int, oc *exutil.CLI) error { + testName := "oslat" oslatReportFile := "/tmp/oslatresults.json" // Make sure there is enough hardware for this test @@ -69,7 +136,7 @@ func runOslat(cpuCount int, oc *exutil.CLI) error { // Run the test args := []string{rtPodName, "--", - "oslat", + testName, "--cpu-list", fmt.Sprintf("%d-%d", reservedCores+1, cpuCount-1), "--cpu-main-thread", fmt.Sprint(reservedCores + 1), "--rtprio", "1", @@ -87,57 +154,34 @@ func runOslat(cpuCount int, oc *exutil.CLI) error { return errors.Wrap(err, "error retrieving oslat results") } - writeTestArtifacts("oslat_results.json", report) - - // Parse the results and return any errors detected - if err = parseOslatResults(report, oslatThresholdusec); err != nil { - return errors.Wrap(err, "error parsing 
oslat report") - } - - return nil -} - -func parseOslatResults(jsonReport string, maxThresholdusec int) error { - var oslatReport struct { - Threads map[string]struct { - Cpu int `json:"cpu"` - Max int `json:"max"` - } `json:"thread"` - } + writeTestArtifacts(fmt.Sprintf("%s_%s.json", testName, e2e.TimeNow().Format(time.RFC3339)), report) - // Parse the data - err := json.Unmarshal([]byte(jsonReport), &oslatReport) + // Parse the results and evaluate against thresholds + analysis, err := parseLatencyResults(testName, report, rtTestThresholds[testName]) if err != nil { - return errors.Wrap(err, "unable to decode oslat report json") - } - - if len(oslatReport.Threads) == 0 { - return fmt.Errorf("no thread reports found") + return errors.Wrap(err, "error parsing oslat report") } - - failedCPUs := make([]int, 0, len(oslatReport.Threads)) // Report all failed cores - for _, thread := range oslatReport.Threads { - if thread.Max > maxThresholdusec { - failedCPUs = append(failedCPUs, thread.Cpu) - } + writeAnalysisArtifact(testName, analysis) + if analysis.Result == rtResultWarn { + e2e.Logf("WARNING: %s - %s", testName, analysis.FailureReason) } - - if len(failedCPUs) > 0 { - return fmt.Errorf("the following CPUs were over the max latency threshold: %v", failedCPUs) + if analysis.Result == rtResultFail { + return fmt.Errorf("%s failed: %s", testName, analysis.FailureReason) } return nil } func runCyclictest(cpuCount int, oc *exutil.CLI) error { + testName := "cyclictest" cyclictestReportFile := "/tmp/cyclictestresults.json" // Make sure there is enough hardware for this test if cpuCount <= 4 { - return fmt.Errorf("more than 4 cores are required to run this oslat test. Found %d cores", cpuCount) + return fmt.Errorf("more than 4 cores are required to run this cyclictest test. 
Found %d cores", cpuCount) } // Run the test - args := []string{rtPodName, "--", "cyclictest", "--duration=10m", "--priority=95", fmt.Sprintf("--threads=%d", cpuCount-5), fmt.Sprintf("--affinity=4-%d", cpuCount-1), "--interval=1000", fmt.Sprintf("--breaktrace=%d", cyclictestThresholdusec), "--mainaffinity=4", "-m", fmt.Sprintf("--json=%s", cyclictestReportFile)} + args := []string{rtPodName, "--", testName, "--duration=10m", "--priority=95", fmt.Sprintf("--threads=%d", cpuCount-5), fmt.Sprintf("--affinity=4-%d", cpuCount-1), "--interval=1000", fmt.Sprintf("--breaktrace=%d", rtTestThresholds[testName].HardThreshold), "--mainaffinity=4", "-m", fmt.Sprintf("--json=%s", cyclictestReportFile)} _, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output() if err != nil { return errors.Wrap(err, "error running cyclictest") @@ -150,46 +194,98 @@ func runCyclictest(cpuCount int, oc *exutil.CLI) error { return errors.Wrap(err, "error retrieving cyclictest results") } - writeTestArtifacts("cyclictest_results.json", report) + writeTestArtifacts(fmt.Sprintf("%s_%s.json", testName, e2e.TimeNow().Format(time.RFC3339)), report) - // Parse the results and return any errors detected - if err = parseCyclictestResults(report, cyclictestThresholdusec); err != nil { + // Parse the results and evaluate against thresholds + analysis, err := parseLatencyResults(testName, report, rtTestThresholds[testName]) + if err != nil { return errors.Wrap(err, "error parsing cyclictest report") } + writeAnalysisArtifact(testName, analysis) + if analysis.Result == rtResultWarn { + e2e.Logf("WARNING: %s - %s", testName, analysis.FailureReason) + } + if analysis.Result == rtResultFail { + return fmt.Errorf("%s failed: %s", testName, analysis.FailureReason) + } return nil } -func parseCyclictestResults(jsonReport string, maxThresholdusec int) error { - var cyclictestReport struct { +func parseLatencyResults(testName string, jsonReport string, thresholds rtThresholdConfig) (*rtLatencyAnalysis, 
error) { + var report struct { Threads map[string]struct { Cpu int `json:"cpu"` Max int `json:"max"` } `json:"thread"` } - // Parse the data - err := json.Unmarshal([]byte(jsonReport), &cyclictestReport) - if err != nil { - return errors.Wrap(err, "unable to decode cyclictest report json") + if err := json.Unmarshal([]byte(jsonReport), &report); err != nil { + return nil, fmt.Errorf("unable to decode %s report json: %w", testName, err) } - if len(cyclictestReport.Threads) == 0 { - return fmt.Errorf("no thread reports found") + if len(report.Threads) == 0 { + return nil, fmt.Errorf("no thread reports found in %s results", testName) } - failedCPUs := make([]int, 0, len(cyclictestReport.Threads)) // Report all failed cores - for _, thread := range cyclictestReport.Threads { - if thread.Max > maxThresholdusec { - failedCPUs = append(failedCPUs, thread.Cpu) + analysis := &rtLatencyAnalysis{ + TestName: testName, + SoftThreshold: thresholds.SoftThreshold, + HardThreshold: thresholds.HardThreshold, + TotalCPUs: len(report.Threads), + } + + maxValues := make([]int, 0, len(report.Threads)) + var sum int + for _, thread := range report.Threads { + maxValues = append(maxValues, thread.Max) + sum += thread.Max + + if thread.Max > analysis.MaxLatency { + analysis.MaxLatency = thread.Max + } + if thread.Max > thresholds.SoftThreshold { + analysis.CPUsOverSoft++ + analysis.FailedCPUs = append(analysis.FailedCPUs, cpuLatencyResult{ + CPU: thread.Cpu, + MaxLatency: thread.Max, + }) + } + if thread.Max > thresholds.HardThreshold { + analysis.CPUsOverHard++ } } - if len(failedCPUs) > 0 { - return fmt.Errorf("the following CPUs were over the max latency threshold: %v", failedCPUs) + analysis.AvgMaxLatency = float64(sum) / float64(analysis.TotalCPUs) + analysis.PercentOverSoft = (float64(analysis.CPUsOverSoft) / float64(analysis.TotalCPUs)) * 100 + + // Compute P99 + sort.Ints(maxValues) + p99Index := int(math.Ceil(0.99*float64(len(maxValues)))) - 1 + if p99Index < 0 { + p99Index = 0 + 
} + analysis.P99MaxLatency = maxValues[p99Index] + + // Determine result + switch { + case analysis.CPUsOverHard > 0: + analysis.Result = rtResultFail + analysis.FailureReason = fmt.Sprintf("%d/%d CPUs exceeded hard threshold of %dus (max observed: %dus)", + analysis.CPUsOverHard, analysis.TotalCPUs, thresholds.HardThreshold, analysis.MaxLatency) + case analysis.PercentOverSoft > maxSoftThresholdViolationPercent: + analysis.Result = rtResultFail + analysis.FailureReason = fmt.Sprintf("%d/%d CPUs (%.1f%%) exceeded soft threshold of %dus — systemic latency issue", + analysis.CPUsOverSoft, analysis.TotalCPUs, analysis.PercentOverSoft, thresholds.SoftThreshold) + case analysis.CPUsOverSoft > 0: + analysis.Result = rtResultWarn + analysis.FailureReason = fmt.Sprintf("%d/%d CPUs (%.1f%%) exceeded soft threshold of %dus (max observed: %dus) — isolated spikes within tolerance", + analysis.CPUsOverSoft, analysis.TotalCPUs, analysis.PercentOverSoft, thresholds.SoftThreshold, analysis.MaxLatency) + default: + analysis.Result = rtResultPass } - return nil + return analysis, nil } func getProcessorCount(oc *exutil.CLI) (int, error) { diff --git a/test/extended/kernel/tools_test.go b/test/extended/kernel/tools_test.go new file mode 100644 index 000000000000..94d06a1edeec --- /dev/null +++ b/test/extended/kernel/tools_test.go @@ -0,0 +1,187 @@ +package kernel + +import ( + "encoding/json" + "fmt" + "testing" +) + +// buildTestReport generates a JSON report string matching the oslat/cyclictest format. +// cpuMaxValues is a map of CPU ID to max latency in usec. 
+func buildTestReport(cpuMaxValues map[int]int) string { + threads := make(map[string]struct { + Cpu int `json:"cpu"` + Max int `json:"max"` + }) + for cpu, max := range cpuMaxValues { + threads[fmt.Sprintf("%d", cpu)] = struct { + Cpu int `json:"cpu"` + Max int `json:"max"` + }{Cpu: cpu, Max: max} + } + report := struct { + Threads map[string]struct { + Cpu int `json:"cpu"` + Max int `json:"max"` + } `json:"thread"` + }{Threads: threads} + + data, _ := json.Marshal(report) + return string(data) +} + +func TestParseLatencyResults_AllPass(t *testing.T) { + cpus := make(map[int]int, 80) + for i := 0; i < 80; i++ { + cpus[i] = 50 // All well under 200us soft threshold + } + report := buildTestReport(cpus) + thresholds := rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + analysis, err := parseLatencyResults("oslat", report, thresholds) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if analysis.Result != rtResultPass { + t.Errorf("expected PASS, got %s: %s", analysis.Result, analysis.FailureReason) + } + if analysis.CPUsOverSoft != 0 { + t.Errorf("expected 0 CPUs over soft, got %d", analysis.CPUsOverSoft) + } + if analysis.CPUsOverHard != 0 { + t.Errorf("expected 0 CPUs over hard, got %d", analysis.CPUsOverHard) + } +} + +func TestParseLatencyResults_WarnFewOverSoft(t *testing.T) { + cpus := make(map[int]int, 100) + for i := 0; i < 100; i++ { + cpus[i] = 50 + } + // 1 CPU over soft (1% < 5%), but under hard + cpus[17] = 211 + + report := buildTestReport(cpus) + thresholds := rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + analysis, err := parseLatencyResults("oslat", report, thresholds) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if analysis.Result != rtResultWarn { + t.Errorf("expected WARN, got %s: %s", analysis.Result, analysis.FailureReason) + } + if analysis.CPUsOverSoft != 1 { + t.Errorf("expected 1 CPU over soft, got %d", analysis.CPUsOverSoft) + } + if analysis.CPUsOverHard != 0 { + 
t.Errorf("expected 0 CPUs over hard, got %d", analysis.CPUsOverHard) + } + if analysis.MaxLatency != 211 { + t.Errorf("expected max latency 211, got %d", analysis.MaxLatency) + } +} + +func TestParseLatencyResults_FailSystemic(t *testing.T) { + cpus := make(map[int]int, 100) + for i := 0; i < 100; i++ { + cpus[i] = 50 + } + // 10 CPUs over soft (10% > 5%), but under hard + for i := 0; i < 10; i++ { + cpus[i] = 300 + } + + report := buildTestReport(cpus) + thresholds := rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + analysis, err := parseLatencyResults("cyclictest", report, thresholds) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if analysis.Result != rtResultFail { + t.Errorf("expected FAIL, got %s: %s", analysis.Result, analysis.FailureReason) + } + if analysis.CPUsOverSoft != 10 { + t.Errorf("expected 10 CPUs over soft, got %d", analysis.CPUsOverSoft) + } + if analysis.PercentOverSoft != 10.0 { + t.Errorf("expected 10.0%% over soft, got %.1f%%", analysis.PercentOverSoft) + } +} + +func TestParseLatencyResults_FailHardThreshold(t *testing.T) { + cpus := make(map[int]int, 80) + for i := 0; i < 80; i++ { + cpus[i] = 50 + } + // 1 CPU exceeds hard threshold + cpus[35] = 500 + + report := buildTestReport(cpus) + thresholds := rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + analysis, err := parseLatencyResults("oslat", report, thresholds) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if analysis.Result != rtResultFail { + t.Errorf("expected FAIL, got %s: %s", analysis.Result, analysis.FailureReason) + } + if analysis.CPUsOverHard != 1 { + t.Errorf("expected 1 CPU over hard, got %d", analysis.CPUsOverHard) + } +} + +func TestParseLatencyResults_CorrectStatistics(t *testing.T) { + // 10 CPUs with known values for deterministic statistics + cpus := map[int]int{ + 0: 10, 1: 20, 2: 30, 3: 40, 4: 50, + 5: 60, 6: 70, 7: 80, 8: 90, 9: 100, + } + report := buildTestReport(cpus) + thresholds := 
rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + analysis, err := parseLatencyResults("oslat", report, thresholds) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if analysis.TotalCPUs != 10 { + t.Errorf("expected 10 total CPUs, got %d", analysis.TotalCPUs) + } + if analysis.MaxLatency != 100 { + t.Errorf("expected max latency 100, got %d", analysis.MaxLatency) + } + expectedAvg := 55.0 // (10+20+30+40+50+60+70+80+90+100)/10 + if analysis.AvgMaxLatency != expectedAvg { + t.Errorf("expected avg max latency %.1f, got %.1f", expectedAvg, analysis.AvgMaxLatency) + } + // P99 of 10 values: ceil(0.99*10)-1 = 9, so sorted[9] = 100 + if analysis.P99MaxLatency != 100 { + t.Errorf("expected P99 max latency 100, got %d", analysis.P99MaxLatency) + } + if analysis.Result != rtResultPass { + t.Errorf("expected PASS, got %s", analysis.Result) + } +} + +func TestParseLatencyResults_EmptyThreads(t *testing.T) { + report := `{"thread": {}}` + thresholds := rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + _, err := parseLatencyResults("oslat", report, thresholds) + if err == nil { + t.Fatal("expected error for empty threads, got nil") + } +} + +func TestParseLatencyResults_InvalidJSON(t *testing.T) { + report := `{invalid json` + thresholds := rtThresholdConfig{SoftThreshold: 200, HardThreshold: 400} + + _, err := parseLatencyResults("oslat", report, thresholds) + if err == nil { + t.Fatal("expected error for invalid JSON, got nil") + } +}