From 0c1aeb61f8985a9ecf6da6885cb87418364507f7 Mon Sep 17 00:00:00 2001
From: Stephen Benjamin
Date: Fri, 19 Dec 2025 15:11:03 -0500
Subject: [PATCH] fix(test): prevent global BeforeEach/AfterEach registration and add detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When exutil.FixturePath() or exutil.NewCLI*() is called at package level (in a
var declaration outside any function), it registers BeforeEach/AfterEach hooks
at the root of the Ginkgo tree. This causes these hooks to run for EVERY SINGLE
TEST in the suite, even when not needed, wasting CI resources and adding
unnecessary test execution time.

This PR fixes the remaining cases of package-level FixturePath() calls by
converting them to lazy functions:

- kernel/common.go: rtEnvFixture, rtPodFixture
- postgresql_replica.go: postgreSQLEphemeralTemplate
- mysql_replica.go: helperTemplate
- networking/util.go: ipsecConfigurationBaseDir, nsMachineConfigFixture, nsNodeRebootNoneFixture

Additionally, this PR adds runtime detection that creates flaky JUnit test
cases when global nodes are detected. This allows CI to track future
violations without blocking test runs. The detection:

- Groups tests by image and binary
- Identifies code locations appearing in 100% of tests for each binary
- Reports violations as [sig-ci] flaky tests with detailed fix instructions
- Skips binaries with <25 tests to avoid false positives
- Supports an allowlist for temporary exceptions
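
For illustration only (not part of this change; the package, suite, and spec
names below are made up), the Ginkgo scoping difference looks like this:

    package example_test

    import (
        "testing"

        g "github.com/onsi/ginkgo/v2"
        o "github.com/onsi/gomega"
    )

    // Registered during package init (the same moment a package-level
    // `var x = exutil.FixturePath(...)` would run), so this node attaches to
    // the root of the Ginkgo tree and runs before EVERY spec in the suite.
    var _ = g.BeforeEach(func() {
        // expensive fixture setup happens here for all specs, related or not
    })

    var _ = g.Describe("scoped setup", func() {
        // Registered while building this container, so it only runs for the
        // specs declared below.
        g.BeforeEach(func() {})

        g.It("pays for its own setup only", func() {
            o.Expect(true).To(o.BeTrue())
        })
    })

    func TestExample(t *testing.T) {
        o.RegisterFailHandler(g.Fail)
        g.RunSpecs(t, "Example")
    }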

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 pkg/test/extensions/global_node_detection.go  | 254 ++++++++++++++++++
 pkg/test/ginkgo/cmd_runsuite.go               |   5 +
 .../extended/image_ecosystem/mysql_replica.go |   9 +-
 .../image_ecosystem/postgresql_replica.go     |   7 +-
 test/extended/kernel/common.go                |  18 +-
 test/extended/networking/ipsec.go             |   2 +-
 test/extended/networking/util.go              |  17 +-
 7 files changed, 296 insertions(+), 16 deletions(-)
 create mode 100644 pkg/test/extensions/global_node_detection.go

diff --git a/pkg/test/extensions/global_node_detection.go b/pkg/test/extensions/global_node_detection.go
new file mode 100644
index 000000000000..a83d38dc4c16
--- /dev/null
+++ b/pkg/test/extensions/global_node_detection.go
@@ -0,0 +1,254 @@
+package extensions
+
+import (
+	"fmt"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+)
+
+// binaryKey uniquely identifies a test binary by its image and path.
+type binaryKey struct {
+	imageTag   string
+	binaryPath string
+}
+
+// binaryResult holds the detection results for a single binary.
+type binaryResult struct {
+	binaryName      string // Just the filename, not the full path
+	globalLocations []string
+	totalTests      int
+}
+
+// imageResult groups all binary results for a single image.
+type imageResult struct {
+	imageTag string
+	binaries []binaryResult
+}
+
+// CheckForGlobalNodes checks if any code locations appear in ALL tests for each binary,
+// which indicates that BeforeEach/AfterEach nodes were registered at the global level
+// (root of the Ginkgo tree). This is a serious bug because these hooks run for EVERY test,
+// wasting resources and time. In some cases, these global nodes can
+// interfere with test operations.
+//
+// Tests are grouped by their source image, and each binary within an image is checked separately.
+//
+// Returns JUnit test cases as flakes (failing + passing with same name) for each image
+// that has global nodes detected. This allows the issue to be tracked in CI without
+// blocking the test run for now.
+func CheckForGlobalNodes(specs ExtensionTestSpecs) []*junitapi.JUnitTestCase {
+	// Group tests by image tag and binary path
+	specsByBinary := make(map[binaryKey]ExtensionTestSpecs)
+	for _, spec := range specs {
+		if spec.Binary == nil {
+			continue
+		}
+		key := binaryKey{
+			imageTag:   spec.Binary.imageTag,
+			binaryPath: spec.Binary.binaryPath,
+		}
+		specsByBinary[key] = append(specsByBinary[key], spec)
+	}
+
+	// Check each binary for global nodes, grouped by image
+	resultsByImage := make(map[string][]binaryResult)
+
+	for key, binarySpecs := range specsByBinary {
+		// Skip binaries with fewer than 25 tests - can't detect global nodes meaningfully
+		// with small test counts as it would generate false positives
+		if len(binarySpecs) < 25 {
+			continue
+		}
+
+		totalTests := len(binarySpecs)
+
+		// Count how many unique tests contain each code location
+		locationToTests := make(map[string]map[string]struct{})
+		for _, spec := range binarySpecs {
+			for _, loc := range spec.CodeLocations {
+				if locationToTests[loc] == nil {
+					locationToTests[loc] = make(map[string]struct{})
+				}
+				locationToTests[loc][spec.Name] = struct{}{}
+			}
+		}
+
+		// Find code locations that appear in ALL tests for this binary
+		var globalLocations []string
+		for loc, tests := range locationToTests {
+			if len(tests) != totalTests {
+				continue
+			}
+
+			// Skip locations that are expected to be in all tests
+			if isExpectedGlobalLocation(loc) {
+				continue
+			}
+
+			globalLocations = append(globalLocations, loc)
+		}
+
+		if len(globalLocations) > 0 {
+			sort.Strings(globalLocations)
+			// Use just the binary filename, not the full extracted path
+			binaryName := filepath.Base(key.binaryPath)
+			resultsByImage[key.imageTag] = append(resultsByImage[key.imageTag], binaryResult{
+				binaryName:      binaryName,
+				globalLocations: globalLocations,
+				totalTests:      totalTests,
+			})
+		}
+	}
+
+	if len(resultsByImage) == 0 {
+		return nil
+	}
+
+	// Build sorted list of image results
+	var imageResults []imageResult
+	for imageTag, binaries := range resultsByImage {
+		// Sort binaries within each image for consistent output
+		sort.Slice(binaries, func(i, j int) bool {
+			return binaries[i].binaryName < binaries[j].binaryName
+		})
+		imageResults = append(imageResults, imageResult{
+			imageTag: imageTag,
+			binaries: binaries,
+		})
+	}
+
+	// Sort by imageTag for consistent output
+	sort.Slice(imageResults, func(i, j int) bool {
+		return imageResults[i].imageTag < imageResults[j].imageTag
+	})
+
+	// Create JUnit test cases as flakes (one failing, one passing per image)
+	var testCases []*junitapi.JUnitTestCase
+
+	for _, result := range imageResults {
+		testName := fmt.Sprintf("[sig-ci] image %s should not have global BeforeEach/AfterEach nodes", result.imageTag)
+
+		// Build detailed failure message
+		failureOutput := buildGlobalNodeFailureMessage(result.imageTag, result.binaries)
+
+		// Count total global locations across all binaries
+		totalLocations := 0
+		for _, b := range result.binaries {
+			totalLocations += len(b.globalLocations)
+		}
+
+		// Create failing test case
+		failingCase := &junitapi.JUnitTestCase{
+			Name: testName,
+			FailureOutput: &junitapi.FailureOutput{
+				Message: fmt.Sprintf("Found %d global BeforeEach/AfterEach code locations in image %s across %d binary(ies)", totalLocations, result.imageTag, len(result.binaries)),
+				Output:  failureOutput,
+			},
+		}
+		testCases = append(testCases, failingCase)
+
+		// Create passing test case (same name = flake)
+		passingCase := &junitapi.JUnitTestCase{
+			Name: testName,
+		}
+		testCases = append(testCases, passingCase)
+	}
+
+	return testCases
+}
+
+// buildGlobalNodeFailureMessage creates a detailed message explaining the global node issue.
+func buildGlobalNodeFailureMessage(imageTag string, binaries []binaryResult) string {
+	var sb strings.Builder
+
+	sb.WriteString("\n")
+	sb.WriteString("╔══════════════════════════════════════════════════════════════════════════════╗\n")
+	sb.WriteString("║                   GLOBAL BEFOREEACH/AFTEREACH DETECTED                        ║\n")
+	sb.WriteString("╚══════════════════════════════════════════════════════════════════════════════╝\n")
+	sb.WriteString("\n")
+	sb.WriteString(fmt.Sprintf("IMAGE: %s\n", imageTag))
+	sb.WriteString("\n")
+
+	for _, binary := range binaries {
+		sb.WriteString(fmt.Sprintf("BINARY: %s (%d tests)\n", binary.binaryName, binary.totalTests))
+		sb.WriteString("GLOBAL CODE LOCATIONS:\n")
+		for _, loc := range binary.globalLocations {
+			sb.WriteString(fmt.Sprintf("  • %s\n", loc))
+		}
+		sb.WriteString("\n")
+	}
+
+	sb.WriteString("PROBLEM: The code locations above appear in ALL tests for their respective binaries,\n")
+	sb.WriteString("indicating that BeforeEach or AfterEach hooks were registered at the global level.\n")
+	sb.WriteString("\n")
+	sb.WriteString("This means these hooks run for EVERY SINGLE TEST even when not needed,\n")
+	sb.WriteString("wasting CI resources and adding unnecessary test execution time.\n")
+	sb.WriteString("\n")
+	sb.WriteString("COMMON CAUSES:\n")
+	sb.WriteString("\n")
+	sb.WriteString("1. Package-level FixturePath() call:\n")
+	sb.WriteString("   BAD:  var myFixture = exutil.FixturePath(\"testdata\", \"file.yaml\")\n")
+	sb.WriteString("   GOOD: func myFixture() string { return exutil.FixturePath(\"testdata\", \"file.yaml\") }\n")
+	sb.WriteString("\n")
+	sb.WriteString("2. Package-level NewCLI/NewCLIWithoutNamespace call:\n")
+	sb.WriteString("   BAD:  var oc = exutil.NewCLIWithoutNamespace(\"test\")\n")
+	sb.WriteString("   GOOD: Inside g.Describe(): oc := exutil.NewCLIWithoutNamespace(\"test\")\n")
+	sb.WriteString("\n")
+	sb.WriteString("HOW TO FIX:\n")
+	sb.WriteString("\n")
+	sb.WriteString("1. Find the file and line number from the locations above\n")
+	sb.WriteString("2. Look for package-level var declarations that call FixturePath() or NewCLI*()\n")
+	sb.WriteString("3. Convert them to functions (for FixturePath) or move inside Describe blocks (for CLI)\n")
+	sb.WriteString("\n")
+	sb.WriteString("EXAMPLE FIX FOR FixturePath:\n")
+	sb.WriteString("\n")
+	sb.WriteString("  // BEFORE (causes global BeforeEach):\n")
+	sb.WriteString("  var myFixture = exutil.FixturePath(\"testdata\", \"file.yaml\")\n")
+	sb.WriteString("  func useFixture() { oc.Run(\"apply\").Args(\"-f\", myFixture) }\n")
+	sb.WriteString("\n")
+	sb.WriteString("  // AFTER (lazy evaluation, no global BeforeEach):\n")
+	sb.WriteString("  func myFixture() string { return exutil.FixturePath(\"testdata\", \"file.yaml\") }\n")
+	sb.WriteString("  func useFixture() { oc.Run(\"apply\").Args(\"-f\", myFixture()) }\n")
+	sb.WriteString("\n")
+	sb.WriteString("FOR AI ASSISTANTS:\n")
+	sb.WriteString("If you're Claude or another AI helping fix this, search the codebase for the file\n")
+	sb.WriteString("paths shown above. Look for package-level 'var' declarations that call\n")
+	sb.WriteString("exutil.FixturePath() or exutil.NewCLI*(). Convert FixturePath vars to functions\n")
+	sb.WriteString("that return the path. Move NewCLI* calls inside Ginkgo Describe/Context blocks.\n")
+	sb.WriteString("\n")
+
+	return sb.String()
+}
+
+// allowedGlobalLocations is an exception list of code locations that are known
+// to register global BeforeEach/AfterEach nodes. These should be fixed
+// unless the global registration is intentional.
+var allowedGlobalLocations = map[string]string{
+	// Add exceptions here with an explanation, e.g.:
+	// "github.com/openshift/origin/test/extended/networking/util.go:82": "FIXME: package-level FixturePath call",
+}
+
+// isExpectedGlobalLocation returns true for code locations that are expected
+// to appear in all tests and should not trigger the global node detection.
+func isExpectedGlobalLocation(loc string) bool {
+	// Check exact match in allowlist
+	if _, ok := allowedGlobalLocations[loc]; ok {
+		return true
+	}
+
+	// Check pattern matches for framework infrastructure that's legitimately global
+	expectedPatterns := []string{
+		// None currently - if we find legitimate cases, add them here with comments
+	}
+
+	for _, pattern := range expectedPatterns {
+		if strings.Contains(loc, pattern) {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go
index df1b73b0f641..5bf52aaccb85 100644
--- a/pkg/test/ginkgo/cmd_runsuite.go
+++ b/pkg/test/ginkgo/cmd_runsuite.go
@@ -220,6 +220,10 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
 	logrus.Infof("Discovered %d total tests", len(specs))
 
+	// Check for global BeforeEach/AfterEach nodes in each binary
+	// Returns JUnit test cases as flakes to track the issue without blocking
+	globalNodeTestCases := extensions.CheckForGlobalNodes(specs)
+
 	// skip tests due to newer k8s
 	restConfig, err := clusterinfo.GetMonitorRESTConfig()
 	if err != nil {
@@ -618,6 +622,7 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
 	var syntheticTestResults []*junitapi.JUnitTestCase
 	var syntheticFailure bool
 	syntheticTestResults = append(syntheticTestResults, stableClusterTestResults...)
+	syntheticTestResults = append(syntheticTestResults, globalNodeTestCases...)
 
 	timeSuffix := fmt.Sprintf("_%s", start.UTC().Format("20060102-150405"))
 
diff --git a/test/extended/image_ecosystem/mysql_replica.go b/test/extended/image_ecosystem/mysql_replica.go
index 42a635388de5..2919a574c399 100644
--- a/test/extended/image_ecosystem/mysql_replica.go
+++ b/test/extended/image_ecosystem/mysql_replica.go
@@ -34,10 +34,13 @@ var (
 			false,
 		},
 	}
-	helperTemplate = exutil.FixturePath("..", "..", "examples", "db-templates", "mysql-ephemeral-template.json")
-	helperName     = "mysql-helper"
+	helperName = "mysql-helper"
 )
 
+func helperTemplate() string {
+	return exutil.FixturePath("..", "..", "examples", "db-templates", "mysql-ephemeral-template.json")
+}
+
 // CreateMySQLReplicationHelpers creates a set of MySQL helpers for master,
 // slave and an extra helper that is used for remote login test.
 func CreateMySQLReplicationHelpers(c kcoreclient.PodInterface, masterDeployment, slaveDeployment, helperDeployment string, slaveCount int) (exutil.Database, []exutil.Database, exutil.Database) {
@@ -82,7 +85,7 @@ func replicationTestFactory(oc *exutil.CLI, tc testCase, cleanup func()) func()
 	err = oc.Run("new-app").Args("--template", tc.TemplateName).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred())
 
-	err = oc.Run("new-app").Args("-f", helperTemplate, "-p", fmt.Sprintf("MYSQL_VERSION=%s", tc.Version), "-p", fmt.Sprintf("DATABASE_SERVICE_NAME=%s", helperName)).Execute()
+	err = oc.Run("new-app").Args("-f", helperTemplate(), "-p", fmt.Sprintf("MYSQL_VERSION=%s", tc.Version), "-p", fmt.Sprintf("DATABASE_SERVICE_NAME=%s", helperName)).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred())
 
 	// oc.KubeFramework().WaitForAnEndpoint currently will wait forever; for now, prefacing with our WaitForADeploymentToComplete,
diff --git a/test/extended/image_ecosystem/postgresql_replica.go b/test/extended/image_ecosystem/postgresql_replica.go
index 6d0a5294d8cd..76696a2749d5 100644
--- a/test/extended/image_ecosystem/postgresql_replica.go
+++ b/test/extended/image_ecosystem/postgresql_replica.go
@@ -20,13 +20,16 @@ import (
 var (
 	postgreSQLReplicationTemplate = "https://raw.githubusercontent.com/sclorg/postgresql-container/master/examples/replica/postgresql_replica.json"
-	postgreSQLEphemeralTemplate   = exutil.FixturePath("..", "..", "examples", "db-templates", "postgresql-ephemeral-template.json")
 	postgreSQLHelperName          = "postgresql-helper"
 	postgreSQLImages              = []string{
 		"postgresql:9.6",
 	}
 )
 
+func postgreSQLEphemeralTemplate() string {
+	return exutil.FixturePath("..", "..", "examples", "db-templates", "postgresql-ephemeral-template.json")
+}
+
 /*
 var _ = g.Describe("[sig-devex][Feature:ImageEcosystem][postgresql][Slow][Local] openshift postgresql replication", func() {
 	defer g.GinkgoRecover()
@@ -130,7 +133,7 @@ func PostgreSQLReplicationTestFactory(oc *exutil.CLI, image string, cleanup func
 	err = oc.Run("new-app").Args("--template", "pg-replica-example", "-p", fmt.Sprintf("IMAGESTREAMTAG=%s", image)).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred())
 
-	err = oc.Run("new-app").Args("-f", postgreSQLEphemeralTemplate, "-p", fmt.Sprintf("DATABASE_SERVICE_NAME=%s", postgreSQLHelperName)).Execute()
+	err = oc.Run("new-app").Args("-f", postgreSQLEphemeralTemplate(), "-p", fmt.Sprintf("DATABASE_SERVICE_NAME=%s", postgreSQLHelperName)).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred())
 
 	g.By("PV/PVC dump after setup")
diff --git a/test/extended/kernel/common.go b/test/extended/kernel/common.go
index 96ed3b6b737d..0cef82c4c572 100644
--- a/test/extended/kernel/common.go
+++ b/test/extended/kernel/common.go
@@ -23,12 +23,18 @@ var (
 	//   - 5.14.0-430.el9.x86_64+rt
 	// Continue using regex to tighten the match for both versions
 	realTimeKernelRE = regexp.MustCompile(".*[.+]rt.*")
-	rtEnvFixture     = exutil.FixturePath("testdata", "kernel", "rt-tests-environment.yaml")
-	rtPodFixture     = exutil.FixturePath("testdata", "kernel", "rt-tests-pod.yaml")
 	rtNamespace      = "ci-realtime-testbed"
 	rtPodName        = "rt-tests"
 )
 
+func rtEnvFixture() string {
+	return exutil.FixturePath("testdata", "kernel", "rt-tests-environment.yaml")
+}
+
+func rtPodFixture() string {
+	return exutil.FixturePath("testdata", "kernel", "rt-tests-pod.yaml")
+}
+
 func failIfNotRT(oc *exutil.CLI) {
 	g.By("checking kernel configuration")
 
@@ -57,14 +63,14 @@ func getRealTimeWorkerNodes(oc *exutil.CLI) (nodes map[string]int, err error) {
 // Setup the cluster infra needed for running RT tests
 func configureRealtimeTestEnvironment(oc *exutil.CLI) {
 	g.By("Setting up the privileged environment needed for realtime tests")
-	err := oc.SetNamespace(rtNamespace).Run("apply").Args("-f", rtEnvFixture).Execute()
+	err := oc.SetNamespace(rtNamespace).Run("apply").Args("-f", rtEnvFixture()).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred(), "unable to create namespace and service accounts for rt-tests")
 }
 
 // Tear down the infra setup we used for testing
 func cleanupRealtimeTestEnvironment(oc *exutil.CLI) {
 	g.By("Cleaning up the privileged environment needed for realtime tests")
-	err := oc.SetNamespace(rtNamespace).Run("delete").Args("-f", rtEnvFixture).Execute()
+	err := oc.SetNamespace(rtNamespace).Run("delete").Args("-f", rtEnvFixture()).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred(), "unable to clean up the namespace and service accounts for rt-tests")
 
 	err = wait.PollImmediate(1*time.Second, 60*time.Second, func() (bool, error) {
@@ -82,7 +88,7 @@ func cleanupRealtimeTestEnvironment(oc *exutil.CLI) {
 // Setup the pod that will be used to run the test
 func startRtTestPod(oc *exutil.CLI) {
 	g.By("Setting up the pod needed for realtime tests")
-	err := oc.SetNamespace(rtNamespace).Run("apply").Args("-f", rtPodFixture).Execute()
+	err := oc.SetNamespace(rtNamespace).Run("apply").Args("-f", rtPodFixture()).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred(), "unable to create test pod for rt-tests")
 
 	// Wait for the container to be ready to go
@@ -93,7 +99,7 @@
 // Cleanup the pod used for the test
 func cleanupRtTestPod(oc *exutil.CLI) {
 	g.By("Cleaning up the pod needed for realtime tests")
-	err := oc.SetNamespace(rtNamespace).Run("delete").Args("-f", rtPodFixture).Execute()
+	err := oc.SetNamespace(rtNamespace).Run("delete").Args("-f", rtPodFixture()).Execute()
 	o.Expect(err).NotTo(o.HaveOccurred(), "unable to clean up test pod for rt-tests")
 
 	// Wait for the container to be ready to go
diff --git a/test/extended/networking/ipsec.go b/test/extended/networking/ipsec.go
index e71d21638d33..5c0ee83681b2 100644
--- a/test/extended/networking/ipsec.go
+++ b/test/extended/networking/ipsec.go
@@ -505,7 +505,7 @@ var _ = g.Describe("[sig-network][Feature:IPsec]", g.Ordered, func() {
 		// nodes don't go for a reboot while rolling out `99-worker-north-south-ipsec-config`
 		// machine config which configures certificates for testing IPsec north south traffic.
g.By("deploy machine configuration policy") - err = oc.AsAdmin().Run("apply").Args("-f", nsNodeRebootNoneFixture).Execute() + err = oc.AsAdmin().Run("apply").Args("-f", nsNodeRebootNoneFixture()).Execute() o.Expect(err).NotTo(o.HaveOccurred()) mg.WaitForBootImageControllerToComplete(oc) diff --git a/test/extended/networking/util.go b/test/extended/networking/util.go index ec1bc5ff59a8..ea9ed1f542ba 100644 --- a/test/extended/networking/util.go +++ b/test/extended/networking/util.go @@ -79,11 +79,20 @@ const ( var ( masterRoleMachineConfigLabel = map[string]string{"machineconfiguration.openshift.io/role": "master"} workerRoleMachineConfigLabel = map[string]string{"machineconfiguration.openshift.io/role": "worker"} - ipsecConfigurationBaseDir = exutil.FixturePath("testdata", "ipsec") - nsMachineConfigFixture = filepath.Join(ipsecConfigurationBaseDir, "nsconfig-machine-config.yaml") - nsNodeRebootNoneFixture = filepath.Join(ipsecConfigurationBaseDir, "nsconfig-reboot-none-policy.yaml") ) +func ipsecConfigurationBaseDir() string { + return exutil.FixturePath("testdata", "ipsec") +} + +func nsMachineConfigFixture() string { + return filepath.Join(ipsecConfigurationBaseDir(), "nsconfig-machine-config.yaml") +} + +func nsNodeRebootNoneFixture() string { + return filepath.Join(ipsecConfigurationBaseDir(), "nsconfig-reboot-none-policy.yaml") +} + // IsIPv6 returns true if a group of ips are ipv6. func isIpv6(ip []string) bool { ipv6 := false @@ -648,7 +657,7 @@ func createIPsecCertsMachineConfig(oc *exutil.CLI) (*mcfgv1.MachineConfig, error if err == nil { return nsCertMachineConfig, nil } - err = oc.AsAdmin().Run("create").Args("-f", nsMachineConfigFixture).Execute() + err = oc.AsAdmin().Run("create").Args("-f", nsMachineConfigFixture()).Execute() if err != nil { return nil, fmt.Errorf("error deploying IPsec certs Machine Config: %v", err) }