From a1abac6be9aa8d9d51fd54509afcb8133c5972a6 Mon Sep 17 00:00:00 2001 From: Rohan Kumar Date: Thu, 9 Oct 2025 14:48:49 +0530 Subject: [PATCH 1/2] feat (postStart) : Allow debugging poststart failures with sleep by trapping errors Add an optional debug mechanism for postStart lifecycle hooks. When enabled via the `controller.devfile.io/debug-start: "true"` annotation, any failure in a postStart command results in the container sleeping for some seconds as per configured progressTimeout, allowing developers time to inspect the container state. - Added `enableDebugStart` parameter to poststart methods. - Injects `trap ... sleep` into postStart scripts when debug mode is enabled. - Includes support for both timeout-wrapped (`postStartTimeout`) and non-timeout lifecycle scripts. This feature improves debuggability of DevWorkspaces where postStart hooks fail and would otherwise cause container crash/restarts. Signed-off-by: Rohan Kumar --- .../workspace/devworkspace_controller.go | 5 + docs/additional-configuration.adoc | 8 + pkg/library/container/container.go | 4 +- pkg/library/container/container_test.go | 2 +- pkg/library/lifecycle/poststart.go | 59 +++++- pkg/library/lifecycle/poststart_test.go | 172 +++++++++++++++--- .../adds_all_postStart_commands.yaml | 1 + .../testdata/postStart/basic_postStart.yaml | 1 + ...led_add_trap_sleep_postStart_commands.yaml | 61 +++++++ ...bled_trap_already_exists_in_postStart.yaml | 38 ++++ .../postStart/error_command_has_env.yaml | 1 + .../error_postStart_cmd_is_not_exec.yaml | 1 + ...rror_postStart_command_does_not_exist.yaml | 1 + ...rt_command_uses_nonexistent_container.yaml | 1 + .../multiple_poststart_commands.yaml | 1 + .../testdata/postStart/no_events.yaml | 1 + .../testdata/postStart/no_postStart.yaml | 1 + .../postStart/workingDir_postStart.yaml | 1 + 18 files changed, 325 insertions(+), 34 deletions(-) create mode 100644 pkg/library/lifecycle/testdata/postStart/debug_enabled_add_trap_sleep_postStart_commands.yaml create mode 100644 pkg/library/lifecycle/testdata/postStart/debug_enabled_trap_already_exists_in_postStart.yaml diff --git a/controllers/workspace/devworkspace_controller.go b/controllers/workspace/devworkspace_controller.go index 8f10217c4..69448fcb9 100644 --- a/controllers/workspace/devworkspace_controller.go +++ b/controllers/workspace/devworkspace_controller.go @@ -323,12 +323,17 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request } } + postStartDebugTrapSleepDuration := "" + if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" { + postStartDebugTrapSleepDuration = workspace.Config.Workspace.ProgressTimeout + } devfilePodAdditions, err := containerlib.GetKubeContainersFromDevfile( &workspace.Spec.Template, workspace.Config.Workspace.ContainerSecurityContext, workspace.Config.Workspace.ImagePullPolicy, workspace.Config.Workspace.DefaultContainerResources, workspace.Config.Workspace.PostStartTimeout, + postStartDebugTrapSleepDuration, ) if err != nil { return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil diff --git a/docs/additional-configuration.adoc b/docs/additional-configuration.adoc index ab041cec9..8418fd4cb 100644 --- a/docs/additional-configuration.adoc +++ b/docs/additional-configuration.adoc @@ -348,6 +348,14 @@ The DevWorkspace Operator sets the `volumeMounts` by default for config files, m ## Debugging a failing workspace Normally, when a workspace fails to start, the deployment will be scaled down and the workspace will be stopped in a `Failed` state. This can make it difficult to debug misconfiguration errors, so the annotation `controller.devfile.io/debug-start: "true"` can be applied to DevWorkspaces to leave resources for failed workspaces on the cluster. This allows viewing logs from workspace containers. +It also enables a specialized debug mode for `postStart` lifecycle hooks, which are often used for initial setup tasks. + +When a postStart command fails: +- The container will not immediately crash or restart. It would stay in `ContainerCreating` phase. +- The command failure is trapped, and the container is instead forced to sleep for some seconds as per configured DevWorkspace progressTimeout (by default, 5 minutes). + +This trap sleep pause is a critical window that allows developers to connect to the container (e.g., using `kubectl exec`), inspect the file system, and review logs `/tmp/poststart-stderr.txt` / `/tmp/poststart-stdout.txt` to diagnose the exact cause of the postStart failure before the workspace ultimately scales down. This applies to both standard and timeout-wrapped postStart scripts. + ## Setting RuntimeClass for workspace pods To run a DevWorkspace with a specific RuntimeClass, the attribute `controller.devfile.io/runtime-class` can be set on the DevWorkspace with the name of the RuntimeClass to be used. If the specified RuntimeClass does not exist, the workspace will fail to start. For example, to run a DevWorkspace using the https://github.com/kata-containers/kata-containers[kata containers] runtime in clusters where this is enabled, the DevWorkspace can be specified: [source,yaml] diff --git a/pkg/library/container/container.go b/pkg/library/container/container.go index 03132b90c..52dc04290 100644 --- a/pkg/library/container/container.go +++ b/pkg/library/container/container.go @@ -45,7 +45,7 @@ import ( // rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes // // Note: Requires DevWorkspace to be flattened (i.e. the DevWorkspace contains no Parent or Components of type Plugin) -func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string) (*v1alpha1.PodAdditions, error) { +func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string, postStartDebugTrapSleepDuration string) (*v1alpha1.PodAdditions, error) { if !flatten.DevWorkspaceIsFlattened(workspace, nil) { return nil, fmt.Errorf("devfile is not flattened") } @@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi podAdditions.Containers = append(podAdditions.Containers, *k8sContainer) } - if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil { + if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout, postStartDebugTrapSleepDuration); err != nil { return nil, err } diff --git a/pkg/library/container/container_test.go b/pkg/library/container/container_test.go index 36bf47738..6b98184f2 100644 --- a/pkg/library/container/container_test.go +++ b/pkg/library/container/container_test.go @@ -87,7 +87,7 @@ func TestGetKubeContainersFromDevfile(t *testing.T) { t.Run(tt.Name, func(t *testing.T) { // sanity check that file is read correctly. assert.True(t, len(tt.Input.Components) > 0, "Input defines no components") - gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "") + gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "", "") if tt.Output.ErrRegexp != nil && assert.Error(t, err) { assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match") } else { diff --git a/pkg/library/lifecycle/poststart.go b/pkg/library/lifecycle/poststart.go index f2a78f232..73eb3471e 100644 --- a/pkg/library/lifecycle/poststart.go +++ b/pkg/library/lifecycle/poststart.go @@ -15,9 +15,12 @@ package lifecycle import ( "fmt" + "regexp" "strings" "time" + "github.com/go-logr/logr" + dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/log" @@ -41,7 +44,9 @@ const ( ` ) -func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error { +var trapErrRegex = regexp.MustCompile(`\btrap\b.*\bERR\b`) + +func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string, postStartDebugTrapSleepDuration string) error { if wksp.Events == nil || len(wksp.Events.PostStart) == 0 { return nil } @@ -69,7 +74,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err) } - postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout) + postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout, postStartDebugTrapSleepDuration) if err != nil { return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err) } @@ -85,10 +90,10 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] // processCommandsForPostStart processes a list of DevWorkspace commands // and generates a corev1.LifecycleHandler for the PostStart lifecycle hook. -func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) { +func processCommandsForPostStart(commands []dw.Command, postStartTimeout string, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) { if postStartTimeout == "" { // use the fallback if no timeout propagated - return processCommandsWithoutTimeoutFallback(commands) + return processCommandsWithoutTimeoutFallback(commands, postStartDebugTrapSleepDuration) } originalUserScript, err := buildUserScript(commands) @@ -101,7 +106,7 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) scriptToExecute := "set -e\n" + originalUserScript escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`) - fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout) + fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout, postStartDebugTrapSleepDuration) finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout) @@ -128,8 +133,10 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) // - | // cd // -func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) { +func processCommandsWithoutTimeoutFallback(commands []dw.Command, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) { var dwCommands []string + postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(log.Log, postStartDebugTrapSleepDuration) + hasErrTrapInUserScript := false for _, command := range commands { execCmd := command.Exec if len(execCmd.Env) > 0 { @@ -139,6 +146,21 @@ func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.Lifec dwCommands = append(dwCommands, fmt.Sprintf("cd %s", execCmd.WorkingDir)) } dwCommands = append(dwCommands, execCmd.CommandLine) + if trapErrRegex.MatchString(execCmd.CommandLine) { + hasErrTrapInUserScript = true + } + } + + if postStartFailureDebugSleepSeconds > 0 && !hasErrTrapInUserScript { + debugTrap := fmt.Sprintf(` +trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep %d' ERR +`, postStartFailureDebugSleepSeconds) + debugTrapLine := strings.ReplaceAll(strings.TrimSpace(debugTrap), "\n", " ") + + dwCommands = append([]string{ + "set -e", + debugTrapLine, + }, dwCommands...) } joinedCommands := strings.Join(dwCommands, "\n") @@ -187,7 +209,7 @@ func buildUserScript(commands []dw.Command) (string, error) { // environment variable exports, and specific exit code handling. // The killAfterDurationSeconds is hardcoded to 5s within this generated script. // It conditionally prefixes the user script with the timeout command if available. -func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string { +func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string, postStartDebugTrapSleepDuration string) string { // Convert `postStartTimeout` into the `timeout` format var timeoutSeconds int64 if postStartTimeout != "" && postStartTimeout != "0" { @@ -199,10 +221,12 @@ func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string timeoutSeconds = int64(duration.Seconds()) } } + postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(log.Log, postStartDebugTrapSleepDuration) return fmt.Sprintf(` export POSTSTART_TIMEOUT_DURATION="%d" export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="%t" _TIMEOUT_COMMAND_PART="" _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean @@ -219,6 +243,11 @@ fi ${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s' exit_code=$? +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep %d +fi + # Check the exit code based on whether timeout was attempted if [ "$_WAS_TIMEOUT_USED" = "true" ]; then if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) @@ -239,5 +268,19 @@ else fi exit $exit_code -`, timeoutSeconds, escapedUserScript) +`, timeoutSeconds, postStartFailureDebugSleepSeconds > 0, escapedUserScript, postStartFailureDebugSleepSeconds) +} + +func parsePostStartFailureDebugSleepDurationToSeconds(logger logr.Logger, durationStr string) int { + if durationStr == "" { + return 0 + } + + d, err := time.ParseDuration(durationStr) + if err != nil { + logger.Error(err, "Failed to parse postStart failure debug sleep duration for ", "durationStr", durationStr) + return 0 + } + + return int(d.Seconds()) } diff --git a/pkg/library/lifecycle/poststart_test.go b/pkg/library/lifecycle/poststart_test.go index b487858ea..f5d8fe894 100644 --- a/pkg/library/lifecycle/poststart_test.go +++ b/pkg/library/lifecycle/poststart_test.go @@ -19,6 +19,8 @@ import ( "path/filepath" "testing" + "github.com/go-logr/logr/testr" + dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" @@ -33,8 +35,9 @@ type postStartTestCase struct { } type postStartTestInput struct { - Devfile *dw.DevWorkspaceTemplateSpec `json:"devfile,omitempty"` - Containers []corev1.Container `json:"containers,omitempty"` + Devfile *dw.DevWorkspaceTemplateSpec `json:"devfile,omitempty"` + PostStartDebugTrapSleepDuration string `json:"postStartDebugTrapSleepDuration,omitempty"` + Containers []corev1.Container `json:"containers,omitempty"` } type postStartTestOutput struct { @@ -76,7 +79,7 @@ func TestAddPostStartLifecycleHooks(t *testing.T) { for _, tt := range tests { t.Run(fmt.Sprintf("%s (%s)", tt.Name, tt.testPath), func(t *testing.T) { var timeout string - err := AddPostStartLifecycleHooks(tt.Input.Devfile, tt.Input.Containers, timeout) + err := AddPostStartLifecycleHooks(tt.Input.Devfile, tt.Input.Containers, timeout, tt.Input.PostStartDebugTrapSleepDuration) if tt.Output.ErrRegexp != nil && assert.Error(t, err) { assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match") } else { @@ -296,18 +299,21 @@ func TestBuildUserScript(t *testing.T) { func TestGenerateScriptWithTimeout(t *testing.T) { tests := []struct { - name string - escapedUserScript string - timeout string - expectedScript string + name string + escapedUserScript string + timeout string + postStartDebugTrapSleepDuration string + expectedScript string }{ { - name: "Basic script with timeout", - escapedUserScript: "echo 'hello world'\nsleep 1", - timeout: "10s", + name: "Basic script with timeout", + escapedUserScript: "echo 'hello world'\nsleep 1", + timeout: "10s", + postStartDebugTrapSleepDuration: "", expectedScript: ` export POSTSTART_TIMEOUT_DURATION="10" export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="false" _TIMEOUT_COMMAND_PART="" _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean @@ -325,6 +331,11 @@ ${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'hello world' sleep 1' exit_code=$? +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep 0 +fi + # Check the exit code based on whether timeout was attempted if [ "$_WAS_TIMEOUT_USED" = "true" ]; then if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) @@ -348,12 +359,14 @@ exit $exit_code `, }, { - name: "Script with zero timeout (no timeout)", - escapedUserScript: "echo 'running indefinitely...'", - timeout: "0s", + name: "Script with zero timeout (no timeout)", + escapedUserScript: "echo 'running indefinitely...'", + timeout: "0s", + postStartDebugTrapSleepDuration: "", expectedScript: ` export POSTSTART_TIMEOUT_DURATION="0" export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="false" _TIMEOUT_COMMAND_PART="" _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean @@ -370,6 +383,11 @@ fi ${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'running indefinitely...'' exit_code=$? +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep 0 +fi + # Check the exit code based on whether timeout was attempted if [ "$_WAS_TIMEOUT_USED" = "true" ]; then if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) @@ -393,12 +411,14 @@ exit $exit_code `, }, { - name: "Empty user script", - escapedUserScript: "", - timeout: "5s", + name: "Empty user script", + escapedUserScript: "", + timeout: "5s", + postStartDebugTrapSleepDuration: "", expectedScript: ` export POSTSTART_TIMEOUT_DURATION="5" export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="false" _TIMEOUT_COMMAND_PART="" _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean @@ -415,6 +435,11 @@ fi ${_TIMEOUT_COMMAND_PART} /bin/sh -c '' exit_code=$? +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep 0 +fi + # Check the exit code based on whether timeout was attempted if [ "$_WAS_TIMEOUT_USED" = "true" ]; then if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) @@ -438,12 +463,14 @@ exit $exit_code `, }, { - name: "User script with already escaped single quotes", - escapedUserScript: "echo 'it'\\''s complex'", - timeout: "30s", + name: "User script with already escaped single quotes", + escapedUserScript: "echo 'it'\\''s complex'", + timeout: "30s", + postStartDebugTrapSleepDuration: "", expectedScript: ` export POSTSTART_TIMEOUT_DURATION="30" export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="false" _TIMEOUT_COMMAND_PART="" _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean @@ -460,6 +487,11 @@ fi ${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'it'\''s complex'' exit_code=$? +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep 0 +fi + # Check the exit code based on whether timeout was attempted if [ "$_WAS_TIMEOUT_USED" = "true" ]; then if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) @@ -483,12 +515,14 @@ exit $exit_code `, }, { - name: "User script with minute timeout", - escapedUserScript: "echo 'wait for it...'", - timeout: "2m", + name: "User script with minute timeout", + escapedUserScript: "echo 'wait for it...'", + timeout: "2m", + postStartDebugTrapSleepDuration: "", expectedScript: ` export POSTSTART_TIMEOUT_DURATION="120" export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="false" _TIMEOUT_COMMAND_PART="" _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean @@ -505,6 +539,64 @@ fi ${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'wait for it...'' exit_code=$? +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep 0 +fi + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." >&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." >&2 + fi +fi + +exit $exit_code +`, + }, + { + name: "Basic script with timeout and debug enabled", + escapedUserScript: "echo 'hello world'\nsleep 1", + timeout: "10s", + postStartDebugTrapSleepDuration: "5m", + expectedScript: ` +export POSTSTART_TIMEOUT_DURATION="10" +export POSTSTART_KILL_AFTER_DURATION="5" +export DEBUG_ENABLED="true" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'hello world' +sleep 1' +exit_code=$? + +if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then + echo "[postStart] failure encountered, sleep for debugging" >&2 + sleep 300 +fi + # Check the exit code based on whether timeout was attempted if [ "$_WAS_TIMEOUT_USED" = "true" ]; then if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) @@ -531,8 +623,42 @@ exit $exit_code for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - script := generateScriptWithTimeout(tt.escapedUserScript, tt.timeout) + script := generateScriptWithTimeout(tt.escapedUserScript, tt.timeout, tt.postStartDebugTrapSleepDuration) assert.Equal(t, tt.expectedScript, script) }) } } + +func TestParsePostStartFailureDebugSleepDurationToSeconds(t *testing.T) { + logger := testr.New(t) + tests := []struct { + name string + input string + expected int + }{ + { + name: "empty string returns 0", + input: "", + expected: 0, + }, + { + name: "valid duration", + input: "5s", + expected: 5, + }, + { + name: "invalid duration returns 0", + input: "abc", + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := parsePostStartFailureDebugSleepDurationToSeconds(logger, tt.input) + if got != tt.expected { + t.Errorf("parsePostStartFailureDebugSleepDurationToSeconds(%q) = %d; want %d", tt.input, got, tt.expected) + } + }) + } +} diff --git a/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml b/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml index e2a9bf371..2fa2b9d1a 100644 --- a/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml +++ b/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml @@ -1,6 +1,7 @@ name: "Adds all postStart events to containers" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-postStart-1 diff --git a/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml b/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml index acbf1ae36..28482d9e8 100644 --- a/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml +++ b/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml @@ -1,6 +1,7 @@ name: "Should add postStart lifecycle hook for basic event" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-postStart diff --git a/pkg/library/lifecycle/testdata/postStart/debug_enabled_add_trap_sleep_postStart_commands.yaml b/pkg/library/lifecycle/testdata/postStart/debug_enabled_add_trap_sleep_postStart_commands.yaml new file mode 100644 index 000000000..e0cbbc031 --- /dev/null +++ b/pkg/library/lifecycle/testdata/postStart/debug_enabled_add_trap_sleep_postStart_commands.yaml @@ -0,0 +1,61 @@ +name: "Debug Enabled Adds postStart events with trap and sleep to containers" + +input: + postStartDebugTrapSleepDuration: "5m" + devfile: + commands: + - id: test-postStart-1 + exec: + component: test-component-1 + commandLine: "echo 'hello world 1'" + - id: test-postStart-2 + exec: + component: test-component-2 + commandLine: "echo 'hello world 2'" + workingDir: "/tmp/test-dir" + events: + postStart: + - test-postStart-1 + - test-postStart-2 + + containers: + - name: test-component-1 + image: test-img + - name: test-component-2 + image: test-img + - name: test-component-3 + image: test-img + +output: + containers: + - name: test-component-1 + image: test-img + lifecycle: + postStart: + exec: + command: + - /bin/sh + - -c + - | + { + set -e + trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep 300' ERR + echo 'hello world 1' + } 1>/tmp/poststart-stdout.txt 2>/tmp/poststart-stderr.txt + - name: test-component-2 + image: test-img + lifecycle: + postStart: + exec: + command: + - /bin/sh + - -c + - | + { + set -e + trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep 300' ERR + cd /tmp/test-dir + echo 'hello world 2' + } 1>/tmp/poststart-stdout.txt 2>/tmp/poststart-stderr.txt + - name: test-component-3 + image: test-img diff --git a/pkg/library/lifecycle/testdata/postStart/debug_enabled_trap_already_exists_in_postStart.yaml b/pkg/library/lifecycle/testdata/postStart/debug_enabled_trap_already_exists_in_postStart.yaml new file mode 100644 index 000000000..aac317a2a --- /dev/null +++ b/pkg/library/lifecycle/testdata/postStart/debug_enabled_trap_already_exists_in_postStart.yaml @@ -0,0 +1,38 @@ +name: "Debug Enabled But trap already exists in user script, nothing added" + +input: + postStartDebugTrapSleepDuration: "5m" + devfile: + commands: + - id: test-postStart-1 + exec: + component: test-component-1 + commandLine: | + trap 'echo "cleanup on error"' ERR + echo "Starting app..." + /app/start.sh + events: + postStart: + - test-postStart-1 + + containers: + - name: test-component-1 + image: test-img + +output: + containers: + - name: test-component-1 + image: test-img + lifecycle: + postStart: + exec: + command: + - /bin/sh + - -c + - | + { + trap 'echo "cleanup on error"' ERR + echo "Starting app..." + /app/start.sh + + } 1>/tmp/poststart-stdout.txt 2>/tmp/poststart-stderr.txt diff --git a/pkg/library/lifecycle/testdata/postStart/error_command_has_env.yaml b/pkg/library/lifecycle/testdata/postStart/error_command_has_env.yaml index 9881bfb92..272fddd65 100644 --- a/pkg/library/lifecycle/testdata/postStart/error_command_has_env.yaml +++ b/pkg/library/lifecycle/testdata/postStart/error_command_has_env.yaml @@ -1,6 +1,7 @@ name: "Returns error when postStart command requires env vars" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-cmd diff --git a/pkg/library/lifecycle/testdata/postStart/error_postStart_cmd_is_not_exec.yaml b/pkg/library/lifecycle/testdata/postStart/error_postStart_cmd_is_not_exec.yaml index af46b167b..32cee2e1e 100644 --- a/pkg/library/lifecycle/testdata/postStart/error_postStart_cmd_is_not_exec.yaml +++ b/pkg/library/lifecycle/testdata/postStart/error_postStart_cmd_is_not_exec.yaml @@ -1,6 +1,7 @@ name: "Returns error when postStart command is not exec-type" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-command diff --git a/pkg/library/lifecycle/testdata/postStart/error_postStart_command_does_not_exist.yaml b/pkg/library/lifecycle/testdata/postStart/error_postStart_command_does_not_exist.yaml index bc7de2a93..dec90982d 100644 --- a/pkg/library/lifecycle/testdata/postStart/error_postStart_command_does_not_exist.yaml +++ b/pkg/library/lifecycle/testdata/postStart/error_postStart_command_does_not_exist.yaml @@ -1,6 +1,7 @@ name: "Returns error when postStart command does not exist" input: + postStartDebugTrapSleepDuration: "" devfile: events: postStart: diff --git a/pkg/library/lifecycle/testdata/postStart/error_postStart_command_uses_nonexistent_container.yaml b/pkg/library/lifecycle/testdata/postStart/error_postStart_command_uses_nonexistent_container.yaml index dd78ef329..f89205f8b 100644 --- a/pkg/library/lifecycle/testdata/postStart/error_postStart_command_uses_nonexistent_container.yaml +++ b/pkg/library/lifecycle/testdata/postStart/error_postStart_command_uses_nonexistent_container.yaml @@ -1,6 +1,7 @@ name: "Returns error when postStart command requires nonexistent container" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-cmd diff --git a/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml b/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml index db14a94bf..48a5cab21 100644 --- a/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml +++ b/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml @@ -1,6 +1,7 @@ name: "Multiple postStart commands use same component" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-cmd-1 diff --git a/pkg/library/lifecycle/testdata/postStart/no_events.yaml b/pkg/library/lifecycle/testdata/postStart/no_events.yaml index d8508f7df..645eabb6a 100644 --- a/pkg/library/lifecycle/testdata/postStart/no_events.yaml +++ b/pkg/library/lifecycle/testdata/postStart/no_events.yaml @@ -1,6 +1,7 @@ name: "Should do nothing when devfile does not specify events" input: + postStartDebugTrapSleepDuration: "" devfile: {} containers: - name: test-container diff --git a/pkg/library/lifecycle/testdata/postStart/no_postStart.yaml b/pkg/library/lifecycle/testdata/postStart/no_postStart.yaml index 4da036281..e42864de7 100644 --- a/pkg/library/lifecycle/testdata/postStart/no_postStart.yaml +++ b/pkg/library/lifecycle/testdata/postStart/no_postStart.yaml @@ -1,6 +1,7 @@ name: "Should do nothing when devfile does not include postStart events" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: prestart-cmd diff --git a/pkg/library/lifecycle/testdata/postStart/workingDir_postStart.yaml b/pkg/library/lifecycle/testdata/postStart/workingDir_postStart.yaml index dfe976b4e..344dd3540 100644 --- a/pkg/library/lifecycle/testdata/postStart/workingDir_postStart.yaml +++ b/pkg/library/lifecycle/testdata/postStart/workingDir_postStart.yaml @@ -1,6 +1,7 @@ name: "Should add postStart lifecycle hook for event with workingDir" input: + postStartDebugTrapSleepDuration: "" devfile: commands: - id: test-postStart From 3a96b2bee130f0ba85cf9b026b7debc8049da7c1 Mon Sep 17 00:00:00 2001 From: Rohan Kumar Date: Thu, 6 Nov 2025 21:27:47 +0530 Subject: [PATCH 2/2] fix (controller) : update DevWorkspace status with different message for debug start When DevWorkspace contains 'controller.devfile.io/debug-start' annotation, set a different message for DevWorkspace Starting phase to give user indication that debug start mode is activated and they need to monitor DevWorkspace pod's logs or exec into it for debugging. Signed-off-by: Rohan Kumar --- .../workspace/devworkspace_controller.go | 5 +- .../devworkspace_debug_poststart_tests.go | 78 +++++++++++++++++++ ...e-devworkspace-debug-start-annotation.yaml | 19 +++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 test/e2e/pkg/tests/devworkspace_debug_poststart_tests.go create mode 100644 test/resources/simple-devworkspace-debug-start-annotation.yaml diff --git a/controllers/workspace/devworkspace_controller.go b/controllers/workspace/devworkspace_controller.go index 69448fcb9..e158e69ad 100644 --- a/controllers/workspace/devworkspace_controller.go +++ b/controllers/workspace/devworkspace_controller.go @@ -324,7 +324,10 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request } postStartDebugTrapSleepDuration := "" + deploymentReadyConditionMessage := "Waiting for workspace deployment" if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" { + deploymentReadyConditionMessage = "Debug mode: failed postStart commands will be trapped; inspect logs/exec to debug" + reconcileStatus.setConditionTrue(conditions.Started, "DevWorkspace is starting in debug mode") postStartDebugTrapSleepDuration = workspace.Config.Workspace.ProgressTimeout } devfilePodAdditions, err := containerlib.GetKubeContainersFromDevfile( @@ -494,7 +497,7 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := wsprovision.SyncDeploymentToCluster(workspace, allPodAdditions, serviceAcctName, clusterAPI); err != nil { if shouldReturn, reconcileResult, reconcileErr := r.checkDWError(workspace, err, "Error creating DevWorkspace deployment", metrics.DetermineProvisioningFailureReason(err.Error()), reqLogger, &reconcileStatus); shouldReturn { reqLogger.Info("Waiting on deployment to be ready") - reconcileStatus.setConditionFalse(conditions.DeploymentReady, "Waiting for workspace deployment") + reconcileStatus.setConditionFalse(conditions.DeploymentReady, deploymentReadyConditionMessage) return reconcileResult, reconcileErr } } diff --git a/test/e2e/pkg/tests/devworkspace_debug_poststart_tests.go b/test/e2e/pkg/tests/devworkspace_debug_poststart_tests.go new file mode 100644 index 000000000..06edf0fed --- /dev/null +++ b/test/e2e/pkg/tests/devworkspace_debug_poststart_tests.go @@ -0,0 +1,78 @@ +// Copyright (c) 2019-2025 Red Hat, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tests + +import ( + "fmt" + "strings" + + dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2" + "github.com/devfile/devworkspace-operator/test/e2e/pkg/config" + "github.com/onsi/ginkgo/v2" +) + +var _ = ginkgo.Describe("[DevWorkspace Debug Start Mode]", func() { + defer ginkgo.GinkgoRecover() + + ginkgo.It("Wait DevWorkspace Webhook Server Pod", func() { + controllerLabel := "app.kubernetes.io/name=devworkspace-webhook-server" + + deploy, err := config.AdminK8sClient.WaitForPodRunningByLabel(config.OperatorNamespace, controllerLabel) + if err != nil { + ginkgo.Fail(fmt.Sprintf("cannot get the DevWorkspace Webhook Server Pod status with label %s: %s", controllerLabel, err.Error())) + return + } + + if !deploy { + ginkgo.Fail("Devworkspace webhook didn't start properly") + } + }) + + ginkgo.It("Add Debug DevWorkspace to cluster and wait starting status", func() { + commandResult, err := config.DevK8sClient.OcApplyWorkspace(config.DevWorkspaceNamespace, "test/resources/simple-devworkspace-debug-start-annotation.yaml") + if err != nil { + ginkgo.Fail(fmt.Sprintf("Failed to create DevWorkspace: %s %s", err.Error(), commandResult)) + return + } + + deploy, err := config.DevK8sClient.WaitDevWsStatus("code-latest-with-debug-start", config.DevWorkspaceNamespace, dw.DevWorkspaceStatusStarting) + if !deploy { + ginkgo.Fail(fmt.Sprintf("DevWorkspace didn't start properly. Error: %s", err)) + } + }) + + ginkgo.It("Check DevWorkspace Conditions for Debug Start message", func() { + devWorkspaceStatus, err := config.DevK8sClient.GetDevWsStatus("code-latest-with-debug-start", config.DevWorkspaceNamespace) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Failure in fetching DevWorkspace status. Error: %s", err)) + } + + expectedSubstring := "Debug mode: failed postStart commands will be trapped; inspect logs/exec to debug" + + found := false + for _, cond := range devWorkspaceStatus.Conditions { + if cond.Message != "" && strings.Contains(cond.Message, expectedSubstring) { + found = true + break + } + } + + if !found { + ginkgo.Fail(fmt.Sprintf( + "DevWorkspace status does not contain expected debug message.\nExpected substring: %q\nGot conditions: %+v", + expectedSubstring, devWorkspaceStatus.Conditions, + )) + } + }) +}) diff --git a/test/resources/simple-devworkspace-debug-start-annotation.yaml b/test/resources/simple-devworkspace-debug-start-annotation.yaml new file mode 100644 index 000000000..d3ff7d13c --- /dev/null +++ b/test/resources/simple-devworkspace-debug-start-annotation.yaml @@ -0,0 +1,19 @@ +kind: DevWorkspace +apiVersion: workspace.devfile.io/v1alpha2 +metadata: + name: code-latest-with-debug-start + annotations: + controller.devfile.io/debug-start: "true" +spec: + started: true + template: + projects: + - name: web-nodejs-sample + git: + remotes: + origin: "https://github.com/che-samples/web-nodejs-sample.git" + components: + - name: dev + container: + image: quay.io/wto/web-terminal-tooling:latest + args: ["tail", "-f", "/dev/null"] \ No newline at end of file