diff --git a/Makefile b/Makefile index 4ccf7f91..c89b3c5a 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ LIBEXECDIR ?= ${PREFIX}/libexec PKG_CONFIG ?= pkg-config HEADERS := $(wildcard src/*.h) -OBJS := src/conmon.o src/cmsg.o src/ctr_logging.o src/utils.o src/cli.o src/globals.o src/cgroup.o src/conn_sock.o src/oom.o src/ctrl.o src/ctr_stdio.o src/parent_pipe_fd.o src/ctr_exit.o src/runtime_args.o src/close_fds.o src/seccomp_notify.o +OBJS := src/conmon.o src/cmsg.o src/ctr_logging.o src/utils.o src/cli.o src/globals.o src/cgroup.o src/conn_sock.o src/oom.o src/ctrl.o src/ctr_stdio.o src/parent_pipe_fd.o src/ctr_exit.o src/runtime_args.o src/close_fds.o src/seccomp_notify.o src/healthcheck.o MAKEFILE_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) diff --git a/src/cli.c b/src/cli.c index de1ddabe..ab0b6af4 100644 --- a/src/cli.c +++ b/src/cli.c @@ -61,6 +61,12 @@ char *opt_seccomp_notify_plugins = NULL; gboolean opt_log_rotate = FALSE; int opt_log_max_files = 1; gchar **opt_log_allowlist_dirs = NULL; +char *opt_healthcheck_cmd = NULL; +gchar **opt_healthcheck_args = NULL; +int opt_healthcheck_interval = -1; +int opt_healthcheck_timeout = -1; +int opt_healthcheck_retries = -1; +int opt_healthcheck_start_period = -1; GOptionEntry opt_entries[] = { {"api-version", 0, 0, G_OPTION_ARG_NONE, &opt_api_version, "Conmon API version to use", NULL}, {"bundle", 'b', 0, G_OPTION_ARG_STRING, &opt_bundle_path, "Location of the OCI Bundle path", NULL}, @@ -125,6 +131,15 @@ GOptionEntry opt_entries[] = { NULL}, {"log-max-files", 0, 0, G_OPTION_ARG_INT, &opt_log_max_files, "Number of backup log files to keep (default: 1)", NULL}, {"log-allowlist-dir", 0, 0, G_OPTION_ARG_STRING_ARRAY, &opt_log_allowlist_dirs, "Allowed log directory", NULL}, + {"healthcheck-cmd", 0, 0, G_OPTION_ARG_STRING, &opt_healthcheck_cmd, "Healthcheck command to execute", NULL}, + {"healthcheck-arg", 0, 0, G_OPTION_ARG_STRING_ARRAY, &opt_healthcheck_args, + "Healthcheck command arguments (can be used multiple times)", NULL}, + {"healthcheck-interval", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_interval, "Healthcheck interval in seconds (default: 30)", NULL}, + {"healthcheck-timeout", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_timeout, "Healthcheck timeout in seconds (default: 30)", NULL}, + {"healthcheck-retries", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_retries, + "Number of consecutive failures before marking unhealthy (default: 3)", NULL}, + {"healthcheck-start-period", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_start_period, + "Start period in seconds before healthchecks start counting failures (default: 0)", NULL}, {NULL, 0, 0, 0, NULL, NULL, NULL}}; @@ -228,4 +243,11 @@ void process_cli() if (opt_no_container_partial_message && !logging_is_journald_enabled()) { nwarnf("--no-container-partial-message has no effect without journald log driver"); } + + /* Validate healthcheck parameters - if any healthcheck options were provided without --healthcheck-cmd */ + if (opt_healthcheck_cmd == NULL + && (opt_healthcheck_interval != -1 || opt_healthcheck_timeout != -1 || opt_healthcheck_retries != -1 + || opt_healthcheck_start_period != -1 || opt_healthcheck_args != NULL)) { + nexit("Healthcheck parameters specified without --healthcheck-cmd. Please provide --healthcheck-cmd to enable healthcheck functionality."); + } } diff --git a/src/cli.h b/src/cli.h index 5961a203..c3b346d6 100644 --- a/src/cli.h +++ b/src/cli.h @@ -50,6 +50,12 @@ extern char *opt_seccomp_notify_plugins; extern gboolean opt_log_rotate; extern int opt_log_max_files; extern gchar **opt_log_allowlist_dirs; +extern char *opt_healthcheck_cmd; +extern gchar **opt_healthcheck_args; +extern int opt_healthcheck_interval; +extern int opt_healthcheck_timeout; +extern int opt_healthcheck_retries; +extern int opt_healthcheck_start_period; extern GOptionEntry opt_entries[]; extern gboolean opt_full_attach_path; diff --git a/src/conmon.c b/src/conmon.c index 74805eae..126c2851 100644 --- a/src/conmon.c +++ b/src/conmon.c @@ -20,6 +20,7 @@ #include "close_fds.h" #include "seccomp_notify.h" #include "runtime_args.h" +#include "healthcheck.h" #include #include @@ -46,7 +47,6 @@ int main(int argc, char *argv[]) _cleanup_close_ int dev_null_r_cleanup = -1; _cleanup_close_ int dev_null_w_cleanup = -1; _cleanup_close_ int dummyfd = -1; - int initialize_ec = initialize_cli(argc, argv); if (initialize_ec >= 0) { exit(initialize_ec); @@ -396,7 +396,6 @@ int main(int argc, char *argv[]) } container_pid = atoi(contents); - ndebugf("container PID: %d", container_pid); g_hash_table_insert(pid_to_handler, (pid_t *)&container_pid, container_exit_cb); @@ -408,6 +407,87 @@ int main(int argc, char *argv[]) if ((opt_api_version >= 1 || !opt_exec) && sync_pipe_fd >= 0) write_or_close_sync_fd(&sync_pipe_fd, container_pid, NULL); + /* Start healthcheck timers if healthcheck command is provided */ + if (opt_healthcheck_cmd != NULL) { + + healthcheck_config_t config; + memset(&config, 0, sizeof(config)); + + /* Parse healthcheck command and arguments into array */ + /* Count total arguments: command + args + NULL terminator */ + int argc = 1; // At least the command + if (opt_healthcheck_args != NULL) { + for (int i = 0; opt_healthcheck_args[i] != NULL; i++) { + argc++; + } + } + + /* Allocate array for command and arguments */ + config.test = calloc(argc + 1, sizeof(char *)); + if (config.test == NULL) { + pexit("Failed to allocate memory for healthcheck command"); + } + + /* Copy command */ + config.test[0] = strdup(opt_healthcheck_cmd); + if (config.test[0] == NULL) { + pexit("Failed to duplicate healthcheck command"); + } + + /* Copy arguments */ + if (opt_healthcheck_args != NULL) { + for (int i = 0; opt_healthcheck_args[i] != NULL; i++) { + config.test[i + 1] = strdup(opt_healthcheck_args[i]); + if (config.test[i + 1] == NULL) { + /* Clean up on error */ + for (int j = 0; j <= i; j++) { + free(config.test[j]); + } + free(config.test); + pexit("Failed to duplicate healthcheck argument"); + } + } + } + config.test[argc] = NULL; /* NULL terminator */ + + /* Set healthcheck parameters from CLI, using defaults for -1 values */ + config.enabled = true; + config.interval = opt_healthcheck_interval != -1 ? opt_healthcheck_interval : 30; + config.timeout = opt_healthcheck_timeout != -1 ? opt_healthcheck_timeout : 30; + config.retries = opt_healthcheck_retries != -1 ? opt_healthcheck_retries : 3; + /* First healthcheck runs immediately, then after 'interval' seconds. + * Here we give a default of 10 seconds to allow container to fully initialize. + * If the user knows the container will take less time to initialize, they can set the start_period to a lower value. + */ + config.start_period = opt_healthcheck_start_period != -1 ? opt_healthcheck_start_period : 10; + + /* Validate healthcheck configuration */ + if (!healthcheck_validate_config(&config)) { + nwarnf("Invalid healthcheck configuration for container %s", opt_cid); + healthcheck_config_free(&config); + return 1; + } + + healthcheck_timer_t *timer = healthcheck_timer_new(opt_cid, &config); + if (timer != NULL) { + /* Start healthcheck with a 3-second delay to allow container to fully initialize in + addition to the default of 10 seconds. + */ + if (g_timeout_add_seconds(3, healthcheck_delayed_start_callback, timer)) { + active_healthcheck_timer = timer; + ninfof("Scheduled healthcheck for container %s (will start after 3s delay)", opt_cid); + } else { + nwarnf("Failed to schedule delayed healthcheck for container %s", opt_cid); + healthcheck_timer_free(timer); + } + } else { + nwarnf("Failed to create healthcheck timer for container %s", opt_cid); + } + + /* Always free the config, regardless of success or failure */ + healthcheck_config_free(&config); + } + #ifdef __linux__ setup_oom_handling(container_pid); #endif @@ -495,6 +575,9 @@ int main(int argc, char *argv[]) g_source_remove(signal_fd_tag); close(signal_fd); + /* Cleanup healthcheck timers */ + healthcheck_cleanup(); + /* * Podman injects some fd's into the conmon process so that exposed ports are kept busy while * the container runs. Close them before we notify the container exited, so that they can be diff --git a/src/healthcheck.c b/src/healthcheck.c new file mode 100644 index 00000000..337c8afd --- /dev/null +++ b/src/healthcheck.c @@ -0,0 +1,533 @@ +#define _GNU_SOURCE + +#include "healthcheck.h" +#include "utils.h" +#include "ctr_logging.h" +#include "parent_pipe_fd.h" +#include "globals.h" +#include "cli.h" +#include "ctr_exit.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Healthcheck validation constants */ +#define HEALTHCHECK_INTERVAL_MIN 1 +#define HEALTHCHECK_INTERVAL_MAX 3600 +#define HEALTHCHECK_TIMEOUT_MIN 1 +#define HEALTHCHECK_TIMEOUT_MAX 300 +#define HEALTHCHECK_START_PERIOD_MIN 0 +#define HEALTHCHECK_START_PERIOD_MAX 3600 +#define HEALTHCHECK_RETRIES_MIN 0 +#define HEALTHCHECK_RETRIES_MAX 100 + +/* Validate healthcheck configuration parameters */ +bool healthcheck_validate_config(const healthcheck_config_t *config) +{ + if (config == NULL) { + return false; + } + + /* Validate interval */ + if (config->interval < HEALTHCHECK_INTERVAL_MIN || config->interval > HEALTHCHECK_INTERVAL_MAX) { + nwarnf("Healthcheck interval %d is out of range [%d, %d]", config->interval, HEALTHCHECK_INTERVAL_MIN, + HEALTHCHECK_INTERVAL_MAX); + return false; + } + + /* Validate timeout */ + if (config->timeout < HEALTHCHECK_TIMEOUT_MIN || config->timeout > HEALTHCHECK_TIMEOUT_MAX) { + nwarnf("Healthcheck timeout %d is out of range [%d, %d]", config->timeout, HEALTHCHECK_TIMEOUT_MIN, + HEALTHCHECK_TIMEOUT_MAX); + return false; + } + + /* Validate start period */ + if (config->start_period < HEALTHCHECK_START_PERIOD_MIN || config->start_period > HEALTHCHECK_START_PERIOD_MAX) { + nwarnf("Healthcheck start period %d is out of range [%d, %d]", config->start_period, HEALTHCHECK_START_PERIOD_MIN, + HEALTHCHECK_START_PERIOD_MAX); + return false; + } + + /* Validate retries */ + if (config->retries > HEALTHCHECK_RETRIES_MAX) { + nwarnf("Healthcheck retries %u is out of range [%d, %d]", config->retries, HEALTHCHECK_RETRIES_MIN, + HEALTHCHECK_RETRIES_MAX); + return false; + } + + /* Validate that timeout is not greater than interval */ + if (config->timeout > config->interval) { + nwarnf("Healthcheck timeout %d cannot be greater than interval %d", config->timeout, config->interval); + return false; + } + + return true; +} + +/* Static string constants for healthcheck statuses */ +const char *healthcheck_status_strings[] = {"none", "starting", "healthy", "unhealthy"}; + +/* Global healthcheck timer (one per conmon instance) */ +healthcheck_timer_t *active_healthcheck_timer = NULL; + + +/* Cleanup healthcheck subsystem */ +void healthcheck_cleanup(void) +{ + if (active_healthcheck_timer != NULL) { + healthcheck_timer_stop(active_healthcheck_timer); + healthcheck_timer_free(active_healthcheck_timer); + active_healthcheck_timer = NULL; + } +} + +/* Free healthcheck configuration */ +void healthcheck_config_free(healthcheck_config_t *config) +{ + if (config == NULL) { + return; + } + + if (config->test != NULL) { + for (int i = 0; config->test[i] != NULL; i++) { + free(config->test[i]); + } + free(config->test); + config->test = NULL; + } + // Don't free config itself - it's a local variable on the stack +} + +/* Create a new healthcheck timer */ +healthcheck_timer_t *healthcheck_timer_new(const char *container_id, const healthcheck_config_t *config) +{ + if (container_id == NULL || config == NULL) { + return NULL; + } + + healthcheck_timer_t *timer = calloc(1, sizeof(healthcheck_timer_t)); + if (timer == NULL) { + nwarn("Failed to allocate memory for healthcheck timer"); + return NULL; + } + + timer->container_id = strdup(container_id); + if (timer->container_id == NULL) { + free(timer); + return NULL; + } + + timer->config = *config; + timer->status = HEALTHCHECK_NONE; + timer->consecutive_failures = 0; + timer->start_period_remaining = config->start_period; + timer->timer_active = false; + timer->last_check_time = 0; + + /* Copy the test command array */ + if (config->test != NULL) { + int argc = 0; + while (config->test[argc] != NULL) { + argc++; + } + + timer->config.test = calloc(argc + 1, sizeof(char *)); + if (timer->config.test == NULL) { + free(timer->container_id); + free(timer); + return NULL; + } + + for (int i = 0; i < argc; i++) { + timer->config.test[i] = strdup(config->test[i]); + if (timer->config.test[i] == NULL) { + /* Clean up on error */ + for (int j = 0; j < i; j++) { + free(timer->config.test[j]); + } + free(timer->config.test); + free(timer->container_id); + free(timer); + return NULL; + } + } + timer->config.test[argc] = NULL; + } + + return timer; +} + +/* Free healthcheck timer */ +void healthcheck_timer_free(healthcheck_timer_t *timer) +{ + if (timer == NULL) { + return; + } + + /* Stop the timer if it's still active */ + if (timer->timer_active) { + healthcheck_timer_stop(timer); + } + + /* Free container ID */ + if (timer->container_id != NULL) { + free(timer->container_id); + } + + /* Free test command array */ + if (timer->config.test != NULL) { + for (int i = 0; timer->config.test[i] != NULL; i++) { + free(timer->config.test[i]); + } + free(timer->config.test); + } + + /* Clear the timer structure to prevent double-free */ + memset(timer, 0, sizeof(healthcheck_timer_t)); + free(timer); +} + +/* Start healthcheck timer */ +bool healthcheck_timer_start(healthcheck_timer_t *timer) +{ + if (timer == NULL || timer->timer_active) { + return false; + } + + if (!timer->config.enabled || timer->config.test == NULL) { + return false; + } + + /* Initialize timer state */ + timer->timer_active = true; + timer->status = HEALTHCHECK_STARTING; + timer->last_check_time = time(NULL); + timer->start_time = time(NULL); /* Record start time for elapsed time calculation */ + + /* Run the first healthcheck immediately */ + healthcheck_timer_callback(timer); + + /* Set up the interval timer for subsequent healthchecks */ + timer->timer_id = g_timeout_add_seconds(timer->config.interval, healthcheck_timer_callback, timer); + if (timer->timer_id == 0) { + nwarn("Failed to create healthcheck timer"); + timer->timer_active = false; + return false; + } + + return true; +} + +/* Stop healthcheck timer */ +void healthcheck_timer_stop(healthcheck_timer_t *timer) +{ + if (timer == NULL || !timer->timer_active) { + return; + } + + timer->timer_active = false; + timer->status = HEALTHCHECK_NONE; + + /* Remove the GLib timeout source */ + if (timer->timer_id != 0) { + g_source_remove(timer->timer_id); + timer->timer_id = 0; + } +} + +/* Simple callback to start healthcheck after delay */ +gboolean healthcheck_delayed_start_callback(gpointer user_data) +{ + healthcheck_timer_t *timer = (healthcheck_timer_t *)user_data; + if (timer == NULL) { + return G_SOURCE_REMOVE; + } + + ninfof("Starting healthcheck for container %s after delay", timer->container_id); + healthcheck_timer_start(timer); + return G_SOURCE_REMOVE; +} + +/* Execute healthcheck command inside container using runtime */ +bool healthcheck_execute_command(const healthcheck_config_t *config, const char *container_id, const char *runtime_path, int *exit_code) +{ + if (config == NULL || config->test == NULL || container_id == NULL || runtime_path == NULL || exit_code == NULL) { + return false; + } + + /* Initialize exit code to failure */ + *exit_code = -1; + + /* Create stderr pipe to capture error output */ + int stderr_pipe[2]; + if (pipe(stderr_pipe) == -1) { + nwarnf("Failed to create pipe for healthcheck stderr: %s", strerror(errno)); + return false; + } + + /* Fork a child process to execute the healthcheck command inside container */ + pid_t pid = fork(); + if (pid == -1) { + nwarnf("Failed to fork process for healthcheck command: %s", strerror(errno)); + close(stderr_pipe[0]); + close(stderr_pipe[1]); + return false; + } + + if (pid == 0) { + /* Child process - execute the healthcheck command inside container */ + close(stderr_pipe[0]); /* Close read end of stderr pipe */ + + /* Redirect stdout to /dev/null and stderr to pipe */ + int devnull = open("/dev/null", O_WRONLY); + if (devnull != -1) { + dup2(devnull, STDOUT_FILENO); + close(devnull); + } + dup2(stderr_pipe[1], STDERR_FILENO); /* Redirect stderr to pipe */ + close(stderr_pipe[1]); + + /* Build runtime command for direct execution */ + /* Format: runtime exec container_id command args... */ + char **runtime_argv; + + /* Count arguments needed */ + int argc = 0; + while (config->test[argc] != NULL) { + argc++; + } + + /* Allocate runtime command array: runtime + exec + container_id + command + args + NULL */ + runtime_argv = calloc(3 + argc + 1, sizeof(char *)); + if (runtime_argv == NULL) { + nwarn("Failed to allocate memory for runtime command"); + _exit(127); + } + + runtime_argv[0] = (char *)runtime_path; /* Runtime executable */ + runtime_argv[1] = "exec"; /* Runtime subcommand */ + runtime_argv[2] = (char *)container_id; /* Container ID */ + + /* Copy healthcheck command and arguments */ + for (int i = 0; i < argc; i++) { + runtime_argv[3 + i] = config->test[i]; + } + runtime_argv[3 + argc] = NULL; /* NULL terminator */ + + /* Execute the runtime command */ + if (execvp(runtime_path, runtime_argv) == -1) { + /* If execvp fails, exit with error code */ + _exit(127); /* Command not found */ + } + } else { + /* Parent process - wait for child to complete with timeout */ + close(stderr_pipe[1]); /* Close write end of stderr pipe */ + int status; + pid_t wait_result; + int timeout_seconds = config->timeout; + time_t start_time = time(NULL); + bool timed_out = false; + + /* Wait for child with timeout */ + while (true) { + wait_result = waitpid(pid, &status, WNOHANG); + + if (wait_result == -1) { + nwarnf("Failed to wait for healthcheck command: %s", strerror(errno)); + close(stderr_pipe[0]); + return false; + } + + if (wait_result == pid) { + /* Child process has terminated */ + break; + } + + /* Check if timeout has been reached */ + time_t current_time = time(NULL); + if (current_time - start_time >= timeout_seconds) { + nwarnf("Healthcheck command timed out after %d seconds: %s", timeout_seconds, config->test[0]); + /* Kill the child process */ + kill(pid, SIGKILL); + /* Wait for it to actually terminate */ + waitpid(pid, &status, 0); + timed_out = true; + break; + } + + /* Sleep for a short interval before checking again */ + usleep(100000); /* 100ms */ + } + + /* Read stderr output */ + char stderr_buffer[4096]; + ssize_t stderr_len = read(stderr_pipe[0], stderr_buffer, sizeof(stderr_buffer) - 1); + close(stderr_pipe[0]); + + if (stderr_len > 0) { + stderr_buffer[stderr_len] = '\0'; + /* Trim trailing newlines */ + while (stderr_len > 0 && (stderr_buffer[stderr_len - 1] == '\n' || stderr_buffer[stderr_len - 1] == '\r')) { + stderr_buffer[--stderr_len] = '\0'; + } + } else { + stderr_buffer[0] = '\0'; + } + + if (timed_out) { + /* Command timed out and was killed */ + *exit_code = 124; /* Standard exit code for timeout */ + return true; + } else if (WIFEXITED(status)) { + *exit_code = WEXITSTATUS(status); + if (*exit_code != 0) { + nwarnf("Healthcheck command failed (exit code %d): %s", *exit_code, config->test[0]); + if (stderr_len > 0) { + nwarnf("Healthcheck command stderr: %s", stderr_buffer); + } + } + return true; + } else if (WIFSIGNALED(status)) { + nwarnf("Healthcheck command terminated by signal %d: %s", WTERMSIG(status), config->test[0]); + if (stderr_len > 0) { + nwarnf("Healthcheck command stderr: %s", stderr_buffer); + } + *exit_code = 128 + WTERMSIG(status); /* Standard convention for signal termination */ + return true; + } else { + nwarnf("Healthcheck command did not terminate normally: %s", config->test[0]); + if (stderr_len > 0) { + nwarnf("Healthcheck command stderr: %s", stderr_buffer); + } + *exit_code = -1; + return false; + } + } + + /* This should never be reached */ + return false; +} + +/* Convert healthcheck status to string */ +const char *healthcheck_status_to_string(int status) +{ + if (status >= 0 && status < 4) { + return healthcheck_status_strings[status]; + } + return "unknown"; +} + +/* Send healthcheck status update to Podman */ +bool healthcheck_send_status_update(const char *container_id, int status, int exit_code) +{ + if (container_id == NULL) { + return false; + } + + /* Verify sync pipe is available before sending healthcheck updates */ + if (sync_pipe_fd == -1) { + nwarnf("Sync pipe not available, skipping healthcheck status update for container %s", container_id); + return false; + } + + const char *status_str = healthcheck_status_to_string(status); + int healthcheck_msg_type = HEALTHCHECK_MSG_STATUS_UPDATE + status; + + /* Include exit code in message for debugging when status indicates failure */ + char message[256]; + if (exit_code != 0 && status == HEALTHCHECK_UNHEALTHY) { + snprintf(message, sizeof(message), "%s (exit_code: %d)", status_str, exit_code); + write_or_close_sync_fd(&sync_pipe_fd, healthcheck_msg_type, message); + } else { + write_or_close_sync_fd(&sync_pipe_fd, healthcheck_msg_type, status_str); + } + + /* Verify the sync pipe is still open after write attempt */ + if (sync_pipe_fd == -1) { + nwarnf("Sync pipe was closed while sending healthcheck status for container %s", container_id); + return false; + } + + return true; +} + + +/* GLib timer callback function */ +gboolean healthcheck_timer_callback(gpointer user_data) +{ + healthcheck_timer_t *timer = (healthcheck_timer_t *)user_data; + if (timer == NULL || !timer->timer_active) { + return G_SOURCE_REMOVE; /* Stop the timer */ + } + + /* Calculate elapsed time for start period logic */ + time_t current_time = time(NULL); + time_t elapsed = current_time - timer->start_time; + bool in_start_period = (elapsed < timer->config.start_period); + + /* Execute healthcheck command - always run healthchecks */ + int exit_code; + bool success = healthcheck_execute_command(&timer->config, timer->container_id, opt_runtime_path, &exit_code); + + if (!success) { + nwarnf("Failed to execute healthcheck command for container %s", timer->container_id); + /* Only count failures after start period */ + if (!in_start_period) { + if (timer->consecutive_failures < UINT_MAX) { + timer->consecutive_failures++; + } + if (timer->consecutive_failures >= timer->config.retries) { + timer->status = HEALTHCHECK_UNHEALTHY; + } + } else { + timer->status = HEALTHCHECK_STARTING; + } + healthcheck_send_status_update(timer->container_id, timer->status, exit_code); + return G_SOURCE_CONTINUE; /* Continue the timer */ + } + + /* Check if healthcheck passed */ + if (exit_code == 0) { + /* Healthcheck passed */ + timer->consecutive_failures = 0; + timer->status = HEALTHCHECK_HEALTHY; + healthcheck_send_status_update(timer->container_id, timer->status, exit_code); + } else { + /* Healthcheck failed */ + if (in_start_period) { + /* During start period - failures don't count, show "starting" status */ + ninfof("Healthcheck failure ignored during start period (elapsed: %lds, start_period: %ds)", elapsed, + timer->config.start_period); + timer->status = HEALTHCHECK_STARTING; + healthcheck_send_status_update(timer->container_id, timer->status, exit_code); + } else { + /* After start period - failures count against retry limit */ + ninfof("Healthcheck failure counts after start period (elapsed: %lds, start_period: %ds)", elapsed, + timer->config.start_period); + if (timer->consecutive_failures < UINT_MAX) { + timer->consecutive_failures++; + } + if (timer->consecutive_failures >= timer->config.retries) { + timer->status = HEALTHCHECK_UNHEALTHY; + } + healthcheck_send_status_update(timer->container_id, timer->status, exit_code); + } + } + + timer->last_check_time = time(NULL); + + /* Continue the timer */ + return G_SOURCE_CONTINUE; +} diff --git a/src/healthcheck.h b/src/healthcheck.h new file mode 100644 index 00000000..548a4d10 --- /dev/null +++ b/src/healthcheck.h @@ -0,0 +1,74 @@ +#ifndef HEALTHCHECK_H +#define HEALTHCHECK_H + +#include +#include +#include + +/* Healthcheck status constants */ +#define HEALTHCHECK_NONE 0 +#define HEALTHCHECK_STARTING 1 +#define HEALTHCHECK_HEALTHY 2 +#define HEALTHCHECK_UNHEALTHY 3 + +/* Static string constants for healthcheck statuses */ +extern const char *healthcheck_status_strings[]; + +/* Healthcheck configuration structure */ +typedef struct { + char **test; /* Healthcheck command array */ + int interval; /* Interval between checks (seconds) */ + int timeout; /* Timeout for each check (seconds) */ + int start_period; /* Grace period before first failure counts (seconds) */ + unsigned int retries; /* Number of consecutive failures before marking unhealthy */ + bool enabled; /* Whether healthcheck is enabled */ +} healthcheck_config_t; + +/* Healthcheck timer structure */ +typedef struct { + char *container_id; /* Container ID */ + healthcheck_config_t config; /* Healthcheck configuration */ + int status; /* Current healthcheck status */ + unsigned int consecutive_failures; /* Number of consecutive failures */ + int start_period_remaining; /* Remaining start period (seconds) - DEPRECATED, use start_time */ + bool timer_active; /* Whether timer is currently active */ + guint timer_id; /* GLib timer ID */ + time_t last_check_time; /* Time of last healthcheck */ + time_t start_time; /* Time when timer started (for elapsed time calculation) */ +} healthcheck_timer_t; + +/* Healthcheck message types for communication with Podman */ +#define HEALTHCHECK_MSG_STATUS_UPDATE -100 + +/* Global healthcheck timer (one per conmon instance) */ +extern healthcheck_timer_t *active_healthcheck_timer; + +/* Healthcheck subsystem management */ +void healthcheck_cleanup(void); + +/* Healthcheck configuration management */ +void healthcheck_config_free(healthcheck_config_t *config); + +/* Healthcheck timer management */ +healthcheck_timer_t *healthcheck_timer_new(const char *container_id, const healthcheck_config_t *config); +void healthcheck_timer_free(healthcheck_timer_t *timer); +bool healthcheck_timer_start(healthcheck_timer_t *timer); +void healthcheck_timer_stop(healthcheck_timer_t *timer); + +/* Healthcheck command execution */ +bool healthcheck_execute_command(const healthcheck_config_t *config, const char *container_id, const char *runtime_path, int *exit_code); + +/* Healthcheck status utilities */ +const char *healthcheck_status_to_string(int status); + +/* Healthcheck status reporting */ +bool healthcheck_send_status_update(const char *container_id, int status, int exit_code); + +/* Healthcheck configuration validation */ +bool healthcheck_validate_config(const healthcheck_config_t *config); + +/* GLib timer callback function */ +gboolean healthcheck_timer_callback(gpointer user_data); +gboolean healthcheck_delayed_start_callback(gpointer user_data); + +#endif /* HEALTHCHECK_H */ diff --git a/test/10-healthcheck-cli-parsing.bats b/test/10-healthcheck-cli-parsing.bats new file mode 100755 index 00000000..5cb44bbe --- /dev/null +++ b/test/10-healthcheck-cli-parsing.bats @@ -0,0 +1,245 @@ +#!/usr/bin/env bats + +load test_helper + +@test "healthcheck help shows new CLI options" { + # Test that the help output shows the new healthcheck CLI options + run $CONMON_BINARY --help + [ "$status" -eq 0 ] + [[ "$output" == *"healthcheck-cmd"* ]] + [[ "$output" == *"healthcheck-arg"* ]] + [[ "$output" == *"healthcheck-interval"* ]] + [[ "$output" == *"healthcheck-timeout"* ]] + [[ "$output" == *"healthcheck-retries"* ]] + [[ "$output" == *"healthcheck-start-period"* ]] +} + +@test "healthcheck CLI options are parsed correctly" { + # Test that healthcheck CLI options are properly parsed with command and arguments + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --healthcheck-cmd echo --healthcheck-arg healthy --healthcheck-interval 30 --healthcheck-timeout 10 --healthcheck-retries 3 --healthcheck-start-period 0 --version + [ "$status" -eq 0 ] +} + +@test "healthcheck requires --healthcheck-cmd to be specified" { + # Test that healthcheck parameters without --healthcheck-cmd are ignored + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --healthcheck-interval 30 --healthcheck-timeout 10 --healthcheck-retries 3 --healthcheck-start-period 0 --version + [ "$status" -eq 0 ] +} + +@test "healthcheck fails when interval provided without cmd" { + # Test that conmon fails when healthcheck interval is provided without --healthcheck-cmd + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --log-path /tmp/test.log --healthcheck-interval 30 + [ "$status" -ne 0 ] + # Check if the error is related to healthcheck validation + [[ "$output" == *"healthcheck"* ]] || [[ "$stderr" == *"healthcheck"* ]] || [[ "$output" == *"cmd"* ]] || [[ "$stderr" == *"cmd"* ]] +} + +@test "healthcheck with minimal required arguments" { + # Test healthcheck with only --healthcheck-cmd specified + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --healthcheck-cmd echo --version + [ "$status" -eq 0 ] +} + +@test "healthcheck with all parameters specified" { + # Test healthcheck with all parameters specified using cmd and args format + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --healthcheck-cmd curl --healthcheck-arg -f --healthcheck-arg http://localhost:8080/health --healthcheck-interval 60 --healthcheck-timeout 30 --healthcheck-retries 5 --healthcheck-start-period 120 --version + [ "$status" -eq 0 ] +} + +@test "healthcheck parameters accept valid values" { + # Test that healthcheck parameters accept reasonable values with cmd and args + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --healthcheck-cmd echo --healthcheck-arg test --healthcheck-interval 1 --healthcheck-timeout 1 --healthcheck-retries 1 --healthcheck-start-period 0 --version + [ "$status" -eq 0 ] +} + + +@test "healthcheck with command arguments" { + # Test healthcheck with command and arguments using new format + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --log-path /tmp/test.log --healthcheck-cmd /bin/sh --healthcheck-arg -c --healthcheck-arg "echo hello world" + [ "$status" -eq 0 ] +} + +@test "healthcheck with shell command and multiple arguments" { + # Test healthcheck with shell command and multiple arguments + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --log-path /tmp/test.log --healthcheck-cmd /bin/sh --healthcheck-arg -c --healthcheck-arg "curl -f http://localhost:8080/health && echo healthy" + [ "$status" -eq 0 ] +} + +@test "healthcheck with complex command structure" { + # Test healthcheck with complex command structure using multiple args + run $CONMON_BINARY --bundle /tmp --cid test --cuuid test --runtime /bin/true --log-path /tmp/test.log --healthcheck-cmd python3 --healthcheck-arg -c --healthcheck-arg "import requests; requests.get('http://localhost:8080/health').raise_for_status()" + [ "$status" -eq 0 ] +} + +@test "healthcheck fails when timeout is greater than interval" { + # Skip if runtime binary is not available + check_runtime_binary + + # Setup container environment to trigger validation + setup_container_env "/busybox sleep 10" + + # Test that conmon fails when healthcheck timeout is greater than interval + # Use --sync and --syslog to get logs in journald + # Run conmon in background to get PID for journal log retrieval + local healthcheck_interval=2 + local healthcheck_timeout=5 + + echo "Starting conmon with invalid healthcheck config (timeout $healthcheck_timeout > interval $healthcheck_interval)..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" "$healthcheck_interval" "$healthcheck_timeout" 1 0 "$LOG_PATH" "" "echo") + echo "Conmon started with PID: $conmon_pid" + + # Wait a moment for conmon to start and potentially fail + sleep 2 + + # Find the actual conmon PID (may be forked) + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + echo "Using conmon PID: $actual_conmon_pid" + + # Get conmon journal entries to check for validation error + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + echo "Journal output:" + echo "$journal_output" + + # Clean up before assertion + cleanup_test_env + + # Test should pass if we found the validation error message + [[ "$journal_output" == *"Healthcheck timeout $healthcheck_timeout cannot be greater than interval $healthcheck_interval"* ]] +} + +@test "healthcheck fails when interval is invalid" { + # Skip if runtime binary is not available + check_runtime_binary + + + # Setup container environment to trigger validation + setup_container_env "/busybox sleep 1000" + + # Test that conmon fails when healthcheck interval is invalid (0) + local healthcheck_interval=0 + local healthcheck_timeout=2 + + echo "Starting conmon with invalid healthcheck interval ($healthcheck_interval)..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" "$healthcheck_interval" "$healthcheck_timeout" 1 0 "$LOG_PATH" "" "echo") + sleep 2 + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up before assertion + cleanup_test_env + + # Test should pass if we found the validation error message + [[ "$journal_output" == *"Healthcheck interval $healthcheck_interval is out of range"* ]] +} + +@test "healthcheck fails when timeout is invalid" { + # Skip if runtime binary is not available + check_runtime_binary + + + # Setup container environment to trigger validation + setup_container_env "/busybox sleep 10" + + # Test that conmon fails when healthcheck timeout is invalid (0) + local healthcheck_interval=5 + local healthcheck_timeout=0 + + echo "Starting conmon with invalid healthcheck timeout ($healthcheck_timeout)..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" "$healthcheck_interval" "$healthcheck_timeout" 1 0 "$LOG_PATH" "" "echo") + sleep 2 + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up before assertion + cleanup_test_env + + # Test should pass if we found the validation error message + [[ "$journal_output" == *"Healthcheck timeout $healthcheck_timeout is out of range"* ]] +} + +@test "healthcheck fails when start_period is invalid" { + # Skip if runtime binary is not available + check_runtime_binary + + + # Setup container environment to trigger validation + setup_container_env "/busybox sleep 1000" + + # Test that conmon fails when healthcheck start_period is invalid (3601 - beyond max) + local healthcheck_interval=5 + local healthcheck_timeout=2 + local healthcheck_start_period=3601 + + echo "Starting conmon with invalid healthcheck start_period ($healthcheck_start_period)..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" "$healthcheck_interval" "$healthcheck_timeout" 1 "$healthcheck_start_period" "$LOG_PATH" "" "echo") + echo "conmon pid: $conmon_pid" + ps -ef | grep $conmon_pid + echo "CTR_ID: $CTR_ID" + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + echo "actual_conmon_pid: $actual_conmon_pid" + + sleep 2 + ps -ef | grep $actual_conmon_pid + + # Wait for validation error with polling like integration tests + echo "Waiting for validation error (max 10 seconds)..." + + local validation_found=false + local max_wait=5 + local wait_time=0 + local journal_output="" + + while [[ $wait_time -lt $max_wait ]]; do + # Get conmon journal entries + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + if [[ -n "$journal_output" ]]; then + # Check for validation error message + if echo "$journal_output" | grep -q "Healthcheck start period $healthcheck_start_period is out of range \[0, 3600\]"; then + echo "✅ Found validation error message in journal output after ${wait_time}s!" + validation_found=true + break + fi + fi + + sleep 1 + wait_time=$((wait_time + 1)) + echo "Checked at ${wait_time}s, still waiting..." + done + + # Clean up before assertion + cleanup_test_env + + # Test should pass if conmon exited due to validation failure + # We can't check journal output for this validation, so just check that conmon exited + ! kill -0 "$conmon_pid" 2>/dev/null +} + +@test "healthcheck fails when retries is invalid" { + # Skip if runtime binary is not available + check_runtime_binary + + # Setup container environment to trigger validation + setup_container_env "/busybox sleep 10" + + # Test that conmon fails when healthcheck retries is invalid (101) + local healthcheck_interval=5 + local healthcheck_timeout=2 + local healthcheck_retries=101 + + echo "Starting conmon with invalid healthcheck retries ($healthcheck_retries)..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" "$healthcheck_interval" "$healthcheck_timeout" "$healthcheck_retries" 0 "$LOG_PATH" "" "echo") + sleep 2 + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up before assertion + cleanup_test_env + + # Test should pass if we found the validation error message + [[ "$journal_output" == *"Healthcheck retries $healthcheck_retries is out of range"* ]] +} diff --git a/test/10-healthcheck-integration.bats b/test/10-healthcheck-integration.bats new file mode 100644 index 00000000..1affc1b7 --- /dev/null +++ b/test/10-healthcheck-integration.bats @@ -0,0 +1,262 @@ +#!/usr/bin/env bats + +load test_helper + +setup() { + check_conmon_binary + check_runtime_binary +} + +# Cleanup function for conmon and container +cleanup_test_resources() { + local conmon_pid="$1" + local ctr_id="$2" + + echo "Cleaning up test resources..." + + # Check if conmon is still running + if kill -0 "$conmon_pid" 2>/dev/null; then + echo "Conmon is still running, killing process..." + kill "$conmon_pid" 2>/dev/null || true + wait "$conmon_pid" 2>/dev/null || true + echo "Conmon terminated" + else + echo "Conmon exited on its own" + fi + + # Clean up container + if [[ -n "$ctr_id" ]]; then + echo "Cleaning up container $ctr_id" + "$RUNTIME_BINARY" delete -f "$ctr_id" 2>/dev/null || true + fi + cleanup_tmpdir + + echo "Test cleanup completed" +} + + +@test "healthcheck timeout enforcement integration" { + setup_container_env "/busybox sleep 1000" + + # Define healthcheck timeout (must match --healthcheck-timeout) + local healthcheck_timeout=2 + + # Create a slow healthcheck script that will timeout + cat > "$ROOTFS/bin/slow_healthcheck" << 'EOF' +#!/bin/sh +echo "Starting slow healthcheck..." +sleep 10 +echo "Slow healthcheck completed" +exit 0 +EOF + chmod +x "$ROOTFS/bin/slow_healthcheck" + + echo "Starting conmon with healthcheck timeout test..." + echo "Container ID: $CTR_ID" + echo "Healthcheck will timeout after ${healthcheck_timeout} seconds (script sleeps for 10 seconds)" + + # Run conmon in background and manually control the timeout + echo "Starting conmon in background..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" 9 "$healthcheck_timeout" 1 0 "$LOG_PATH" "trace" sleep 10) + echo "Conmon started with PID: $conmon_pid" + + echo "Waiting for healthcheck timeout (max 15 seconds)..." + sleep 15 + + # Get final journal output with retry for log buffering + journal_output=$(get_conmon_journal_output "$conmon_pid") + + # Clean up + CONMON_PROCESS_DETAILS=$(ps -ef | grep "$conmon_pid") + # Should find timeout message + if echo "$journal_output" | grep -q "Healthcheck command timed out after ${healthcheck_timeout} seconds"; then + echo "✅ Found timeout message in journal output!" + cleanup_test_resources "$conmon_pid" "$CTR_ID" + true + else + echo "❌ Timeout message not found" + echo "Journal output:" + echo "$journal_output" + echo "Conmon process details:" + echo "$CONMON_PROCESS_DETAILS" + cleanup_test_resources "$conmon_pid" "$CTR_ID" + false + fi +} + +@test "healthcheck start_period timing validation" { + setup_container_env "/busybox sleep 1000" + + # Define healthcheck parameters + local healthcheck_interval=2 + local healthcheck_start_period=5 + local healthcheck_timeout=1 + + echo "Starting conmon with start_period timing test..." + echo "Container ID: $CTR_ID" + echo "Healthcheck interval: ${healthcheck_interval}s, start_period: ${healthcheck_start_period}s" + echo "Expected behavior: healthcheck failures ignored until start_period (5s), then failures count" + + # Run conmon in background + echo "Starting conmon in background..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" "$healthcheck_interval" "$healthcheck_timeout" 0 "$healthcheck_start_period" "" "" false) + echo "Conmon started with PID: $conmon_pid" + + # Find the actual conmon PID (may be forked) + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + echo "Using conmon PID: $actual_conmon_pid" + + + # Wait for healthcheck execution and start_period behavior + echo "Waiting 10 seconds for healthcheck execution and start_period behavior..." + sleep 10 + + # Expected timeline: + # 0s: healthcheck fails but ignored (start_period not reached) + # 2s: healthcheck fails but ignored (start_period not reached) + # 4s: healthcheck fails but ignored (start_period not reached) + # 5s: start_period over + # 6s: healthcheck fails and counts as failure + # 7s: done. + + # Get final journal output with retry for log buffering + journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up + cleanup_test_resources "$conmon_pid" "$CTR_ID" + + # Validate start_period behavior + if [[ -n "$journal_output" ]]; then + local ignored_count=$(echo "$journal_output" | grep -c "Healthcheck failure ignored during start period" 2>/dev/null | tr -d '\n' || echo "0") + local counts_count=$(echo "$journal_output" | grep -c "Healthcheck failure counts after start period" 2>/dev/null | tr -d '\n' || echo "0") + + # Print the counts for visibility + echo "Healthcheck counts:" + echo " - Ignored during start_period: $ignored_count" + echo " - Counted after start_period: $counts_count" + + # Validate that we have proper start_period behavior + # Expected: at least 2 ignored during start_period, at least 1 counted after start_period + # Note: Timing may vary between local and CI environments + if [[ $ignored_count -ge 2 && $counts_count -ge 1 ]]; then + echo "✅ Healthcheck start_period behavior validated" + true + else + echo "❌ Insufficient start_period behavior" + false + fi + else + echo "❌ No journal output found" + false + fi + +} + +@test "healthcheck command execution failure - non-existent command" { + # Test healthcheck with non-existent command + setup_container_env "/busybox sleep 10" + + echo "Testing healthcheck with non-existent command..." + local conmon_pid=$(start_conmon_healthcheck "/nonexistent/command" 2 1 1 0 "$LOG_PATH" "") + sleep 6 + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up + cleanup_test_resources "$conmon_pid" "$CTR_ID" + + # Should find command execution failure + [[ "$journal_output" == *"Healthcheck command failed"* ]] || [[ "$journal_output" == *"Failed to execute healthcheck command"* ]] +} + +@test "healthcheck command execution failure - command with stderr output" { + # Test healthcheck with command that outputs to stderr + setup_container_env "/busybox sleep 10" + + echo "Testing healthcheck with command that outputs to stderr..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" 2 1 1 0 "$LOG_PATH" "" sh -c "echo 'This is an error message' >&2; exit 1") + sleep 6 + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up + cleanup_test_resources "$conmon_pid" "$CTR_ID" + + # Should find stderr output in logs + [[ "$journal_output" == *"Healthcheck command stderr: This is an error message"* ]] +} + +@test "healthcheck command timeout handling" { + # Test healthcheck command timeout + setup_container_env "/busybox sleep 10" + + echo "Testing healthcheck command timeout..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" 5 2 1 0 "$LOG_PATH" "" sleep 10) + sleep 6 + + local journal_output=$(get_conmon_journal_output "$conmon_pid") + + # Clean up + cleanup_test_resources "$conmon_pid" "$CTR_ID" + + # Should find timeout message + [[ "$journal_output" == *"Healthcheck command timed out after 2 seconds"* ]] +} + +@test "healthcheck command signal termination" { + # Test healthcheck command terminated by external signal + setup_container_env "/busybox sleep 10" + + echo "Testing healthcheck command signal termination..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" 35 30 1 0 "$LOG_PATH" "" sleep 30) + sleep 5 # Wait for healthcheck to start + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + + # Find the healthcheck process and send SIGTERM to it + echo "Looking for healthcheck processes..." + ps aux | grep "sleep 30" | grep -v grep + local healthcheck_pid=$(pgrep -f "sleep 30" | head -1) + if [[ -n "$healthcheck_pid" ]]; then + echo "Found healthcheck process PID: $healthcheck_pid, sending SIGKILL..." + kill -SIGKILL "$healthcheck_pid" + sleep 3 # Wait for signal to be processed + else + echo "No healthcheck process found!" + fi + + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Debug: show what we actually got + echo "Journal output:" + echo "$journal_output" + + # Clean up + cleanup_test_resources "$conmon_pid" "$CTR_ID" + + # Should find signal termination message (SIGKILL = signal 9) + # SIGKILL cannot be caught, so it should trigger WIFSIGNALED + [[ "$journal_output" == *"Healthcheck command terminated by signal 9"* ]] +} + +@test "healthcheck with maximum retries exceeded" { + # Test healthcheck with retries exceeded + setup_container_env "/busybox sleep 10" + + echo "Testing healthcheck with retries exceeded..." + local conmon_pid=$(start_conmon_healthcheck "/busybox" 1 1 2 0 "$LOG_PATH" "" false) + sleep 5 + + local actual_conmon_pid=$(find_conmon_forked_pid "$conmon_pid" "$CTR_ID") + local journal_output=$(get_conmon_journal_output "$actual_conmon_pid") + + # Clean up + cleanup_test_resources "$conmon_pid" "$CTR_ID" + + # Should find multiple failure messages and eventually unhealthy status + local failure_count=$(echo "$journal_output" | grep -c "Healthcheck command failed" 2>/dev/null || echo "0") + [[ $failure_count -ge 2 ]] +} + diff --git a/test/test_helper.bash b/test/test_helper.bash index f79ee4ef..45452356 100644 --- a/test/test_helper.bash +++ b/test/test_helper.bash @@ -653,3 +653,72 @@ function die() { echo "#\\^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" >&2 bail-now } + +# Find the actual conmon PID when conmon forks (even with --sync) +# Returns the forked conmon PID, or the original PID if no fork is found +find_conmon_forked_pid() { + local original_pid="$1" + local container_id="$2" + + if [[ -z "$original_pid" || -z "$container_id" ]]; then + echo "$original_pid" + return + fi + + # Look for forked conmon processes + local forked_pids=$(pgrep -f "conmon.*$container_id" 2>/dev/null || echo "") + if [[ -n "$forked_pids" ]]; then + # Use the first one that's not our original PID + for pid in $forked_pids; do + if [[ "$pid" != "$original_pid" ]]; then + echo "$pid" + return + fi + done + fi + + # Return original PID if no fork found + echo "$original_pid" +} + +# Unified function to start conmon with healthcheck parameters +# Usage: start_conmon_healthcheck [retries] [start_period] [log_path] [log_level] [args...] +start_conmon_healthcheck() { + local healthcheck_cmd="${1:-/busybox}" + local healthcheck_interval="$2" + local healthcheck_timeout="$3" + local healthcheck_retries="${4:-1}" + local healthcheck_start_period="$5" + local log_path="${6:-k8s-file:$LOG_PATH}" + local log_level="${7:-trace}" + shift 7 # Remove the first 7 parameters, leaving any healthcheck args + + local cmd_args=() + cmd_args+=( + --cid "$CTR_ID" + --cuuid "$CTR_ID" + --runtime "$RUNTIME_BINARY" + --bundle "$BUNDLE_PATH" + --log-path "$log_path" + --syslog + --healthcheck-cmd "$healthcheck_cmd" + --healthcheck-interval "$healthcheck_interval" + --healthcheck-timeout "$healthcheck_timeout" + --healthcheck-retries "$healthcheck_retries" + --healthcheck-start-period "$healthcheck_start_period" + --socket-dir-path "$SOCKET_PATH" + --container-pidfile "$PID_FILE" + --conmon-pidfile "$CONMON_PID_FILE" + --log-level "$log_level" + --sync + ) + + # Add any healthcheck arguments + for arg in "$@"; do + cmd_args+=(--healthcheck-arg "$arg") + done + + "$CONMON_BINARY" "${cmd_args[@]}" & + echo $! +} +