Skip to content

Commit e62dd5c

Browse files
author
Samuel Archambault
committed
healthcheck feature
Signed-off-by: Samuel Archambault <samuel.archambault@getmaintainx.com>
1 parent 42da7ac commit e62dd5c

File tree

9 files changed

+1297
-3
lines changed

9 files changed

+1297
-3
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ LIBEXECDIR ?= ${PREFIX}/libexec
55
PKG_CONFIG ?= pkg-config
66
HEADERS := $(wildcard src/*.h)
77

8-
OBJS := src/conmon.o src/cmsg.o src/ctr_logging.o src/utils.o src/cli.o src/globals.o src/cgroup.o src/conn_sock.o src/oom.o src/ctrl.o src/ctr_stdio.o src/parent_pipe_fd.o src/ctr_exit.o src/runtime_args.o src/close_fds.o src/seccomp_notify.o
8+
OBJS := src/conmon.o src/cmsg.o src/ctr_logging.o src/utils.o src/cli.o src/globals.o src/cgroup.o src/conn_sock.o src/oom.o src/ctrl.o src/ctr_stdio.o src/parent_pipe_fd.o src/ctr_exit.o src/runtime_args.o src/close_fds.o src/seccomp_notify.o src/healthcheck.o
99

1010
MAKEFILE_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
1111

src/cli.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ char *opt_seccomp_notify_plugins = NULL;
6161
gboolean opt_log_rotate = FALSE;
6262
int opt_log_max_files = 1;
6363
gchar **opt_log_allowlist_dirs = NULL;
64+
char *opt_healthcheck_cmd = NULL;
65+
gchar **opt_healthcheck_args = NULL;
66+
int opt_healthcheck_interval = -1;
67+
int opt_healthcheck_timeout = -1;
68+
int opt_healthcheck_retries = -1;
69+
int opt_healthcheck_start_period = -1;
6470
GOptionEntry opt_entries[] = {
6571
{"api-version", 0, 0, G_OPTION_ARG_NONE, &opt_api_version, "Conmon API version to use", NULL},
6672
{"bundle", 'b', 0, G_OPTION_ARG_STRING, &opt_bundle_path, "Location of the OCI Bundle path", NULL},
@@ -125,6 +131,15 @@ GOptionEntry opt_entries[] = {
125131
NULL},
126132
{"log-max-files", 0, 0, G_OPTION_ARG_INT, &opt_log_max_files, "Number of backup log files to keep (default: 1)", NULL},
127133
{"log-allowlist-dir", 0, 0, G_OPTION_ARG_STRING_ARRAY, &opt_log_allowlist_dirs, "Allowed log directory", NULL},
134+
{"healthcheck-cmd", 0, 0, G_OPTION_ARG_STRING, &opt_healthcheck_cmd, "Healthcheck command to execute", NULL},
135+
{"healthcheck-arg", 0, 0, G_OPTION_ARG_STRING_ARRAY, &opt_healthcheck_args,
136+
"Healthcheck command arguments (can be used multiple times)", NULL},
137+
{"healthcheck-interval", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_interval, "Healthcheck interval in seconds (default: 30)", NULL},
138+
{"healthcheck-timeout", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_timeout, "Healthcheck timeout in seconds (default: 30)", NULL},
139+
{"healthcheck-retries", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_retries,
140+
"Number of consecutive failures before marking unhealthy (default: 3)", NULL},
141+
{"healthcheck-start-period", 0, 0, G_OPTION_ARG_INT, &opt_healthcheck_start_period,
142+
"Start period in seconds before healthchecks start counting failures (default: 0)", NULL},
128143
{NULL, 0, 0, 0, NULL, NULL, NULL}};
129144

130145

@@ -228,4 +243,11 @@ void process_cli()
228243
if (opt_no_container_partial_message && !logging_is_journald_enabled()) {
229244
nwarnf("--no-container-partial-message has no effect without journald log driver");
230245
}
246+
247+
/* Validate healthcheck parameters: reject any healthcheck option that was provided without --healthcheck-cmd */
248+
if (opt_healthcheck_cmd == NULL
249+
&& (opt_healthcheck_interval != -1 || opt_healthcheck_timeout != -1 || opt_healthcheck_retries != -1
250+
|| opt_healthcheck_start_period != -1 || opt_healthcheck_args != NULL)) {
251+
nexit("Healthcheck parameters specified without --healthcheck-cmd. Please provide --healthcheck-cmd to enable healthcheck functionality.");
252+
}
231253
}

src/cli.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ extern char *opt_seccomp_notify_plugins;
5050
extern gboolean opt_log_rotate;
5151
extern int opt_log_max_files;
5252
extern gchar **opt_log_allowlist_dirs;
53+
extern char *opt_healthcheck_cmd;
54+
extern gchar **opt_healthcheck_args;
55+
extern int opt_healthcheck_interval;
56+
extern int opt_healthcheck_timeout;
57+
extern int opt_healthcheck_retries;
58+
extern int opt_healthcheck_start_period;
5359
extern GOptionEntry opt_entries[];
5460
extern gboolean opt_full_attach_path;
5561

src/conmon.c

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "close_fds.h"
2121
#include "seccomp_notify.h"
2222
#include "runtime_args.h"
23+
#include "healthcheck.h"
2324

2425
#include <sys/stat.h>
2526
#include <locale.h>
@@ -46,7 +47,6 @@ int main(int argc, char *argv[])
4647
_cleanup_close_ int dev_null_r_cleanup = -1;
4748
_cleanup_close_ int dev_null_w_cleanup = -1;
4849
_cleanup_close_ int dummyfd = -1;
49-
5050
int initialize_ec = initialize_cli(argc, argv);
5151
if (initialize_ec >= 0) {
5252
exit(initialize_ec);
@@ -396,7 +396,6 @@ int main(int argc, char *argv[])
396396
}
397397

398398
container_pid = atoi(contents);
399-
ndebugf("container PID: %d", container_pid);
400399

401400
g_hash_table_insert(pid_to_handler, (pid_t *)&container_pid, container_exit_cb);
402401

@@ -408,6 +407,87 @@ int main(int argc, char *argv[])
408407
if ((opt_api_version >= 1 || !opt_exec) && sync_pipe_fd >= 0)
409408
write_or_close_sync_fd(&sync_pipe_fd, container_pid, NULL);
410409

410+
/* Start healthcheck timers if healthcheck command is provided */
411+
if (opt_healthcheck_cmd != NULL) {
412+
413+
healthcheck_config_t config;
414+
memset(&config, 0, sizeof(config));
415+
416+
/* Parse healthcheck command and arguments into array */
417+
/* Count the command plus its arguments; the slot for the NULL terminator is added at allocation */
418+
int argc = 1; // At least the command
419+
if (opt_healthcheck_args != NULL) {
420+
for (int i = 0; opt_healthcheck_args[i] != NULL; i++) {
421+
argc++;
422+
}
423+
}
424+
425+
/* Allocate array for command and arguments */
426+
config.test = calloc(argc + 1, sizeof(char *));
427+
if (config.test == NULL) {
428+
pexit("Failed to allocate memory for healthcheck command");
429+
}
430+
431+
/* Copy command */
432+
config.test[0] = strdup(opt_healthcheck_cmd);
433+
if (config.test[0] == NULL) {
434+
pexit("Failed to duplicate healthcheck command");
435+
}
436+
437+
/* Copy arguments */
438+
if (opt_healthcheck_args != NULL) {
439+
for (int i = 0; opt_healthcheck_args[i] != NULL; i++) {
440+
config.test[i + 1] = strdup(opt_healthcheck_args[i]);
441+
if (config.test[i + 1] == NULL) {
442+
/* Clean up on error */
443+
for (int j = 0; j <= i; j++) {
444+
free(config.test[j]);
445+
}
446+
free(config.test);
447+
pexit("Failed to duplicate healthcheck argument");
448+
}
449+
}
450+
}
451+
config.test[argc] = NULL; /* NULL terminator */
452+
453+
/* Set healthcheck parameters from CLI, using defaults for -1 values */
454+
config.enabled = true;
455+
config.interval = opt_healthcheck_interval != -1 ? opt_healthcheck_interval : 30;
456+
config.timeout = opt_healthcheck_timeout != -1 ? opt_healthcheck_timeout : 30;
457+
config.retries = opt_healthcheck_retries != -1 ? opt_healthcheck_retries : 3;
458+
/* First healthcheck runs immediately, then after 'interval' seconds.
459+
* The start period defaults to 10 seconds to allow the container to fully initialize.
460+
* If the user knows the container will take less time to initialize, they can set the start_period to a lower value.
461+
*/
462+
config.start_period = opt_healthcheck_start_period != -1 ? opt_healthcheck_start_period : 10;
463+
464+
/* Validate healthcheck configuration */
465+
if (!healthcheck_validate_config(&config)) {
466+
nwarnf("Invalid healthcheck configuration for container %s", opt_cid);
467+
healthcheck_config_free(&config);
468+
return 1;
469+
}
470+
471+
healthcheck_timer_t *timer = healthcheck_timer_new(opt_cid, &config);
472+
if (timer != NULL) {
473+
/* Start the healthcheck after a 3-second delay, in addition to the start period
474+
(10 seconds by default), to allow the container to fully initialize.
475+
*/
476+
if (g_timeout_add_seconds(3, healthcheck_delayed_start_callback, timer)) {
477+
active_healthcheck_timer = timer;
478+
ninfof("Scheduled healthcheck for container %s (will start after 3s delay)", opt_cid);
479+
} else {
480+
nwarnf("Failed to schedule delayed healthcheck for container %s", opt_cid);
481+
healthcheck_timer_free(timer);
482+
}
483+
} else {
484+
nwarnf("Failed to create healthcheck timer for container %s", opt_cid);
485+
}
486+
487+
/* Always free the config, regardless of success or failure */
488+
healthcheck_config_free(&config);
489+
}
490+
411491
#ifdef __linux__
412492
setup_oom_handling(container_pid);
413493
#endif
@@ -495,6 +575,9 @@ int main(int argc, char *argv[])
495575
g_source_remove(signal_fd_tag);
496576
close(signal_fd);
497577

578+
/* Cleanup healthcheck timers */
579+
healthcheck_cleanup();
580+
498581
/*
499582
* Podman injects some fd's into the conmon process so that exposed ports are kept busy while
500583
* the container runs. Close them before we notify the container exited, so that they can be

0 commit comments

Comments
 (0)