Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 97 additions & 5 deletions docker/standalone/start-all.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,28 @@
#!/bin/bash
set -e

# =============================================================================
# Embedded pg0 data integrity check (#675)
#
# When using embedded pg0, check if the data directory has existing PostgreSQL
# data before starting. If the directory exists but appears empty/corrupt
# (e.g., missing PG_VERSION file), log a warning. This helps diagnose data
# loss scenarios where a container restart caused the data directory to be
# wiped despite a volume mount being present.
# =============================================================================
PG0_DATA_DIR="${HOME}/.pg0"
if [ -d "$PG0_DATA_DIR" ]; then
# Look for actual PostgreSQL data directories (pg0 creates subdirs per instance)
if compgen -G "$PG0_DATA_DIR"/*/PG_VERSION > /dev/null 2>&1; then
echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR"
elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then
echo "⚠️ WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found."
echo " This may indicate data corruption or an incomplete previous shutdown."
echo " If you see all migrations running from scratch after this, your data may have been lost."
echo " See: https://github.com/vectorize-io/hindsight/issues/675"
fi
fi

# Service flags (default to true if not set)
ENABLE_API="${HINDSIGHT_ENABLE_API:-true}"
ENABLE_CP="${HINDSIGHT_ENABLE_CP:-true}"
Expand Down Expand Up @@ -71,6 +93,63 @@ if [ "${HINDSIGHT_WAIT_FOR_DEPS:-false}" = "true" ]; then
done
fi

# =============================================================================
# Graceful shutdown handler (#675)
#
# Docker sends SIGTERM on `docker stop`/`docker restart`. Without a trap, child
# processes (hindsight-api + pg0, control-plane) are killed abruptly. For the
# embedded pg0 database this can cause data loss when the data directory is on
# a Docker volume that gets remounted after restart.
#
# The trap forwards SIGTERM to all tracked child PIDs so that:
# - hindsight-api receives the signal and can run its shutdown hooks
# - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush)
# - The control-plane Node.js process exits cleanly
# =============================================================================
# Guard against concurrent cleanup (e.g., child crash + SIGTERM arriving together)
SHUTTING_DOWN=false

cleanup() {
if $SHUTTING_DOWN; then return; fi
SHUTTING_DOWN=true

echo ""
echo "🛑 Received shutdown signal, stopping services gracefully..."
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
kill -TERM "$pid" 2>/dev/null
fi
done
# Give processes time to shut down cleanly (pg0 needs to flush WAL).
# NOTE: Docker's default stop_grace_period is 10s. If you use the default,
# either set stop_grace_period: 30s in your compose file / docker stop -t 30,
# or Docker will SIGKILL the container before this timeout expires.
local timeout=30
for ((i=1; i<=timeout; i++)); do
local all_stopped=true
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
all_stopped=false
break
fi
done
if $all_stopped; then
echo "✅ All services stopped cleanly"
exit 0
fi
sleep 1
done
# Force kill if still running after timeout
echo "⚠️ Timeout reached, forcing shutdown..."
for pid in "${PIDS[@]}"; do
if kill -0 "$pid" 2>/dev/null; then
kill -9 "$pid" 2>/dev/null
fi
done
exit 1
}
trap cleanup SIGTERM SIGINT

# Track PIDs for wait
PIDS=()

Expand Down Expand Up @@ -138,8 +217,21 @@ if [ ${#PIDS[@]} -eq 0 ]; then
exit 1
fi

# Wait for any process to exit
wait -n

# Exit with status of first exited process
exit $?
# Wait for any process to exit (use wait -n with trap-safe loop)
while true; do
# wait -n returns when any child exits; it also returns on signal delivery
# (the trap handler will run and exit, so this loop is just for robustness).
# `&& true` prevents `set -e` from killing the script when wait -n returns
# non-zero (child exited with error or no backgrounded children remain).
wait -n && true
# Check if any tracked PID has exited
for pid in "${PIDS[@]}"; do
if ! kill -0 "$pid" 2>/dev/null; then
wait "$pid" 2>/dev/null
exit_code=$?
echo "⚠️ Service (PID $pid) exited with code $exit_code"
# Trigger cleanup for remaining services
cleanup
fi
done
done
Loading