vectorize-io · nicoloboschi · Mar 27, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh
@@ -1,6 +1,28 @@
 #!/bin/bash
 set -e
 
+# =============================================================================
+# Embedded pg0 data integrity check (#675)
+#
+# When using embedded pg0, check if the data directory has existing PostgreSQL
+# data before starting. If the directory exists but appears empty/corrupt
+# (e.g., missing PG_VERSION file), log a warning. This helps diagnose data
+# loss scenarios where a container restart caused the data directory to be
+# wiped despite a volume mount being present.
+# =============================================================================
+PG0_DATA_DIR="${HOME}/.pg0"
+if [ -d "$PG0_DATA_DIR" ]; then
+    # Look for actual PostgreSQL data directories (pg0 creates subdirs per instance)
+    if compgen -G "$PG0_DATA_DIR"/*/PG_VERSION > /dev/null 2>&1; then
+        echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR"
+    elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then
+        echo "⚠️  WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found."
+        echo "   This may indicate data corruption or an incomplete previous shutdown."
+        echo "   If you see all migrations running from scratch after this, your data may have been lost."
+        echo "   See: https://github.com/vectorize-io/hindsight/issues/675"
+    fi
+fi
+
 # Service flags (default to true if not set)
 ENABLE_API="${HINDSIGHT_ENABLE_API:-true}"
 ENABLE_CP="${HINDSIGHT_ENABLE_CP:-true}"
@@ -71,6 +93,63 @@ if [ "${HINDSIGHT_WAIT_FOR_DEPS:-false}" = "true" ]; then
     done
 fi
 
+# =============================================================================
+# Graceful shutdown handler (#675)
+#
+# Docker sends SIGTERM on `docker stop`/`docker restart`. Without a trap, child
+# processes (hindsight-api + pg0, control-plane) are killed abruptly. For the
+# embedded pg0 database this can cause data loss when the data directory is on
+# a Docker volume that gets remounted after restart.
+#
+# The trap forwards SIGTERM to all tracked child PIDs so that:
+#   - hindsight-api receives the signal and can run its shutdown hooks
+#   - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush)
+#   - The control-plane Node.js process exits cleanly
+# =============================================================================
+# Guard against concurrent cleanup (e.g., child crash + SIGTERM arriving together)
+SHUTTING_DOWN=false
+
+cleanup() {
+    if $SHUTTING_DOWN; then return; fi
+    SHUTTING_DOWN=true
+
+    echo ""
+    echo "🛑 Received shutdown signal, stopping services gracefully..."
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            kill -TERM "$pid" 2>/dev/null
+        fi
+    done
+    # Give processes time to shut down cleanly (pg0 needs to flush WAL).
+    # NOTE: Docker's default stop_grace_period is 10s. If you use the default,
+    # either set stop_grace_period: 30s in your compose file / docker stop -t 30,
+    # or Docker will SIGKILL the container before this timeout expires.
+    local timeout=30
+    for ((i=1; i<=timeout; i++)); do
+        local all_stopped=true
+        for pid in "${PIDS[@]}"; do
+            if kill -0 "$pid" 2>/dev/null; then
+                all_stopped=false
+                break
+            fi
+        done
+        if $all_stopped; then
+            echo "✅ All services stopped cleanly"
+            exit 0
+        fi
+        sleep 1
+    done
+    # Force kill if still running after timeout
+    echo "⚠️  Timeout reached, forcing shutdown..."
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            kill -9 "$pid" 2>/dev/null
+        fi
+    done
+    exit 1
+}
+trap cleanup SIGTERM SIGINT
+
 # Track PIDs for wait
 PIDS=()
 
@@ -138,8 +217,21 @@ if [ ${#PIDS[@]} -eq 0 ]; then
     exit 1
 fi
 
-# Wait for any process to exit
-wait -n
-
-# Exit with status of first exited process
-exit $?
+# Wait for any process to exit (use wait -n with trap-safe loop)
+while true; do
+    # wait -n returns when any child exits; it also returns on signal delivery
+    # (the trap handler will run and exit, so this loop is just for robustness).
+    # `&& true` prevents `set -e` from killing the script when wait -n returns
+    # non-zero (child exited with error or no backgrounded children remain).
+    wait -n && true
+    # Check if any tracked PID has exited
+    for pid in "${PIDS[@]}"; do
+        if ! kill -0 "$pid" 2>/dev/null; then
+            wait "$pid" 2>/dev/null
+            exit_code=$?
+            echo "⚠️  Service (PID $pid) exited with code $exit_code"
+            # Trigger cleanup for remaining services
+            cleanup
+        fi
+    done
+done