Add a web endpoint health check to the monitor loop.

  docker/Dockerfile   installs curl and copies web_healthcheck.sh into the image.
  newrunner.sh        renames XDBG_LOOP_PAUSE to MONITOR_LOOP_PAUSE and runs
                      web_healthcheck.sh before each pause.
  web_healthcheck.sh  probes each URL in WEB_HEALTHCHECK_ENDPOINTS and pushes a
                      web_endpoint_health gauge per endpoint to the Pushgateway.

NOTE(review): before the rename every xdbg call received XDBG_LOOP_PAUSE=0 in
its environment; it now receives MONITOR_LOOP_PAUSE=0 instead. Confirm xdbg does
not read XDBG_LOOP_PAUSE, otherwise those prefixes must keep the old name.

NOTE(review): blank context lines and indentation in the Dockerfile and
newrunner.sh hunks were reconstructed from the hunk counts; verify them against
the tree or apply with "git apply --ignore-whitespace". The blob hash on the
web_healthcheck.sh index line predates the script changes in this patch.

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2127e02..2c9dbf7 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -18,11 +18,12 @@ FROM debian:trixie-slim
 WORKDIR /work
 
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends tini bash ca-certificates procps libssl3 \
+    && apt-get install -y --no-install-recommends tini bash ca-certificates procps libssl3 curl \
     && rm -rf /var/lib/apt/lists/*
 
 COPY --from=builder /src/target/release/xdbg /usr/local/bin/xdbg
 COPY xmtp_debug/newrunner.sh /usr/local/bin/newrunner.sh
-RUN chmod +x /usr/local/bin/xdbg /usr/local/bin/newrunner.sh
+COPY xmtp_debug/web_healthcheck.sh /usr/local/bin/web_healthcheck.sh
+RUN chmod +x /usr/local/bin/xdbg /usr/local/bin/newrunner.sh /usr/local/bin/web_healthcheck.sh
 
 ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/newrunner.sh"]
\ No newline at end of file
diff --git a/newrunner.sh b/newrunner.sh
index 887f611..75f510e 100644
--- a/newrunner.sh
+++ b/newrunner.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-: "${XDBG_LOOP_PAUSE:=300}" # default interval between restarts
+: "${MONITOR_LOOP_PAUSE:=300}" # default interval between restarts
 
 function log {
     echo "[$(date '+%F %T')] $*"
@@ -19,23 +19,25 @@ log "WORKSPACE='${WORKSPACE:-}' -> backend='${BACKEND}'"
 
 while true; do
     log "Reset environment.."
-    XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" --clear
-    XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 5 --concurrency 1
-    XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
+    MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" --clear
+    MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 5 --concurrency 1
+    MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
     log "Reset complete, starting tests"
 
     for x in {1..10}; {
         log "Identities..."
-        XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 1 --concurrency 1
+        MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 1 --concurrency 1
         log "Sleeping 20s..."
         sleep 20
         log "Groups..."
-        XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
+        MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
         log "Sleeping 20s..."
         sleep 20
         log "Messages..."
-        XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity message --amount 1 --concurrency 1
-        log "Sleeping $XDBG_LOOP_PAUSE seconds..."
-        sleep $XDBG_LOOP_PAUSE
+        MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity message --amount 1 --concurrency 1
+        log "Running health checks..."
+        bash "$(dirname "$0")/web_healthcheck.sh"
+        log "Sleeping $MONITOR_LOOP_PAUSE seconds..."
+        sleep "$MONITOR_LOOP_PAUSE"
     }
 done
\ No newline at end of file
diff --git a/web_healthcheck.sh b/web_healthcheck.sh
new file mode 100644
index 0000000..4b67269
--- /dev/null
+++ b/web_healthcheck.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# Probe every URL in WEB_HEALTHCHECK_ENDPOINTS (comma-separated) and push one
+# web_endpoint_health gauge per endpoint (1 = healthy, 0 = unhealthy) to the
+# Prometheus Pushgateway at PUSHGATEWAY_URL. Probe and push failures are
+# logged; the script itself still exits 0.
+#
+# Environment:
+#   WEB_HEALTHCHECK_ENDPOINTS  comma-separated URLs; unset or empty skips the run
+#   PUSHGATEWAY_URL            Pushgateway base URL (default: http://localhost:9091)
+#   HEALTHCHECK_TIMEOUT        curl timeout in seconds for probes and pushes (default: 10)
+
+: "${PUSHGATEWAY_URL:=http://localhost:9091}"
+: "${HEALTHCHECK_TIMEOUT:=10}"
+
+function log {
+    echo "[$(date '+%F %T')] [healthcheck] $*"
+}
+
+if [ -z "${WEB_HEALTHCHECK_ENDPOINTS}" ]; then
+    log "WEB_HEALTHCHECK_ENDPOINTS not set, skipping health checks"
+    exit 0
+fi
+
+IFS=',' read -ra ENDPOINTS <<< "${WEB_HEALTHCHECK_ENDPOINTS}"
+
+if [ ${#ENDPOINTS[@]} -eq 0 ]; then
+    log "No endpoints configured"
+    exit 0
+fi
+
+log "Checking ${#ENDPOINTS[@]} endpoint(s)..."
+
+for endpoint in "${ENDPOINTS[@]}"; do
+    # Trim surrounding whitespace without xargs, which would also strip quotes
+    # and backslashes from the URL.
+    endpoint="${endpoint#"${endpoint%%[![:space:]]*}"}"
+    endpoint="${endpoint%"${endpoint##*[![:space:]]}"}"
+
+    # A trailing or doubled comma yields an empty entry; there is nothing to probe.
+    if [ -z "$endpoint" ]; then
+        continue
+    fi
+
+    endpoint_id=$(sed -e 's|https\?://||' -e 's|[^a-zA-Z0-9]|_|g' <<< "$endpoint")
+
+    log "Checking endpoint: $endpoint (id: $endpoint_id)"
+
+    http_code=$(curl -o /dev/null -s -w "%{http_code}" -m "$HEALTHCHECK_TIMEOUT" "$endpoint")
+    curl_exit_code=$?
+
+    if [ $curl_exit_code -eq 0 ] && [ "$http_code" = "200" ]; then
+        health_status=1
+        log "[OK] $endpoint - OK (200)"
+    else
+        health_status=0
+        if [ $curl_exit_code -ne 0 ]; then
+            log "[FAIL] $endpoint - FAILED (curl error: $curl_exit_code)"
+        else
+            log "[FAIL] $endpoint - FAILED (HTTP $http_code)"
+        fi
+    fi
+
+    metric_name="web_endpoint_health"
+
+    metrics_payload="# HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy)
+# TYPE ${metric_name} gauge
+${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code=\"${http_code}\"} ${health_status}
+"
+
+    push_url="${PUSHGATEWAY_URL}/metrics/job/web_healthcheck/instance/${endpoint_id}"
+
+    # -m bounds the push so an unreachable Pushgateway cannot stall the caller;
+    # -S keeps curl's error text on stderr (captured via 2>&1) for the log below.
+    push_response=$(curl -sS -m "$HEALTHCHECK_TIMEOUT" -w "\nHTTP_CODE:%{http_code}" -X POST -H "Content-Type: text/plain" --data-binary "$metrics_payload" "$push_url" 2>&1)
+    push_exit_code=$?
+    push_http_code=$(echo "$push_response" | grep "HTTP_CODE:" | cut -d: -f2)
+
+    # Pushgateway answers 200, or 202 when its consistency check is disabled.
+    if [ $push_exit_code -eq 0 ] && { [ "$push_http_code" = "200" ] || [ "$push_http_code" = "202" ]; }; then
+        log "Metrics pushed for $endpoint_id"
+    else
+        log "ERROR: Failed to push metrics for $endpoint_id (HTTP $push_http_code, exit code: $push_exit_code)"
+        error_message=$(echo "$push_response" | grep -v "HTTP_CODE:")
+        if [ -n "$error_message" ]; then
+            log "ERROR: $error_message"
+        fi
+    fi
+done
+
+log "Health check cycle complete"