From 7dfbab30fb000e23d524112f3fe8baa6defed5af Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Wed, 12 Nov 2025 03:33:56 +0800 Subject: [PATCH 1/8] Add web healthcheck --- newrunner.sh | 20 +++++++------- web_healthcheck.sh | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 web_healthcheck.sh diff --git a/newrunner.sh b/newrunner.sh index 887f611..75f510e 100644 --- a/newrunner.sh +++ b/newrunner.sh @@ -1,6 +1,6 @@ #!/bin/bash -: "${XDBG_LOOP_PAUSE:=300}" # default interval between restarts +: "${MONITOR_LOOP_PAUSE:=300}" # default interval between restarts function log { echo "[$(date '+%F %T')] $*" @@ -19,23 +19,25 @@ log "WORKSPACE='${WORKSPACE:-}' -> backend='${BACKEND}'" while true; do log "Reset environment.." - XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" --clear - XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 5 --concurrency 1 - XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1 + MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" --clear + MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 5 --concurrency 1 + MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1 log "Reset complete, starting tests" for x in {1..10}; { log "Identities..." - XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 1 --concurrency 1 + MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 1 --concurrency 1 log "Sleeping 20s..." sleep 20 log "Groups..." - XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1 + MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1 log "Sleeping 20s..." sleep 20 log "Messages..." - XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity message --amount 1 --concurrency 1 - log "Sleeping $XDBG_LOOP_PAUSE seconds..." - sleep $XDBG_LOOP_PAUSE + MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity message --amount 1 --concurrency 1 + log "Running health checks..." + bash "$(dirname "$0")/web_healthcheck.sh" + log "Sleeping $MONITOR_LOOP_PAUSE seconds..." + sleep $MONITOR_LOOP_PAUSE } done \ No newline at end of file diff --git a/web_healthcheck.sh b/web_healthcheck.sh new file mode 100644 index 0000000..c5e6fca --- /dev/null +++ b/web_healthcheck.sh @@ -0,0 +1,67 @@ +#!/bin/bash + + +: "${PUSHGATEWAY_URL:=http://localhost:9091}" +: "${HEALTHCHECK_TIMEOUT:=10}" + +function log { + echo "[$(date '+%F %T')] [healthcheck] $*" +} + +if [ -z "${WEB_HEALTHCHECK_ENDPOINTS}" ]; then + log "WEB_HEALTHCHECK_ENDPOINTS not set, skipping health checks" + exit 0 +fi + +IFS=',' read -ra ENDPOINTS <<< "${WEB_HEALTHCHECK_ENDPOINTS}" + +if [ ${#ENDPOINTS[@]} -eq 0 ]; then + log "No endpoints configured" + exit 0 +fi + +log "Checking ${#ENDPOINTS[@]} endpoint(s)..." + +for endpoint in "${ENDPOINTS[@]}"; do + endpoint=$(echo "$endpoint" | xargs) + + endpoint_id=$(echo "$endpoint" | sed 's|https\?://||' | sed 's|[^a-zA-Z0-9]|_|g') + + log "Checking endpoint: $endpoint (id: $endpoint_id)" + + http_code=$(curl -o /dev/null -s -w "%{http_code}" -m "$HEALTHCHECK_TIMEOUT" "$endpoint") + curl_exit_code=$? + + if [ $curl_exit_code -eq 0 ] && [ "$http_code" = "200" ]; then + health_status=1 + log "[OK] $endpoint - OK (200)" + else + health_status=0 + if [ $curl_exit_code -ne 0 ]; then + log "[FAIL] $endpoint - FAILED (curl error: $curl_exit_code)" + else + log "[FAIL] $endpoint - FAILED (HTTP $http_code)" + fi + fi + + metric_name="web_endpoint_health" + timestamp=$(date +%s) + + metrics_payload=$(cat < /dev/null 2>&1; then + log "Metrics pushed for $endpoint_id" + else + log "WARNING: Failed to push metrics for $endpoint_id" + fi +done + +log "Health check cycle complete" + From 29e38c3af4aadb079bd915751571a2e7637bf7cb Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Fri, 14 Nov 2025 04:51:36 +0800 Subject: [PATCH 2/8] Copy the healthcheck script properly --- docker/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 2127e02..2c9dbf7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,11 +18,12 @@ FROM debian:trixie-slim WORKDIR /work RUN apt-get update \ - && apt-get install -y --no-install-recommends tini bash ca-certificates procps libssl3 \ + && apt-get install -y --no-install-recommends tini bash ca-certificates procps libssl3 curl \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /src/target/release/xdbg /usr/local/bin/xdbg COPY xmtp_debug/newrunner.sh /usr/local/bin/newrunner.sh -RUN chmod +x /usr/local/bin/xdbg /usr/local/bin/newrunner.sh +COPY xmtp_debug/web_healthcheck.sh /usr/local/bin/web_healthcheck.sh +RUN chmod +x /usr/local/bin/xdbg /usr/local/bin/newrunner.sh /usr/local/bin/web_healthcheck.sh ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/newrunner.sh"] \ No newline at end of file From 999f041930067563aac73a57f2b3ef79e03679d4 Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Fri, 14 Nov 2025 19:45:10 +0800 Subject: [PATCH 3/8] Debug - why isn't the metric showing up in pgw?? --- web_healthcheck.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/web_healthcheck.sh b/web_healthcheck.sh index c5e6fca..284c0d2 100644 --- a/web_healthcheck.sh +++ b/web_healthcheck.sh @@ -58,6 +58,20 @@ EOF if curl -s -X POST -H "Content-Type: text/plain" --data-binary "$metrics_payload" "$push_url" > /dev/null 2>&1; then log "Metrics pushed for $endpoint_id" + + # VERIFY: Read back the metric we just pushed + log "VERIFY: Reading back metrics from Pushgateway..." + verification=$(curl -s "${PUSHGATEWAY_URL}/metrics" | grep "web_endpoint_health.*${endpoint_id}") + if [ -n "$verification" ]; then + log "VERIFY SUCCESS: Metric found in Pushgateway:" + log "$verification" + else + log "VERIFY FAILED: Metric NOT found in Pushgateway!" + log "VERIFY: Checking all web_endpoint_health metrics:" + curl -s "${PUSHGATEWAY_URL}/metrics" | grep "web_endpoint_health" | while read -r line; do + log " $line" + done + fi else log "WARNING: Failed to push metrics for $endpoint_id" fi From 4f4932f0265f6e77f9d2940a4c9a55a314cd0603 Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Fri, 14 Nov 2025 20:26:12 +0800 Subject: [PATCH 4/8] More debug --- web_healthcheck.sh | 50 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/web_healthcheck.sh b/web_healthcheck.sh index 284c0d2..4cafef6 100644 --- a/web_healthcheck.sh +++ b/web_healthcheck.sh @@ -56,24 +56,64 @@ EOF push_url="${PUSHGATEWAY_URL}/metrics/job/web_healthcheck/instance/${endpoint_id}" - if curl -s -X POST -H "Content-Type: text/plain" --data-binary "$metrics_payload" "$push_url" > /dev/null 2>&1; then - log "Metrics pushed for $endpoint_id" + log "DEBUG: About to push to Pushgateway" + log "DEBUG: Push URL: $push_url" + log "DEBUG: Payload to push:" + log "$metrics_payload" + + # Test if we can reach Pushgateway first + log "DEBUG: Testing Pushgateway connectivity..." + if curl -s -f "${PUSHGATEWAY_URL}/metrics" > /dev/null 2>&1; then + log "DEBUG: Pushgateway is reachable at ${PUSHGATEWAY_URL}" + else + log "DEBUG: WARNING - Cannot reach Pushgateway at ${PUSHGATEWAY_URL}" + fi + + # Push with full response capture + push_response=$(curl -s -w "\nHTTP_CODE:%{http_code}" -X POST -H "Content-Type: text/plain" --data-binary "$metrics_payload" "$push_url" 2>&1) + push_exit_code=$? + push_http_code=$(echo "$push_response" | grep "HTTP_CODE:" | cut -d: -f2) + + log "DEBUG: Push exit code: $push_exit_code" + log "DEBUG: Push HTTP code: $push_http_code" + if [ -n "$push_response" ]; then + log "DEBUG: Push response: $push_response" + fi + + if [ $push_exit_code -eq 0 ]; then + log "Metrics pushed for $endpoint_id (HTTP $push_http_code)" + + # Wait a moment for Pushgateway to process + sleep 1 # VERIFY: Read back the metric we just pushed log "VERIFY: Reading back metrics from Pushgateway..." - verification=$(curl -s "${PUSHGATEWAY_URL}/metrics" | grep "web_endpoint_health.*${endpoint_id}") + log "VERIFY: Fetching all metrics from ${PUSHGATEWAY_URL}/metrics" + + all_metrics=$(curl -s "${PUSHGATEWAY_URL}/metrics") + verification=$(echo "$all_metrics" | grep "web_endpoint_health.*${endpoint_id}") + if [ -n "$verification" ]; then log "VERIFY SUCCESS: Metric found in Pushgateway:" log "$verification" else log "VERIFY FAILED: Metric NOT found in Pushgateway!" log "VERIFY: Checking all web_endpoint_health metrics:" - curl -s "${PUSHGATEWAY_URL}/metrics" | grep "web_endpoint_health" | while read -r line; do + web_health_metrics=$(echo "$all_metrics" | grep "web_endpoint_health") + if [ -n "$web_health_metrics" ]; then + echo "$web_health_metrics" | while read -r line; do + log " $line" + done + else + log " No web_endpoint_health metrics found at all!" + fi + log "VERIFY: First 50 lines of all Pushgateway metrics:" + echo "$all_metrics" | head -50 | while read -r line; do log " $line" done fi else - log "WARNING: Failed to push metrics for $endpoint_id" + log "WARNING: Failed to push metrics for $endpoint_id (exit code: $push_exit_code)" fi done From da82355f31dc56727398fadcd96bf5d06ca372a1 Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Fri, 14 Nov 2025 21:18:11 +0800 Subject: [PATCH 5/8] Try this!? --- web_healthcheck.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/web_healthcheck.sh b/web_healthcheck.sh index 4cafef6..75f9eb5 100644 --- a/web_healthcheck.sh +++ b/web_healthcheck.sh @@ -51,6 +51,7 @@ for endpoint in "${ENDPOINTS[@]}"; do # HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy) # TYPE ${metric_name} gauge ${metric_name}{endpoint="${endpoint}",endpoint_id="${endpoint_id}",http_code="${http_code}"} ${health_status} ${timestamp}000 + EOF ) From b0bb3aba059ec3e320e06192e54db4a1908fc083 Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Fri, 14 Nov 2025 23:05:30 +0800 Subject: [PATCH 6/8] More fixes --- web_healthcheck.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/web_healthcheck.sh b/web_healthcheck.sh index 75f9eb5..c55f327 100644 --- a/web_healthcheck.sh +++ b/web_healthcheck.sh @@ -45,15 +45,13 @@ for endpoint in "${ENDPOINTS[@]}"; do fi metric_name="web_endpoint_health" - timestamp=$(date +%s) + timestamp=$(date +%s)000 - metrics_payload=$(cat < Date: Fri, 14 Nov 2025 23:25:52 +0800 Subject: [PATCH 7/8] Remove timestamp --- web_healthcheck.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/web_healthcheck.sh b/web_healthcheck.sh index c55f327..2545211 100644 --- a/web_healthcheck.sh +++ b/web_healthcheck.sh @@ -45,12 +45,11 @@ for endpoint in "${ENDPOINTS[@]}"; do fi metric_name="web_endpoint_health" - timestamp=$(date +%s)000 - # Construct payload with explicit newlines + # Construct payload with explicit newlines (NO timestamp - Pushgateway adds its own) metrics_payload="# HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy) # TYPE ${metric_name} gauge -${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code=\"${http_code}\"} ${health_status} ${timestamp} +${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code=\"${http_code}\"} ${health_status} " push_url="${PUSHGATEWAY_URL}/metrics/job/web_healthcheck/instance/${endpoint_id}" From f89c4e36e0d85974f26079b976db1fa509344cf9 Mon Sep 17 00:00:00 2001 From: Isaac Hartford Date: Sat, 15 Nov 2025 01:09:53 +0800 Subject: [PATCH 8/8] Working - cleanup for CR. Hopefully this doesnt break it --- web_healthcheck.sh | 61 ++++++---------------------------------------- 1 file changed, 7 insertions(+), 54 deletions(-) diff --git a/web_healthcheck.sh b/web_healthcheck.sh index 2545211..4b67269 100644 --- a/web_healthcheck.sh +++ b/web_healthcheck.sh @@ -46,7 +46,6 @@ for endpoint in "${ENDPOINTS[@]}"; do metric_name="web_endpoint_health" - # Construct payload with explicit newlines (NO timestamp - Pushgateway adds its own) metrics_payload="# HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy) # TYPE ${metric_name} gauge ${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code=\"${http_code}\"} ${health_status} @@ -54,64 +53,18 @@ ${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code push_url="${PUSHGATEWAY_URL}/metrics/job/web_healthcheck/instance/${endpoint_id}" - log "DEBUG: About to push to Pushgateway" - log "DEBUG: Push URL: $push_url" - log "DEBUG: Payload to push:" - log "$metrics_payload" - - # Test if we can reach Pushgateway first - log "DEBUG: Testing Pushgateway connectivity..." - if curl -s -f "${PUSHGATEWAY_URL}/metrics" > /dev/null 2>&1; then - log "DEBUG: Pushgateway is reachable at ${PUSHGATEWAY_URL}" - else - log "DEBUG: WARNING - Cannot reach Pushgateway at ${PUSHGATEWAY_URL}" - fi - - # Push with full response capture push_response=$(curl -s -w "\nHTTP_CODE:%{http_code}" -X POST -H "Content-Type: text/plain" --data-binary "$metrics_payload" "$push_url" 2>&1) push_exit_code=$? push_http_code=$(echo "$push_response" | grep "HTTP_CODE:" | cut -d: -f2) - log "DEBUG: Push exit code: $push_exit_code" - log "DEBUG: Push HTTP code: $push_http_code" - if [ -n "$push_response" ]; then - log "DEBUG: Push response: $push_response" - fi - - if [ $push_exit_code -eq 0 ]; then - log "Metrics pushed for $endpoint_id (HTTP $push_http_code)" - - # Wait a moment for Pushgateway to process - sleep 1 - - # VERIFY: Read back the metric we just pushed - log "VERIFY: Reading back metrics from Pushgateway..." - log "VERIFY: Fetching all metrics from ${PUSHGATEWAY_URL}/metrics" - - all_metrics=$(curl -s "${PUSHGATEWAY_URL}/metrics") - verification=$(echo "$all_metrics" | grep "web_endpoint_health.*${endpoint_id}") - - if [ -n "$verification" ]; then - log "VERIFY SUCCESS: Metric found in Pushgateway:" - log "$verification" - else - log "VERIFY FAILED: Metric NOT found in Pushgateway!" - log "VERIFY: Checking all web_endpoint_health metrics:" - web_health_metrics=$(echo "$all_metrics" | grep "web_endpoint_health") - if [ -n "$web_health_metrics" ]; then - echo "$web_health_metrics" | while read -r line; do - log " $line" - done - else - log " No web_endpoint_health metrics found at all!" - fi - log "VERIFY: First 50 lines of all Pushgateway metrics:" - echo "$all_metrics" | head -50 | while read -r line; do - log " $line" - done - fi + if [ $push_exit_code -eq 0 ] && [ "$push_http_code" = "200" ]; then + log "Metrics pushed for $endpoint_id" else - log "WARNING: Failed to push metrics for $endpoint_id (exit code: $push_exit_code)" + log "ERROR: Failed to push metrics for $endpoint_id (HTTP $push_http_code, exit code: $push_exit_code)" + error_message=$(echo "$push_response" | grep -v "HTTP_CODE:") + if [ -n "$error_message" ]; then + log "ERROR: $error_message" + fi fi done