Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ FROM debian:trixie-slim
WORKDIR /work

RUN apt-get update \
&& apt-get install -y --no-install-recommends tini bash ca-certificates procps libssl3 \
&& apt-get install -y --no-install-recommends tini bash ca-certificates procps libssl3 curl \
&& rm -rf /var/lib/apt/lists/*

COPY --from=builder /src/target/release/xdbg /usr/local/bin/xdbg
COPY xmtp_debug/newrunner.sh /usr/local/bin/newrunner.sh
RUN chmod +x /usr/local/bin/xdbg /usr/local/bin/newrunner.sh
COPY xmtp_debug/web_healthcheck.sh /usr/local/bin/web_healthcheck.sh
RUN chmod +x /usr/local/bin/xdbg /usr/local/bin/newrunner.sh /usr/local/bin/web_healthcheck.sh

ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/newrunner.sh"]
20 changes: 11 additions & 9 deletions newrunner.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

: "${XDBG_LOOP_PAUSE:=300}" # default interval between restarts
: "${MONITOR_LOOP_PAUSE:=300}" # default interval between restarts

function log {
echo "[$(date '+%F %T')] $*"
Expand All @@ -19,23 +19,25 @@ log "WORKSPACE='${WORKSPACE:-<unset>}' -> backend='${BACKEND}'"

# Main loop: runs until the process is killed. Each pass clears the xdbg
# state, generates 5 identities and 1 group, then performs ten test rounds.
# NOTE(review): most xdbg invocations below appear twice, once prefixed with
# XDBG_LOOP_PAUSE=0 and once with MONITOR_LOOP_PAUSE=0. This looks like the old
# and new sides of a diff rendered together -- confirm which set is intended.
while true; do
log "Reset environment.."
# The VAR=0 prefix is placed only in the environment of that one xdbg
# invocation; the shell's own value of the variable is left unchanged.
XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" --clear
XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 5 --concurrency 1
XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" --clear
MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 5 --concurrency 1
MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
log "Reset complete, starting tests"

# Ten rounds per reset. The loop body is delimited by { } instead of do/done,
# and x serves only as a counter (it is never read inside the body).
for x in {1..10}; {
log "Identities..."
XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 1 --concurrency 1
MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity identity --amount 1 --concurrency 1
log "Sleeping 20s..."
sleep 20
log "Groups..."
XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity group --amount 1 --concurrency 1 --invite 1
log "Sleeping 20s..."
sleep 20
log "Messages..."
XDBG_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity message --amount 1 --concurrency 1
# Pause between rounds using the shell-level value of the variable, not the
# 0 that was passed to the individual xdbg commands above.
log "Sleeping $XDBG_LOOP_PAUSE seconds..."
sleep $XDBG_LOOP_PAUSE
MONITOR_LOOP_PAUSE=0 xdbg -d -b "${BACKEND}" generate --entity message --amount 1 --concurrency 1
log "Running health checks..."
# Runs web_healthcheck.sh from the same directory as this script in a
# separate bash process; its exit status is not inspected here.
bash "$(dirname "$0")/web_healthcheck.sh"
log "Sleeping $MONITOR_LOOP_PAUSE seconds..."
sleep $MONITOR_LOOP_PAUSE
}
done
72 changes: 72 additions & 0 deletions web_healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash


# Defaults for the Pushgateway base URL and the per-request curl timeout in
# seconds. Either may be overridden from the environment.
: "${PUSHGATEWAY_URL:=http://localhost:9091}"
: "${HEALTHCHECK_TIMEOUT:=10}"

#######################################
# Reset HEALTHCHECK_TIMEOUT to 10 when it is not a plain non-negative decimal
# number. The value is handed to `curl -m`; a non-numeric value makes curl
# fail with a usage error, which would mark every endpoint as unhealthy.
# Globals:   HEALTHCHECK_TIMEOUT (read, possibly rewritten)
# Outputs:   a warning on stderr when the value is replaced
# Returns:   0 always
#######################################
_validate_healthcheck_timeout() {
  # Accepts "10", "2.5", "5." and ".5"; rejects signs, units and other text.
  local -r numeric_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
  if ! [[ "${HEALTHCHECK_TIMEOUT}" =~ ${numeric_re} ]]; then
    # Written with printf because no logging helper exists yet at this point.
    printf '[%s] [healthcheck] Invalid HEALTHCHECK_TIMEOUT: %s; using 10\n' \
      "$(date '+%F %T')" "${HEALTHCHECK_TIMEOUT}" >&2
    HEALTHCHECK_TIMEOUT=10
  fi
  return 0
}
_validate_healthcheck_timeout

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

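HEALTHCHECK_TIMEOUT isn’t validated. If it’s non‑numeric or out of range, curl -m fails and health is misreported. Consider validating it as a positive integer within a sane range and defaulting when invalid. Note that the snippet below calls log, which the script only defines further down, so the check has to be placed after that definition (or report the problem without log).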

+if ! [[ "$HEALTHCHECK_TIMEOUT" =~ ^[0-9]+$ ]] || [ "$HEALTHCHECK_TIMEOUT" -le 0 ] || [ "$HEALTHCHECK_TIMEOUT" -gt 300 ]; then
+    log "Invalid HEALTHCHECK_TIMEOUT: $HEALTHCHECK_TIMEOUT; using 10"
+    HEALTHCHECK_TIMEOUT=10
+fi

🚀 Reply to ask Macroscope to explain or update this suggestion.

👍 Helpful? React to give us feedback.

# Write one timestamped line tagged "[healthcheck]" to stdout.
# Arguments: the message words; they are joined via "$*".
log() {
  printf '%s\n' "[$(date '+%F %T')] [healthcheck] $*"
}

# Without a comma-separated endpoint list there is nothing to probe; leave
# with a success status so the caller is not disturbed.
[[ -n "${WEB_HEALTHCHECK_ENDPOINTS}" ]] || {
  log "WEB_HEALTHCHECK_ENDPOINTS not set, skipping health checks"
  exit 0
}

# Split the list on commas into the ENDPOINTS array. The IFS override applies
# to this read only.
IFS=',' read -r -a ENDPOINTS <<< "${WEB_HEALTHCHECK_ENDPOINTS}"

# Same early exit when the split produced no entries.
if (( ${#ENDPOINTS[@]} == 0 )); then
  log "No endpoints configured"
  exit 0
fi

log "Checking ${#ENDPOINTS[@]} endpoint(s)..."

for endpoint in "${ENDPOINTS[@]}"; do
endpoint=$(echo "$endpoint" | xargs)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty endpoints aren’t skipped after trim. This can call curl with no URL and create a blank endpoint_id (e.g., .../instance/). Consider continuing the loop when endpoint is empty.

Suggested change
endpoint=$(echo "$endpoint" | xargs)
endpoint=$(echo "$endpoint" | xargs)
if [ -z "$endpoint" ]; then
continue
fi

🚀 Reply to ask Macroscope to explain or update this suggestion.

👍 Helpful? React to give us feedback.


endpoint_id=$(echo "$endpoint" | sed 's|https\?://||' | sed 's|[^a-zA-Z0-9]|_|g')

log "Checking endpoint: $endpoint (id: $endpoint_id)"

http_code=$(curl -o /dev/null -s -w "%{http_code}" -m "$HEALTHCHECK_TIMEOUT" "$endpoint")
curl_exit_code=$?

if [ $curl_exit_code -eq 0 ] && [ "$http_code" = "200" ]; then
health_status=1
log "[OK] $endpoint - OK (200)"
else
health_status=0
if [ $curl_exit_code -ne 0 ]; then
log "[FAIL] $endpoint - FAILED (curl error: $curl_exit_code)"
else
log "[FAIL] $endpoint - FAILED (HTTP $http_code)"
fi
fi

metric_name="web_endpoint_health"

metrics_payload="# HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy)
# TYPE ${metric_name} gauge
${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code=\"${http_code}\"} ${health_status}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Label values in metrics_payload are not escaped, so an endpoint containing \, " or newlines can produce invalid Prometheus exposition and be rejected by Pushgateway. Consider escaping backslashes, double-quotes and newlines for endpoint (and endpoint_id) before building the metric label string.

-    metrics_payload="# HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy)
-# TYPE ${metric_name} gauge
-${metric_name}{endpoint=\"${endpoint}\",endpoint_id=\"${endpoint_id}\",http_code=\"${http_code}\"} ${health_status}
-"
+    # escape label values per Prometheus exposition format
+    escaped_endpoint=$(printf '%s' "$endpoint" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e ':a;N;s/\n/\\n/;ta')
+    escaped_endpoint_id=$(printf '%s' "$endpoint_id" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e ':a;N;s/\n/\\n/;ta')
+    metrics_payload="# HELP ${metric_name} Health status of web endpoints (1 = healthy, 0 = unhealthy)
+# TYPE ${metric_name} gauge
+${metric_name}{endpoint=\"${escaped_endpoint}\",endpoint_id=\"${escaped_endpoint_id}\",http_code=\"${http_code}\"} ${health_status}
+"

🚀 Reply to ask Macroscope to explain or update this suggestion.

👍 Helpful? React to give us feedback.

"

push_url="${PUSHGATEWAY_URL}/metrics/job/web_healthcheck/instance/${endpoint_id}"

push_response=$(curl -s -w "\nHTTP_CODE:%{http_code}" -X POST -H "Content-Type: text/plain" --data-binary "$metrics_payload" "$push_url" 2>&1)
push_exit_code=$?
push_http_code=$(echo "$push_response" | grep "HTTP_CODE:" | cut -d: -f2)

if [ $push_exit_code -eq 0 ] && [ "$push_http_code" = "200" ]; then
log "Metrics pushed for $endpoint_id"
else
log "ERROR: Failed to push metrics for $endpoint_id (HTTP $push_http_code, exit code: $push_exit_code)"
error_message=$(echo "$push_response" | grep -v "HTTP_CODE:")
if [ -n "$error_message" ]; then
log "ERROR: $error_message"
fi
fi
done

log "Health check cycle complete"