From 2fd183a51e3a443893b3aabdd0c77e87cb5da86f Mon Sep 17 00:00:00 2001
From: "Brian L. Troutwine"
Date: Tue, 2 Dec 2025 15:33:21 -0800
Subject: [PATCH] Introduce a k8s-based lading example

This commit demonstrates how one might use lading in a k8s environment
to determine the memory bounds for a target container, in this case the
Datadog Agent. The example used here is harsh and comes from Datadog
Agent's own Regression Detector experiment `uds_dogstatsd_to_api`.

Run like so:

```
./k8s/experiment.sh --total-limit 1200 --agent-memory 700 --trace-memory 100 --sysprobe-memory 300 --process-memory 100 --duration 600 --tags "purpose:smp-experiment,agent-limit:2048"
```

This invocation demonstrates a memory allocation that works for the
Agent under these conditions; results:

```
========================================
RESULT: SUCCESS
========================================
No restarts detected
Test duration: 600 seconds
Tags: purpose:smp-experiment,agent-limit:2048
Container memory usage:
  agent: 640.67 MB / 700 MB (91.5%)
  trace-agent: 31.36 MB / 100 MB (31.4%)
  system-probe: 266.26 MB / 300 MB (88.8%)
  process-agent: 48.00 MB / 100 MB (48.0%)
  TOTAL: 986.29 MB / 1200 MB (82.2%)
```

Instructions for changing lading's configuration and Datadog Agent's own
configuration are in `k8s/README.md`.

Signed-off-by: Brian L. Troutwine
---
 k8s/README.md                     |  93 +++++++++
 k8s/analyze_memory.py             |  65 ++++++
 k8s/experiment.sh                 | 321 ++++++++++++++++++++++++++++++
 k8s/manifests/datadog-agent.yaml  |  84 ++++++++
 k8s/manifests/datadog-secret.yaml |   8 +
 k8s/manifests/deny-egress.yaml    |  22 ++
 k8s/manifests/lading-intake.yaml  |  67 +++++++
 k8s/manifests/lading.yaml         | 137 +++++++++++++
 8 files changed, 797 insertions(+)
 create mode 100644 k8s/README.md
 create mode 100755 k8s/analyze_memory.py
 create mode 100755 k8s/experiment.sh
 create mode 100644 k8s/manifests/datadog-agent.yaml
 create mode 100644 k8s/manifests/datadog-secret.yaml
 create mode 100644 k8s/manifests/deny-egress.yaml
 create mode 100644 k8s/manifests/lading-intake.yaml
 create mode 100644 k8s/manifests/lading.yaml

diff --git a/k8s/README.md b/k8s/README.md
new file mode 100644
index 000000000..eab015805
--- /dev/null
+++ b/k8s/README.md
@@ -0,0 +1,93 @@
+# Lading in k8s Demonstration
+
+A testing setup that demonstrates memory limits for the Datadog Agent under
+lading load.
+
+The experiment is driven by `experiment.sh`. That script takes a memory
+parameter for each configured Agent pod container and sets it as a limit in
+`manifests/datadog-agent.yaml`. The experiment runs for a given duration --
+300 seconds at a minimum is suggested -- and does two things:
+
+* watches for container restarts during the experiment, signaling failure if
+  one is detected, or
+* runs to the full experiment duration and queries Prometheus to calculate the
+  peak memory consumed by each Agent container, relative to its configured
+  limit.
+
+Experiments are **isolated from the internet** to avoid sending metrics and
+other telemetry to the actual Datadog intake. See `manifests/deny-egress.yaml`
+for details.
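+
+The peak-memory numbers come from a `max_over_time` query over
+`container_memory_working_set_bytes` -- the same query `analyze_memory.py`
+issues. A rough sketch of running it by hand against the in-cluster
+Prometheus (the script additionally filters by pod name):
+
+```bash
+# Port-forward the in-cluster Prometheus, then ask for the peak working-set
+# memory of the `agent` container over the last 600 seconds.
+kubectl port-forward -n monitoring svc/prometheus-server 9090:80 &
+curl -sG 'http://localhost:9090/api/v1/query' \
+  --data-urlencode 'query=max_over_time(container_memory_working_set_bytes{namespace="default",container="agent"}[600s])' | jq .
+```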
+
+## Prerequisites
+
+- kind: `brew install kind`
+- kubectl: `brew install kubectl`
+- helm: `brew install helm`
+- jq: `brew install jq`
+- python3: System Python 3
+- Docker running
+
+## Usage
+
+### Test a specific memory limit
+
+```bash
+# Test 2000 MB total for 5 minutes with explicit per-container limits
+./k8s/experiment.sh --total-limit 2000 --agent-memory 1200 --trace-memory 400 --sysprobe-memory 300 --process-memory 100 --tags "purpose:test,limit:2000mb"
+```
+
+All memory flags are mandatory and must sum to `--total-limit`, which serves
+as a consistency check.
+
+### Find a minimum memory limit
+
+Run the script multiple times with different limits. Possible results:
+
+- **OOMKilled** (FAILURE): the Agent needs more memory; the script exits
+- **Stable** (SUCCESS): the Agent survived the test duration; the cluster is
+  kept running for examination
+
+## Manifests
+
+All manifests are in the `manifests/` directory. The script uses template
+substitution for:
+
+- **manifests/datadog-agent.yaml**: DatadogAgent CRD for the Datadog Operator
+  - Uses `{{ AGENT_MEMORY_MB }}`, `{{ TRACE_MEMORY_MB }}`,
+    `{{ SYSPROBE_MEMORY_MB }}`, `{{ PROCESS_MEMORY_MB }}`, and `{{ DD_TAGS }}`
+    placeholders
+  - Configured for DogStatsD via Unix domain socket at `/var/run/datadog/dsd.socket`
+  - Shares `/var/run/datadog` via hostPath with the lading pod
+
+- **manifests/lading.yaml**: Lading load generator (lading 0.29.2)
+  - ConfigMap with the exact config from the `uds_dogstatsd_to_api` test
+  - Sends 100 MiB/s of DogStatsD metrics
+  - High cardinality: 1k-10k contexts, many tags
+  - Service with Prometheus scrape annotations for lading metrics
+
+- **manifests/lading-intake.yaml**: Lading intake (blackhole) mimicking the
+  Datadog API (lading 0.29.2)
+  - Receives and discards agent output for self-contained testing
+
+- **manifests/datadog-secret.yaml**: Placeholder secret (fake API key, not validated)
+- **manifests/deny-egress.yaml**: NetworkPolicy blocking internet egress (security isolation)
+
+## Test configuration
+
+Taken from
+[`datadog-agent/test/regression/cases/uds_dogstatsd_to_api`](https://github.com/DataDog/datadog-agent/blob/main/test/regression/cases/uds_dogstatsd_to_api/lading/lading.yaml). This
+experiment is **high stress** for metrics intake, and high memory use by the
+`agent` container is expected.
+
+Adjust lading's load generation configuration in the ConfigMap called
+`lading-config`. Adjust Agent configuration in `manifests/datadog-agent.yaml`.
+
+## Cleanup
+
+The cluster is left online after the script exits. Re-running `experiment.sh`
+will destroy and recreate it.
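+
+While the cluster is up, a few commands for poking at a finished run -- a
+sketch, assuming your current kubectl context still points at the kind
+cluster:
+
+```bash
+kubectl get pods                        # agent, lading, and lading-intake pods
+kubectl logs -l app=lading --tail=20    # lading load-generator output
+kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -c agent --tail=20
+```
+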
Manually clean up the cluster like so: + +```bash +kind delete cluster --name lading-test +``` + +## Notes + +- **Agent version**: 7.72.1 +- **Lading version**: 0.29.2 +- **Agent features enabled**: APM (trace-agent), Log Collection, NPM/system-probe, DogStatsD, Prometheus scrape diff --git a/k8s/analyze_memory.py b/k8s/analyze_memory.py new file mode 100755 index 000000000..492fca0c5 --- /dev/null +++ b/k8s/analyze_memory.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +import sys +import json +import urllib.request +import urllib.parse + +def query_container(prom_url, pod, container, duration): + query = f'max_over_time(container_memory_working_set_bytes{{namespace="default",pod="{pod}",container="{container}"}}[{duration}s])' + params = {'query': query} + url = f"{prom_url}?{urllib.parse.urlencode(params)}" + + try: + with urllib.request.urlopen(url, timeout=10) as response: + data = json.loads(response.read().decode()) + + if data['status'] == 'success' and data['data']['result']: + value_bytes = float(data['data']['result'][0]['value'][1]) + return data, value_bytes + return data, None + except Exception as e: + print(f"Error querying {container}: {e}", file=sys.stderr) + return None, None + +def main(): + if len(sys.argv) != 8: + print("Usage: analyze_memory.py ", file=sys.stderr) + sys.exit(1) + + prom_url = sys.argv[1] + pod = sys.argv[2] + duration = sys.argv[3] + agent_limit = int(sys.argv[4]) + trace_limit = int(sys.argv[5]) + sysprobe_limit = int(sys.argv[6]) + process_limit = int(sys.argv[7]) + total_limit = agent_limit + trace_limit + sysprobe_limit + process_limit + + containers = { + 'agent': agent_limit, + 'trace-agent': trace_limit, + 'system-probe': sysprobe_limit, + 'process-agent': process_limit + } + + results = {} + + for container, limit_mb in containers.items(): + data, value_bytes = query_container(prom_url, pod, container, duration) + + if value_bytes is not None: + value_mb = value_bytes / 1024 / 1024 + percent = (value_mb / limit_mb) * 100 + results[container] = (value_mb, limit_mb, percent) + print(f" {container}: {value_mb:.2f} MB / {limit_mb} MB ({percent:.1f}%)") + else: + print(f" {container}: Could not retrieve metrics") + results[container] = (0, limit_mb, 0) + + # Calculate total + total_mb = sum(r[0] for r in results.values()) + total_percent = (total_mb / total_limit) * 100 + print(f" TOTAL: {total_mb:.2f} MB / {total_limit} MB ({total_percent:.1f}%)") + +if __name__ == '__main__': + main() diff --git a/k8s/experiment.sh b/k8s/experiment.sh new file mode 100755 index 000000000..537d7052e --- /dev/null +++ b/k8s/experiment.sh @@ -0,0 +1,321 @@ +#!/bin/bash +set -e + +# Parse arguments +TOTAL_LIMIT="" +AGENT_MEMORY_MB="" +TRACE_MEMORY_MB="" +SYSPROBE_MEMORY_MB="" +PROCESS_MEMORY_MB="" +DURATION=300 +DD_TAGS_VALUE="" + +while [[ $# -gt 0 ]]; do + case $1 in + --total-limit) + TOTAL_LIMIT="$2" + shift 2 + ;; + --agent-memory) + AGENT_MEMORY_MB="$2" + shift 2 + ;; + --trace-memory) + TRACE_MEMORY_MB="$2" + shift 2 + ;; + --sysprobe-memory) + SYSPROBE_MEMORY_MB="$2" + shift 2 + ;; + --process-memory) + PROCESS_MEMORY_MB="$2" + shift 2 + ;; + --duration) + DURATION="$2" + shift 2 + ;; + --tags) + DD_TAGS_VALUE="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory --process-memory [--duration ] --tags " + exit 1 + ;; + esac +done + +if [ -z "$TOTAL_LIMIT" ]; then + echo "ERROR: --total-limit is required" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory 
--process-memory --tags " + exit 1 +fi + +if [ -z "$AGENT_MEMORY_MB" ]; then + echo "ERROR: --agent-memory is required" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory --process-memory --tags " + exit 1 +fi + +if [ -z "$TRACE_MEMORY_MB" ]; then + echo "ERROR: --trace-memory is required" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory --process-memory --tags " + exit 1 +fi + +if [ -z "$SYSPROBE_MEMORY_MB" ]; then + echo "ERROR: --sysprobe-memory is required" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory --process-memory --tags " + exit 1 +fi + +if [ -z "$PROCESS_MEMORY_MB" ]; then + echo "ERROR: --process-memory is required" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory --process-memory --tags " + exit 1 +fi + +if [ -z "$DD_TAGS_VALUE" ]; then + echo "ERROR: --tags is required" + echo "Usage: $0 --total-limit --agent-memory --trace-memory --sysprobe-memory --process-memory --tags " + exit 1 +fi + +# Verify individual limits sum to total. +CALCULATED_TOTAL=$((AGENT_MEMORY_MB + TRACE_MEMORY_MB + SYSPROBE_MEMORY_MB + PROCESS_MEMORY_MB)) +if [ "$CALCULATED_TOTAL" -ne "$TOTAL_LIMIT" ]; then + echo "ERROR: Individual memory limits do not sum to total limit" + echo "Total limit: ${TOTAL_LIMIT} MB" + echo "Sum of individual limits: ${CALCULATED_TOTAL} MB (agent=${AGENT_MEMORY_MB} + trace=${TRACE_MEMORY_MB} + sysprobe=${SYSPROBE_MEMORY_MB} + process=${PROCESS_MEMORY_MB})" + exit 1 +fi + +TOTAL_MEMORY_MB=$TOTAL_LIMIT + +echo "========================================" +echo "Datadog Agent Memory Limit Test" +echo "========================================" +echo "Memory limits per container:" +echo " agent: ${AGENT_MEMORY_MB} MB" +echo " trace-agent: ${TRACE_MEMORY_MB} MB" +echo " system-probe: ${SYSPROBE_MEMORY_MB} MB" +echo " process-agent: ${PROCESS_MEMORY_MB} MB" +echo " TOTAL: ${TOTAL_MEMORY_MB} MB" +echo "Test duration: ${DURATION} seconds" +echo "Tags: ${DD_TAGS_VALUE}" +echo "Started at: $(date)" +echo + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "[1/6] Checking prerequisites..." +command -v kind >/dev/null 2>&1 || { echo "ERROR: kind not found"; exit 1; } +command -v kubectl >/dev/null 2>&1 || { echo "ERROR: kubectl not found"; exit 1; } +command -v helm >/dev/null 2>&1 || { echo "ERROR: helm not found"; exit 1; } +command -v jq >/dev/null 2>&1 || { echo "ERROR: jq not found"; exit 1; } +command -v bc >/dev/null 2>&1 || { echo "ERROR: bc not found"; exit 1; } +echo " ✓ Prerequisites available" +echo + +echo "[2/6] Creating fresh cluster..." +if kind get clusters 2>/dev/null | grep -q "^lading-test$"; then + echo " Deleting existing cluster..." + kind delete cluster --name lading-test +fi +kind create cluster --name lading-test +echo " ✓ Cluster ready" +echo + +echo "[3/6] Installing Prometheus..." +kubectl create namespace monitoring 2>/dev/null || true +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true +helm repo update >/dev/null 2>&1 +helm install prometheus prometheus-community/prometheus \ + --namespace monitoring \ + --set server.service.type=ClusterIP \ + --set alertmanager.enabled=false \ + --set prometheus-pushgateway.enabled=false \ + --set kube-state-metrics.enabled=true >/dev/null 2>&1 +echo " ✓ Prometheus installed" +echo + +echo "[4/6] Installing Datadog Operator..." 
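+# The operator reconciles the DatadogAgent custom resource applied in step 5
+# (manifests/datadog-agent.yaml). If the wait below times out past the
+# fallback sleep, a manual check looks roughly like:
+#   kubectl get deployment datadog-operator
+#   kubectl logs deployment/datadog-operator --tail=50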
+helm repo add datadog https://helm.datadoghq.com >/dev/null 2>&1 || true +helm repo update >/dev/null 2>&1 +helm install datadog-operator datadog/datadog-operator --version 2.15.2 >/dev/null 2>&1 +echo " Waiting for operator..." +kubectl wait --for=condition=available --timeout=120s deployment/datadog-operator 2>/dev/null || sleep 30 +echo " ✓ Operator ready" +echo + +echo "[5/6] Applying manifests with ${TOTAL_MEMORY_MB} MB limit..." +kubectl apply -f "$SCRIPT_DIR/manifests/datadog-secret.yaml" +kubectl apply -f "$SCRIPT_DIR/manifests/deny-egress.yaml" +kubectl apply -f "$SCRIPT_DIR/manifests/lading-intake.yaml" + +AGENT_MANIFEST=$(cat "$SCRIPT_DIR/manifests/datadog-agent.yaml" | \ + sed "s/{{ AGENT_MEMORY_MB }}/${AGENT_MEMORY_MB}/g" | \ + sed "s/{{ TRACE_MEMORY_MB }}/${TRACE_MEMORY_MB}/g" | \ + sed "s/{{ SYSPROBE_MEMORY_MB }}/${SYSPROBE_MEMORY_MB}/g" | \ + sed "s/{{ PROCESS_MEMORY_MB }}/${PROCESS_MEMORY_MB}/g" | \ + sed "s|{{ DD_TAGS }}|${DD_TAGS_VALUE}|g") + +if echo "$AGENT_MANIFEST" | grep -q "{{ .*_MEMORY_MB }}"; then + echo " ✗ ERROR: Template substitution failed for memory placeholders" + exit 1 +fi +if echo "$AGENT_MANIFEST" | grep -q "{{ DD_TAGS }}"; then + echo " ✗ ERROR: Template substitution failed for DD_TAGS" + exit 1 +fi + +echo "$AGENT_MANIFEST" | kubectl apply -f - +echo " ✓ Agent deployed (egress blocked)" + +# We wait for agent pods to be ready and socket to be created before starting +# the lading load generator instance. +echo " Waiting for agent and DogStatsD socket..." +TIMEOUT=120 +ELAPSED=0 +while [ $ELAPSED -lt $TIMEOUT ]; do + AGENT_POD=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$AGENT_POD" ]; then + SOCKET_EXISTS=$(kubectl exec "$AGENT_POD" -c agent -- test -S /var/run/datadog/dsd.socket 2>/dev/null && echo "yes" || echo "no") + if [ "$SOCKET_EXISTS" = "yes" ]; then + echo " ✓ DogStatsD socket ready" + break + fi + fi + sleep 2 + ELAPSED=$((ELAPSED + 2)) +done + +if [ $ELAPSED -ge $TIMEOUT ]; then + echo " ✗ Timeout waiting for DogStatsD socket" + exit 1 +fi + +# Now deploy lading load generator instance. +kubectl apply -f "$SCRIPT_DIR/manifests/lading.yaml" +echo " ✓ Manifests applied" +echo + +echo " Waiting for lading health..." +TIMEOUT=60 +ELAPSED=0 +while [ $ELAPSED -lt $TIMEOUT ]; do + LADING_POD=$(kubectl get pods -l app=lading -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$LADING_POD" ]; then + HTTP_CODE=$(kubectl exec "$LADING_POD" -- wget -q -O- --timeout=2 http://localhost:9000/metrics 2>/dev/null | head -1 && echo "ok" || echo "fail") + if [ "$HTTP_CODE" = "ok" ]; then + echo " ✓ Lading Prometheus endpoint healthy" + break + fi + fi + sleep 2 + ELAPSED=$((ELAPSED + 2)) +done + +if [ $ELAPSED -ge $TIMEOUT ]; then + echo " ✗ Timeout waiting for lading health" + kubectl logs -l app=lading --tail=20 + exit 1 +fi + +# Check for any failed pods. No failures are expected. Failure signals invalid +# memory limits maybe but at this point more likely misconfiguration. +FAILED_PODS=$(kubectl get pods -o json | jq -r '.items[] | select(.status.phase == "Failed" or .status.phase == "Unknown" or .status.phase == "CrashLoopBackOff") | .metadata.name') +if [ -n "$FAILED_PODS" ]; then + echo " ✗ Found failed pods:" + kubectl get pods + exit 1 +fi +echo " ✓ All systems healthy" +echo + +# Monitor for restarts +echo "[6/6] Monitoring for restarts (${DURATION}s)..." 
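+# Any agent container restart counts as failure. An OOM kill surfaces as
+# restartCount > 0 with lastState.terminated.reason == "OOMKilled", which is
+# what the loop below reports. A quick manual spot-check of the same data:
+#   kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment \
+#     -o jsonpath='{range .items[*].status.containerStatuses[*]}{.name}={.restartCount} {end}'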
+echo " Started at: $(date)" +MONITOR_START_TIME=$(date +%s) +ELAPSED=0 +LAST_REPORT=0 + +while [ $ELAPSED -lt $DURATION ]; do + RESTART_DATA=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o json 2>/dev/null) + if [ $? -ne 0 ]; then + sleep 5 + ELAPSED=$((ELAPSED + 5)) + continue + fi + + RESTART_COUNT=$(echo "$RESTART_DATA" | jq '[.items[].status.containerStatuses[]?.restartCount // 0] | add' 2>/dev/null || echo 0) + if [ -z "$RESTART_COUNT" ] || [ "$RESTART_COUNT" = "null" ]; then + RESTART_COUNT=0 + fi + + if [ $((ELAPSED - LAST_REPORT)) -ge 30 ]; then + REMAINING=$((DURATION - ELAPSED)) + echo " ${ELAPSED}s elapsed, ${REMAINING}s remaining (restarts: ${RESTART_COUNT})" + LAST_REPORT=$ELAPSED + fi + + if [ "$RESTART_COUNT" -gt 0 ]; then + CONTAINER_NAME=$(echo "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .name' 2>/dev/null | head -1) + REASON=$(echo "$RESTART_DATA" | jq -r '.items[].status.containerStatuses[]? | select(.restartCount > 0) | .lastState.terminated.reason // "Unknown"' 2>/dev/null | head -1) + + echo + echo "========================================" + echo "RESULT: FAILURE" + echo "========================================" + echo "Container restarted: ${CONTAINER_NAME}" + echo "Restart count: ${RESTART_COUNT}" + echo "Reason: ${REASON}" + echo "Time to failure: ${ELAPSED}s" + echo + + if [ "$REASON" = "OOMKilled" ]; then + echo "💡 Container needs MORE memory" + else + echo "⚠️ Non-OOM restart:" + kubectl logs -l app.kubernetes.io/name=datadog-agent-deployment -c "${CONTAINER_NAME}" --previous --tail=20 + fi + echo "========================================" + exit 1 + fi + + sleep 5 + ELAPSED=$((ELAPSED + 5)) +done + +echo " Completed at: $(date)" +echo + +echo "========================================" +echo "RESULT: SUCCESS" +echo "========================================" +echo "No restarts detected" +echo "Test duration: ${DURATION} seconds" +echo "Tags: ${DD_TAGS_VALUE}" +echo + +# Query Prometheus for per-container memory usage +echo "Container memory usage:" +AGENT_POD=$(kubectl get pods -l app.kubernetes.io/name=datadog-agent-deployment -o jsonpath='{.items[0].metadata.name}') + +# Port-forward Prometheus to localhost +kubectl port-forward -n monitoring svc/prometheus-server 9090:80 >/dev/null 2>&1 & +PROM_PID=$! 
+sleep 3 + +# Run Python analysis script +python3 "$SCRIPT_DIR/analyze_memory.py" "http://localhost:9090/api/v1/query" "${AGENT_POD}" "${DURATION}" "${AGENT_MEMORY_MB}" "${TRACE_MEMORY_MB}" "${SYSPROBE_MEMORY_MB}" "${PROCESS_MEMORY_MB}" + +# Kill port-forward quietly +kill $PROM_PID >/dev/null 2>&1 +wait $PROM_PID 2>/dev/null +echo + +echo "💡 Agent stable - cluster is still running for examination" diff --git a/k8s/manifests/datadog-agent.yaml b/k8s/manifests/datadog-agent.yaml new file mode 100644 index 000000000..c78f494e6 --- /dev/null +++ b/k8s/manifests/datadog-agent.yaml @@ -0,0 +1,84 @@ +apiVersion: datadoghq.com/v2alpha1 +kind: DatadogAgent +metadata: + name: datadog + namespace: default +spec: + global: + clusterName: lading-test + site: datadoghq.com + credentials: + apiSecret: + secretName: datadog-secret + keyName: api-key + endpoint: + url: http://lading-intake:8080 + + features: + apm: + enabled: true + + logCollection: + enabled: true + + dogstatsd: + unixDomainSocketConfig: + enabled: true + path: /var/run/datadog/dsd.socket + + npm: + enabled: true + + prometheusScrape: + enabled: true + enableServiceEndpoints: true + + override: + clusterAgent: + containers: + cluster-agent: + livenessProbe: + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 10 + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 10 + nodeAgent: + image: + name: gcr.io/datadoghq/agent:7.72.1 + containers: + agent: + env: + - name: DD_HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: DD_TAGS + value: "{{ DD_TAGS }}" + resources: + limits: + memory: "{{ AGENT_MEMORY_MB }}Mi" + volumeMounts: + - name: dsdsocket + mountPath: /var/run/datadog + trace-agent: + resources: + limits: + memory: "{{ TRACE_MEMORY_MB }}Mi" + system-probe: + resources: + limits: + memory: "{{ SYSPROBE_MEMORY_MB }}Mi" + process-agent: + resources: + limits: + memory: "{{ PROCESS_MEMORY_MB }}Mi" + volumes: + - name: dsdsocket + hostPath: + path: /var/run/datadog + type: DirectoryOrCreate diff --git a/k8s/manifests/datadog-secret.yaml b/k8s/manifests/datadog-secret.yaml new file mode 100644 index 000000000..0a12fb4ea --- /dev/null +++ b/k8s/manifests/datadog-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: datadog-secret + namespace: default +type: Opaque +stringData: + api-key: "fake-api-key-for-testing" diff --git a/k8s/manifests/deny-egress.yaml b/k8s/manifests/deny-egress.yaml new file mode 100644 index 000000000..b21e1293a --- /dev/null +++ b/k8s/manifests/deny-egress.yaml @@ -0,0 +1,22 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: deny-internet-egress + namespace: default +spec: + podSelector: {} + policyTypes: + - Egress + egress: + # Allow DNS + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + ports: + - protocol: UDP + port: 53 + # Allow intra-cluster communication + - to: + - podSelector: {} + # Block everything else (internet egress blocked) diff --git a/k8s/manifests/lading-intake.yaml b/k8s/manifests/lading-intake.yaml new file mode 100644 index 000000000..10a086393 --- /dev/null +++ b/k8s/manifests/lading-intake.yaml @@ -0,0 +1,67 @@ +# Lading intake (blackhole) - mimics Datadog API to receive agent output +# +# This deployment acts as a fake Datadog backend for self-contained testing: +# - Accepts agent API v2 submissions at :8080 +# - Discards all received data (blackhole mode) +# - Allows testing without external 
Datadog connectivity +# - Used with network isolation (deny-egress.yaml) to ensure agent only talks to this intake +# - Infinite runtime: runs until manually stopped +# +# The agent is configured to send to http://lading-intake:8080 instead of Datadog. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: lading-intake-config + namespace: default +data: + lading.yaml: | + blackhole: + - datadog: + v2: + binding_addr: "0.0.0.0:8080" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: lading-intake + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: lading-intake + template: + metadata: + labels: + app: lading-intake + spec: + containers: + - name: lading + image: ghcr.io/datadog/lading:0.29.2 + args: + - "--config-path" + - "/etc/lading/lading.yaml" + - "--no-target" + - "--prometheus-addr" + - "0.0.0.0:9000" + - "--experiment-duration-infinite" + volumeMounts: + - name: config + mountPath: /etc/lading + volumes: + - name: config + configMap: + name: lading-intake-config +--- +apiVersion: v1 +kind: Service +metadata: + name: lading-intake + namespace: default +spec: + selector: + app: lading-intake + ports: + - port: 8080 + targetPort: 8080 diff --git a/k8s/manifests/lading.yaml b/k8s/manifests/lading.yaml new file mode 100644 index 000000000..b38ee1372 --- /dev/null +++ b/k8s/manifests/lading.yaml @@ -0,0 +1,137 @@ +# Lading load generator - sends 100 MiB/s of DogStatsD metrics to the Datadog agent +# +# This deployment generates synthetic load matching the uds_dogstatsd_to_api regression test: +# - High cardinality: 1,000-10,000 unique metric contexts +# - Heavy tagging: 2-50 tags per metric, 3-150 chars each +# - Unix domain socket: connects to agent at /var/run/datadog/dsd.socket +# - Deterministic: uses fixed seed for reproducible load patterns +# - Infinite runtime: runs until manually stopped +# +# The Service exposes Prometheus metrics at :9000/metrics for monitoring lading itself. 
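+#
+# Note: lading reaches the agent's DogStatsD socket through the hostPath
+# /var/run/datadog shared with the agent pod. experiment.sh waits for
+# dsd.socket to exist before applying this manifest; if applying by hand,
+# deploy the agent first and confirm the socket exists with something like
+# (the agent pod name below is a placeholder):
+#   kubectl exec <agent-pod> -c agent -- test -S /var/run/datadog/dsd.socket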
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: lading-config + namespace: default +data: + lading.yaml: | + # From datadog-agent test/regression/cases/uds_dogstatsd_to_api + generator: + - unix_datagram: + seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131] + path: "/var/run/datadog/dsd.socket" + variant: + dogstatsd: + contexts: + inclusive: + min: 1000 + max: 10000 + name_length: + inclusive: + min: 1 + max: 200 + tag_length: + inclusive: + min: 3 + max: 150 + tags_per_msg: + inclusive: + min: 2 + max: 50 + multivalue_count: + inclusive: + min: 2 + max: 32 + multivalue_pack_probability: 0.08 + kind_weights: + metric: 90 + event: 5 + service_check: 5 + metric_weights: + count: 100 + gauge: 10 + timer: 0 + distribution: 0 + set: 0 + histogram: 0 + bytes_per_second: "100 MiB" + maximum_prebuild_cache_size_bytes: "500 MiB" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: lading + namespace: default + labels: + app: lading +spec: + replicas: 1 + selector: + matchLabels: + app: lading + template: + metadata: + labels: + app: lading + spec: + containers: + - name: lading + image: ghcr.io/datadog/lading:0.29.2 + command: ["lading"] + args: + - "--config-path" + - "/etc/lading/lading.yaml" + - "--no-target" + - "--prometheus-addr" + - "0.0.0.0:9000" + - "--experiment-duration-infinite" + resources: + limits: + memory: "4Gi" + cpu: "2" + requests: + memory: "4Gi" + cpu: "2" + volumeMounts: + - name: config + mountPath: /etc/lading + readOnly: true + - name: dsdsocket + mountPath: /var/run/datadog + volumes: + - name: config + configMap: + name: lading-config + - name: dsdsocket + hostPath: + path: /var/run/datadog + type: DirectoryOrCreate +--- +apiVersion: v1 +kind: Service +metadata: + name: lading + namespace: default + annotations: + ad.datadoghq.com/service.checks: | + { + "openmetrics": { + "init_config": {}, + "instances": [ + { + "openmetrics_endpoint": "http://%%host%%:9000/metrics", + "namespace": "lading", + "metrics": [".*"] + } + ] + } + } +spec: + selector: + app: lading + ports: + - name: prometheus + port: 9000 + targetPort: 9000
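+# To change load intensity, adjust bytes_per_second (and, if needed,
+# maximum_prebuild_cache_size_bytes) in the lading-config ConfigMap above --
+# see the "Test configuration" section of k8s/README.md.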