From 695abb7ef8e263642bbde1a6495b387e76de60df Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Fri, 24 Oct 2025 12:00:49 +0200 Subject: [PATCH 1/3] feat: add Knative integration for notifications - Add CloudEvents sink deployment for eoapi-notifier integration - Configure dynamic secret name for PostgreSQL connection - Add local development configuration with reduced resources - Support both CI and local test environments --- .github/workflows/helm-tests.yml | 210 ++++++++++++++------- .gitignore | 2 + CHANGELOG.md | 2 + charts/eoapi/test-k3s-unittest-values.yaml | 2 +- charts/eoapi/test-local-values.yaml | 109 +++++++++++ scripts/deploy.sh | 87 ++++++++- scripts/test.sh | 6 +- 7 files changed, 340 insertions(+), 78 deletions(-) create mode 100644 charts/eoapi/test-local-values.yaml diff --git a/.github/workflows/helm-tests.yml b/.github/workflows/helm-tests.yml index 2af833fc..e18fef38 100644 --- a/.github/workflows/helm-tests.yml +++ b/.github/workflows/helm-tests.yml @@ -25,8 +25,8 @@ jobs: - name: Run Helm unit tests run: make tests - integration: - name: Integration Tests (K3s) + k3s-integration-tests: + name: K3s Integration Tests if: github.event.pull_request.head.repo.full_name == github.repository permissions: contents: 'read' @@ -47,106 +47,170 @@ jobs: - name: Set release name run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV" - - name: Deploy eoAPI - id: deploy - continue-on-error: true + - name: Wait for K3s to be fully ready run: | - echo "=== Starting eoAPI deployment ===" - export RELEASE_NAME="$RELEASE_NAME" - export PGO_VERSION="${{ env.PGO_VERSION }}" - export GITHUB_SHA="${{ github.sha }}" - ./scripts/deploy.sh --ci + echo "=== Waiting for K3s to be fully ready ===" - - name: Check deployment status - id: check - if: steps.deploy.outcome == 'success' - run: | - echo "=== Checking deployment status ===" - export RELEASE_NAME="$RELEASE_NAME" - ./scripts/test.sh check-deployment --debug + # Wait for core K3s 
components to be ready + echo "Waiting for kube-system pods to be ready..." + kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s - - name: Debug pgstac jobs if deployment failed - if: steps.deploy.outcome == 'failure' - continue-on-error: true - run: | - echo "=== Debugging pgstac job failures ===" - - # Check pgstac-migrate job - echo "===== pgstac-migrate Job Status =====" - kubectl get jobs -l app.kubernetes.io/name=pgstac-migrate -o wide || echo "No pgstac-migrate jobs found" - - MIGRATE_PODS=$(kubectl get pods -l app.kubernetes.io/name=pgstac-migrate -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) - if [ -n "$MIGRATE_PODS" ]; then - for POD in $MIGRATE_PODS; do - echo "--- Logs from migrate pod $POD ---" - kubectl logs "$POD" --tail=100 || true - echo "--- Description of migrate pod $POD ---" - kubectl describe pod "$POD" - done - fi - - # Check pgstac-load-samples job - echo "===== pgstac-load-samples Job Status =====" - kubectl get jobs -l app.kubernetes.io/name=pgstac-load-samples -o wide || echo "No pgstac-load-samples jobs found" - - SAMPLES_PODS=$(kubectl get pods -l app.kubernetes.io/name=pgstac-load-samples -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) - if [ -n "$SAMPLES_PODS" ]; then - for POD in $SAMPLES_PODS; do - echo "--- Logs from samples pod $POD ---" - kubectl logs "$POD" --tail=100 || true - echo "--- Description of samples pod $POD ---" - kubectl describe pod "$POD" - done - fi + # Wait for API server to be fully responsive + echo "Checking API server responsiveness..." 
+ kubectl get nodes + kubectl get pods --all-namespaces - # Check database status - echo "===== Database Pod Status =====" - kubectl get pods -l postgres-operator.crunchydata.com/cluster -o wide - kubectl get postgrescluster -o wide + # Give K3s a moment to initialize all CRDs + echo "Waiting for K3s initialization to complete..." + sleep 10 - # Check ConfigMaps - echo "===== Relevant ConfigMaps =====" - kubectl get configmaps | grep -E "initdb|pgstac" || echo "No relevant configmaps found" + echo "✅ K3s is ready" - # Check for any related events - echo "===== Related Kubernetes Events =====" - kubectl get events | grep -E "pgstac|initdb" || echo "No relevant events found" + - name: Install Knative Serving + run: | + echo "=== Installing Knative Serving ===" + # Install Knative Serving CRDs + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-crds.yaml + # Install Knative Serving core components + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-core.yaml + # Install Kourier networking layer for Knative + kubectl apply -f https://github.com/knative/net-kourier/releases/download/knative-v1.17.0/kourier.yaml + # Configure Knative to use Kourier + kubectl patch configmap/config-network \ + --namespace knative-serving \ + --type merge \ + --patch '{"data":{"ingress-class":"kourier.ingress.networking.knative.dev"}}' + # Wait for Knative Serving to be ready + echo "Waiting for Knative Serving to be ready..." 
+ kubectl wait --for=condition=Ready pod -l app=controller -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=webhook -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=3scale-kourier-gateway -n kourier-system --timeout=300s + + - name: Install Knative Eventing + run: | + echo "=== Installing Knative Eventing ===" + # Install Knative Eventing CRDs (includes SinkBinding) + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-crds.yaml + # Install Knative Eventing core components + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-core.yaml + # Wait for Knative Eventing to be ready + echo "Waiting for Knative Eventing to be ready..." + kubectl wait --for=condition=Ready pod -l app=eventing-controller -n knative-eventing --timeout=300s + kubectl wait --for=condition=Ready pod -l app=eventing-webhook -n knative-eventing --timeout=300s + + - name: Deploy CloudEvents sink for eoapi-notifier + run: | + echo "=== Deploying CloudEvents sink ===" + # Create the namespace first + kubectl create namespace eoapi || true + # Deploy the CloudEvents sink service + kubectl apply -f charts/eoapi/samples/cloudevents-sink.yaml + # Wait for the Knative service to be ready + echo "Waiting for CloudEvents sink to be ready..." + kubectl wait --for=condition=Ready ksvc/eoapi-cloudevents-sink -n eoapi --timeout=300s + + - name: Wait for Traefik to be ready + run: | + echo "=== Waiting for Traefik to be ready ===" + + # Wait for Traefik pods to be ready first + echo "Waiting for Traefik controller to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s + + # Wait for essential Traefik CRDs to be available + echo "Checking for Traefik CRDs..." 
+ timeout=300 + counter=0 + required_crds=("middlewares.traefik.io" "ingressroutes.traefik.io") + + for crd in "${required_crds[@]}"; do + echo "Checking for CRD: $crd" + counter=0 + while [ $counter -lt $timeout ]; do + if kubectl get crd "$crd" &>/dev/null; then + echo "✅ $crd is available" + break + fi + echo "⏳ Waiting for $crd... ($counter/$timeout)" + sleep 3 + counter=$((counter + 3)) + done - # Check notification system status - echo "===== Notification System Status =====" - kubectl get deployments -l app.kubernetes.io/name=eoapi-notifier -o wide || echo "No eoapi-notifier deployment found" - kubectl get ksvc -l app.kubernetes.io/component=cloudevents-sink -o wide || echo "No Knative CloudEvents sink found" + if [ $counter -ge $timeout ]; then + echo "❌ Timeout waiting for $crd" + echo "Available Traefik CRDs:" + kubectl get crd | grep traefik || echo "No Traefik CRDs found" + echo "All CRDs:" + kubectl get crd + exit 1 + fi + done - exit 1 + echo "✅ All required Traefik CRDs are ready" + + - name: Deploy eoAPI + id: deploy + run: | + echo "=== Starting eoAPI deployment ===" + export RELEASE_NAME="$RELEASE_NAME" + export PGO_VERSION="${{ env.PGO_VERSION }}" + export GITHUB_SHA="${{ github.sha }}" + ./scripts/deploy.sh --ci - name: Run integration tests - if: steps.deploy.outcome == 'success' run: | echo "=== Running integration tests ===" export RELEASE_NAME="$RELEASE_NAME" ./scripts/test.sh integration --debug - - name: Debug deployment status - if: always() + - name: Debug failed deployment + if: failure() run: | - echo "=== Final Deployment Status ===" + echo "=== Deployment failed - collecting debug information ===" kubectl get pods -o wide kubectl get jobs -o wide kubectl get services -o wide - kubectl get ingress + kubectl get events --sort-by='.lastTimestamp' | tail -20 || true + + # Check Knative installation status + echo "=== Knative Installation Status ===" + kubectl get pods -n knative-serving -o wide || echo "Knative Serving not installed" + 
kubectl get pods -n knative-eventing -o wide || echo "Knative Eventing not installed" + kubectl get pods -n kourier-system -o wide || echo "Kourier not installed" + # Check Knative CRDs + echo "=== Knative CRDs Status ===" + kubectl get crd | grep knative || echo "No Knative CRDs found" + kubectl get crd sinkbindings.sources.knative.dev || echo "SinkBinding CRD not found" + + # Check Traefik status + echo "=== Traefik Status ===" + kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik -o wide || echo "No Traefik pods found" + kubectl get crd | grep traefik || echo "No Traefik CRDs found" + kubectl get crd middlewares.traefik.io || echo "Middleware CRD not found" + kubectl get crd ingressroutes.traefik.io || echo "IngressRoute CRD not found" # Check notification system final status echo "=== Notification System Final Status ===" kubectl get deployments -l app.kubernetes.io/name=eoapi-notifier -o wide || echo "No eoapi-notifier deployment" kubectl get pods -l app.kubernetes.io/name=eoapi-notifier -o wide || echo "No eoapi-notifier pods" - kubectl get ksvc -l app.kubernetes.io/component=cloudevents-sink -o wide || echo "No Knative CloudEvents sink" - kubectl get pods -l serving.knative.dev/service -o wide || echo "No Knative CloudEvents sink pods" + kubectl get ksvc -n eoapi -o wide || echo "No Knative services in eoapi namespace" + kubectl get ksvc eoapi-cloudevents-sink -n eoapi -o wide || echo "No eoapi-cloudevents-sink Knative service" + kubectl get pods -l serving.knative.dev/service=eoapi-cloudevents-sink -n eoapi -o wide || echo "No CloudEvents sink pods" + # Check SinkBinding resources + echo "=== SinkBinding Resources ===" + kubectl get sinkbindings -A -o wide || echo "No SinkBinding resources found" # Show notification logs if they exist echo "=== eoapi-notifier Logs ===" kubectl logs -l app.kubernetes.io/name=eoapi-notifier --tail=20 || echo "No eoapi-notifier logs" echo "=== Knative CloudEvents Sink Logs ===" - kubectl logs -l 
serving.knative.dev/service --tail=20 || echo "No Knative CloudEvents sink logs" + kubectl logs -l serving.knative.dev/service=eoapi-cloudevents-sink -n eoapi --tail=20 || echo "No CloudEvents sink logs" + # Show Knative system logs if there are issues + echo "=== Knative Serving Controller Logs ===" + kubectl logs -n knative-serving -l app=controller --tail=20 || echo "No Knative Serving controller logs" + echo "=== Knative Eventing Controller Logs ===" + kubectl logs -n knative-eventing -l app=eventing-controller --tail=20 || echo "No Knative Eventing controller logs" - name: Cleanup diff --git a/.gitignore b/.gitignore index 35f7b4e8..469ec3dc 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ charts/config.yaml charts/eoapi/charts/*.tgz config_ingress.yaml __pycache__ + +CLAUDE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a09ab58..cf7a966f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Added knative in CI to test eoapi-notifier. 
+ ## [0.7.12] - 2025-10-17 - Bumped eoapi-notifier dependency version to 0.0.7 diff --git a/charts/eoapi/test-k3s-unittest-values.yaml b/charts/eoapi/test-k3s-unittest-values.yaml index 33a4ec56..a1e83a16 100644 --- a/charts/eoapi/test-k3s-unittest-values.yaml +++ b/charts/eoapi/test-k3s-unittest-values.yaml @@ -62,7 +62,7 @@ eoapi-notifier: channel: pgstac_items_change connection: existingSecret: - name: "eoapi-test-pguser-eoapi" + name: "" keys: username: "user" password: "password" diff --git a/charts/eoapi/test-local-values.yaml b/charts/eoapi/test-local-values.yaml new file mode 100644 index 00000000..59e139dd --- /dev/null +++ b/charts/eoapi/test-local-values.yaml @@ -0,0 +1,109 @@ +# Local test configuration for minikube/local development +# Based on test-k3s-unittest-values.yaml with minimal changes for local environment + +testing: true +ingress: + enabled: true + className: "nginx" # Changed from "traefik" for minikube + pathType: "Prefix" + host: "eoapi.local" + +pgstacBootstrap: + enabled: true + settings: + resources: + requests: + cpu: "256m" + memory: "1024Mi" + limits: + cpu: "512m" + memory: "1024Mi" + +raster: + enabled: true + settings: + resources: + limits: + cpu: "768m" + memory: "2048Mi" # Reduced from 4096Mi for local + requests: + cpu: "256m" + memory: "1024Mi" + +stac: + enabled: true + settings: + resources: + limits: + cpu: "1280m" + memory: "1536Mi" + requests: + cpu: "512m" + memory: "1024Mi" + +vector: + enabled: true + settings: + resources: + limits: + cpu: "768m" + memory: "1536Mi" + requests: + cpu: "256m" + memory: "1024Mi" + envVars: + TIPG_DEBUG: "True" + +eoapi-notifier: + enabled: true + config: + logLevel: DEBUG + sources: + - type: pgstac + config: + channel: pgstac_items_change + connection: + existingSecret: + name: "" # Set dynamically by deploy script + keys: + username: "user" + password: "password" + host: "host" + port: "port" + database: "dbname" + outputs: + - type: cloudevents + config: + source: /eoapi/pgstac + 
event_type: org.eoapi.stac.item + destination: + ref: + apiVersion: serving.knative.dev/v1 + kind: Service + name: eoapi-cloudevents-sink + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + +# Reduce PostgreSQL resources for local development +postgrescluster: + instances: + - name: "postgres" + replicas: 1 + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "1Gi" # Reduced for local + resources: + requests: + cpu: "100m" # Reduced for local + memory: "512Mi" # Reduced for local + limits: + cpu: "500m" # Reduced for local + memory: "1Gi" # Reduced for local diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 0b356109..2eca31dd 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -81,6 +81,55 @@ install_pgo() { kubectl get pods -l postgres-operator.crunchydata.com/control-plane=postgres-operator } +# Setup Knative for local development +setup_knative() { + log_info "Setting up Knative for local development..." + + if kubectl get namespace knative-serving &>/dev/null && kubectl get namespace knative-eventing &>/dev/null; then + log_info "Knative already installed, skipping installation" + return 0 + fi + + log_info "Installing Knative Serving..." + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-crds.yaml + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-core.yaml + kubectl apply -f https://github.com/knative/net-kourier/releases/download/knative-v1.17.0/kourier.yaml + # Configure Knative to use Kourier + kubectl patch configmap/config-network \ + --namespace knative-serving \ + --type merge \ + --patch '{"data":{"ingress-class":"kourier.ingress.networking.knative.dev"}}' + + log_info "Installing Knative Eventing..." 
+ # Install Knative Eventing CRDs (includes SinkBinding) + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-crds.yaml + # Install Knative Eventing core components + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-core.yaml + + log_info "Waiting for Knative components to be ready..." + kubectl wait --for=condition=Ready pod -l app=controller -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=webhook -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=3scale-kourier-gateway -n kourier-system --timeout=300s + kubectl wait --for=condition=Ready pod -l app=eventing-controller -n knative-eventing --timeout=300s + kubectl wait --for=condition=Ready pod -l app=eventing-webhook -n knative-eventing --timeout=300s + + log_info "✅ Knative installation complete" +} + +deploy_cloudevents_sink() { + log_info "Deploying CloudEvents sink for notifications..." + + kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f - + + if kubectl apply -f charts/eoapi/samples/cloudevents-sink.yaml; then + log_info "Waiting for CloudEvents sink to be ready..." + kubectl wait --for=condition=Ready ksvc/eoapi-cloudevents-sink -n "$NAMESPACE" --timeout=300s + log_info "✅ CloudEvents sink deployed successfully" + else + log_warn "Failed to deploy CloudEvents sink, continuing without it" + fi +} + # Integrated Helm dependency setup setup_helm_dependencies() { log_info "Setting up Helm dependencies..." @@ -140,10 +189,17 @@ deploy_eoapi() { HELM_CMD="$HELM_CMD -f ./eoapi/values.yaml" fi - # CI-specific configuration + # Environment-specific configuration if [ "$CI_MODE" = true ] && [ -f "./eoapi/test-k3s-unittest-values.yaml" ]; then log_info "Using CI test configuration..." 
HELM_CMD="$HELM_CMD -f ./eoapi/test-k3s-unittest-values.yaml" + # Fix eoapi-notifier secret name dynamically + HELM_CMD="$HELM_CMD --set eoapi-notifier.config.sources[0].config.connection.existingSecret.name=$RELEASE_NAME-pguser-eoapi" + elif [ -f "./eoapi/test-local-values.yaml" ]; then + log_info "Using local test configuration..." + HELM_CMD="$HELM_CMD -f ./eoapi/test-local-values.yaml" + # Fix eoapi-notifier secret name dynamically for local mode too + HELM_CMD="$HELM_CMD --set eoapi-notifier.config.sources[0].config.connection.existingSecret.name=$RELEASE_NAME-pguser-eoapi" fi # Set git SHA if available @@ -160,6 +216,31 @@ deploy_eoapi() { cd .. || exit + # Wait for pgstac jobs to complete first + if kubectl get job -n "$NAMESPACE" -l "app=$RELEASE_NAME-pgstac-migrate" >/dev/null 2>&1; then + log_info "Waiting for pgstac-migrate job to complete..." + if ! kubectl wait --for=condition=complete job -l "app=$RELEASE_NAME-pgstac-migrate" -n "$NAMESPACE" --timeout=600s; then + log_error "pgstac-migrate job failed to complete" + kubectl describe job -l "app=$RELEASE_NAME-pgstac-migrate" -n "$NAMESPACE" + kubectl logs -l "app=$RELEASE_NAME-pgstac-migrate" -n "$NAMESPACE" --tail=50 || true + exit 1 + fi + fi + + if kubectl get job -n "$NAMESPACE" -l "app=$RELEASE_NAME-pgstac-load-samples" >/dev/null 2>&1; then + log_info "Waiting for pgstac-load-samples job to complete..." + if ! kubectl wait --for=condition=complete job -l "app=$RELEASE_NAME-pgstac-load-samples" -n "$NAMESPACE" --timeout=600s; then + log_error "pgstac-load-samples job failed to complete" + kubectl describe job -l "app=$RELEASE_NAME-pgstac-load-samples" -n "$NAMESPACE" + kubectl logs -l "app=$RELEASE_NAME-pgstac-load-samples" -n "$NAMESPACE" --tail=50 || true + exit 1 + fi + fi + + if [ "$CI_MODE" != true ]; then + deploy_cloudevents_sink + fi + # Verify deployment log_info "Verifying deployment..." 
kubectl get pods -n "$NAMESPACE" -o wide @@ -225,6 +306,10 @@ case $COMMAND in ;; deploy) install_pgo + + if [ "$CI_MODE" != true ]; then + setup_knative + fi setup_helm_dependencies deploy_eoapi ;; diff --git a/scripts/test.sh b/scripts/test.sh index 9fea0df8..805c4285 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -127,9 +127,9 @@ install_test_deps() { python_cmd="python3" fi - if ! $python_cmd -m pip install --quiet pytest httpx >/dev/null 2>&1; then - log_error "Failed to install test dependencies (pytest, httpx)" - log_error "Please install manually: pip install pytest httpx" + if ! $python_cmd -m pip install --quiet pytest httpx psycopg2-binary >/dev/null 2>&1; then + log_error "Failed to install test dependencies (pytest, httpx, psycopg2-binary)" + log_error "Please install manually: pip install pytest httpx psycopg2-binary" exit 1 fi From 5829f02e9088a1aabf50e9ce6e1d2a87c5fd1f8d Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Thu, 23 Oct 2025 15:47:32 +0200 Subject: [PATCH 2/3] feat: integrate observability into main eoapi chart - Remove separate eoapi-support chart and integrate all observability features into main chart - Add optional monitoring dependencies: metrics-server, prometheus, prometheus-adapter, grafana - Add monitoring helper templates (_monitoring.yaml, _resources.yaml, observability.yaml) - Add comprehensive observability documentation with deployment examples - Add reusable monitoring configuration base (values/monitoring.yaml) - Add autoscaling test suite for HPA validation - Move Grafana dashboard from eoapi-support to main chart - Update main values.yaml with observability configuration options This consolidation simplifies deployment by eliminating the need for a separate observability chart while maintaining full flexibility for enabling monitoring features. 
--- CHANGELOG.md | 3 +- charts/eoapi-support/.gitignore | 2 - charts/eoapi-support/.helmignore | 30 - charts/eoapi-support/Chart.yaml | 33 - charts/eoapi-support/README.md | 5 - charts/eoapi-support/values.yaml | 178 ----- charts/eoapi/Chart.yaml | 24 + charts/eoapi/README.md | 2 + .../dashboards/eoAPI-Dashboard.json | 0 charts/eoapi/templates/_monitoring.yaml | 80 +++ charts/eoapi/templates/_resources.yaml | 51 ++ .../templates/observability.yaml} | 2 + charts/eoapi/tests/autoscaling_tests.yaml | 241 +++++++ charts/eoapi/values.yaml | 115 ++- charts/eoapi/values/monitoring.yaml | 59 ++ docs/examples/values-autoscaling.yaml | 212 ++++++ docs/examples/values-full-observability.yaml | 303 ++++++++ docs/index.md | 11 +- docs/operations/autoscaling.md | 677 ++++++++++-------- docs/operations/observability.md | 334 +++++++++ 20 files changed, 1789 insertions(+), 573 deletions(-) delete mode 100644 charts/eoapi-support/.gitignore delete mode 100644 charts/eoapi-support/.helmignore delete mode 100644 charts/eoapi-support/Chart.yaml delete mode 100644 charts/eoapi-support/README.md delete mode 100644 charts/eoapi-support/values.yaml rename charts/{eoapi-support => eoapi}/dashboards/eoAPI-Dashboard.json (100%) create mode 100644 charts/eoapi/templates/_monitoring.yaml create mode 100644 charts/eoapi/templates/_resources.yaml rename charts/{eoapi-support/templates/dashboard.config.yaml => eoapi/templates/observability.yaml} (77%) create mode 100644 charts/eoapi/tests/autoscaling_tests.yaml create mode 100644 charts/eoapi/values/monitoring.yaml create mode 100644 docs/examples/values-autoscaling.yaml create mode 100644 docs/examples/values-full-observability.yaml create mode 100644 docs/operations/observability.md diff --git a/CHANGELOG.md b/CHANGELOG.md index cf7a966f..309770a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,7 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Excluded renovate.json from CHANGELOG.md 
edits [#301](https://github.com/developmentseed/eoapi-k8s/pull/301) +- Refactores eoapi-support into core eoapi chart [#262](https://github.com/developmentseed/eoapi-k8s/pull/262) + ## [0.7.8] - 2025-09-10 diff --git a/charts/eoapi-support/.gitignore b/charts/eoapi-support/.gitignore deleted file mode 100644 index 082a7414..00000000 --- a/charts/eoapi-support/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -Chart.lock -/charts diff --git a/charts/eoapi-support/.helmignore b/charts/eoapi-support/.helmignore deleted file mode 100644 index ada987c3..00000000 --- a/charts/eoapi-support/.helmignore +++ /dev/null @@ -1,30 +0,0 @@ -# Non default entries manually added by support developers - -# Ignore the .yaml that generates the .json, only the .json is relevant to -# bundle with the Helm chart when it is packaged or "helm dep up" is used to -# copy it over to another location where it is referenced. -values.schema.yaml - -# ----------------------------------------------------------------------------- - -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj diff --git a/charts/eoapi-support/Chart.yaml b/charts/eoapi-support/Chart.yaml deleted file mode 100644 index 38bcca10..00000000 --- a/charts/eoapi-support/Chart.yaml +++ /dev/null @@ -1,33 +0,0 @@ -apiVersion: v2 -name: eoapi-support - -appVersion: "0.1.7" -version: "0.1.7" - -dependencies: - - name: metrics-server - version: 7.4.12 - repository: https://charts.bitnami.com/bitnami - - # Prometheus for collection of metrics. 
- # https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus - # - - name: prometheus - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.prometheus` values - version: 27.41.2 - repository: https://prometheus-community.github.io/helm-charts - - # used to create custom metrics to autoscale on - # - - name: prometheus-adapter - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.prometheus-adapter` values - version: 5.2.0 - repository: https://prometheus-community.github.io/helm-charts - - # Grafana for dashboarding of metrics - # https://github.com/grafana/helm-charts/tree/main/charts/grafana - # - - name: grafana - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.grafana` values - version: 10.1.2 - repository: https://grafana.github.io/helm-charts diff --git a/charts/eoapi-support/README.md b/charts/eoapi-support/README.md deleted file mode 100644 index b218eb69..00000000 --- a/charts/eoapi-support/README.md +++ /dev/null @@ -1,5 +0,0 @@ -#### eoAPI Support - -observability, monitoring and some custom metrics for autoscaling - -(please see documentation about `helm install` and configuration at ../../docs/autoscaling.md) diff --git a/charts/eoapi-support/values.yaml b/charts/eoapi-support/values.yaml deleted file mode 100644 index febe6af3..00000000 --- a/charts/eoapi-support/values.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# most of this was cribbed from https://github.com/2i2c-org/infrastructure/blob/master/helm-charts/support/ -# so giving props where props are due to Yuvi Panda :sparkles: -prometheus-adapter: - prometheus: - # NOTE: the `url` below makes assumptions about release name and namespace: - # 1) Release name is "eoapi-support" (follows RELEASE_NAME-prometheus-server pattern) - # 2) Deployed in "eoapi" namespace - # 3) If using different release name, update to: http://YOUR_RELEASE_NAME-prometheus-server.YOUR_NAMESPACE.svc.cluster.local 
- url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - port: 80 - path: "" - rules: - default: false - # NOTE: the `name.as` values below make some assumptions about your release name - # namely that you have run `helm install eoapi eoapi/eoapi --create-namespace=eoapi` - custom: - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_vector_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_raster_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_stac_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - -prometheus: - # alertmanager is an optional prometheus chart dependency that we opt-out from - # as we favor Grafana for this functionality. Grafana provides alerts and does - # so with a better UI that we expose publicly behind auth anyhow. - # - alertmanager: - enabled: false - - # prometheus-pushgateway is an optional prometheus chart dependency that we - # opt-out from. pushgateway provides a way to complement prometheus server's - # behavior of scraping metrics from services by allowing services to push - # metrics to prometheus. 
- # - prometheus-pushgateway: - enabled: false - - # kube-state-metrics is deployed by default but listing here just so we know it is - kube-state-metrics: - enabled: true - - # prometheus-node-exporter is an optional prometheus chart dependency that we - # rely on to collect metrics about the nodes - # - # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml - # - prometheus-node-exporter: - # resources for the node-exporter was set after inspecting cpu and memory - # use via prometheus and grafana. - # - # node-exporter is typically found using between 0-3m CPU and 2-22Mi memory, - # but we've seen it fail to report cpu/memory use metrics from time to time - # when requesting and limiting to 5m, so we've increased requests/limit it - # to 10m. - # - # PromQL queries for CPU and memory use: - # - CPU: sum(rate(container_cpu_usage_seconds_total{container="node-exporter", namespace="support"}[5m])) by (pod) - # - Memory: sum(container_memory_usage_bytes{container="node-exporter", namespace="support"}) by (pod) - # - resources: - limits: - cpu: 10m - memory: 30Mi - requests: - cpu: 10m - memory: 30Mi - server: - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: "nlb" - service.beta.kubernetes.io/aws-load-balancer-internal: "false" - type: LoadBalancer - -grafana: - persistence: - enabled: false - deploymentStrategy: - type: Recreate - service: - type: LoadBalancer - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: "nlb" - service.beta.kubernetes.io/aws-load-balancer-internal: "false" - rbac: - namespaced: true - pspEnabled: false - # initChownData refers to an init container enabled by default that isn't - # needed as we don't reconfigure the linux user the grafana server will run - # as. - initChownData: - enabled: false - - # resources for grafana was set after inspecting cpu and memory use via - # prometheus and grafana. 
- # - # Grafana's memory use seems to increase over time but seems reasonable to - # stay below 200Mi for years to come. Grafana's CPU use seems miniscule with - # peaks at up to 9m CPU from one user is browsing its dashboards. - # - # PromQL queries for CPU and memory use: - # - CPU: sum(rate(container_cpu_usage_seconds_total{container="grafana", namespace="support"}[5m])) by (pod) - # - Memory: sum(container_memory_usage_bytes{container="grafana", namespace="support"}) by (pod) - # - resources: - limits: - cpu: 100m - memory: 200Mi - requests: - cpu: 10m - memory: 200Mi - - datasources: - datasources.yaml: - apiVersion: 1 - datasources: - # Automatically add the prometheus server in the same namespace as the grafana as a datasource - - name: prometheus - orgId: 1 - type: prometheus - # NOTE: the `url` below makes assumptions about release name and namespace: - # 1) Release name is "eoapi-support" (follows RELEASE_NAME-prometheus-server pattern) - # 2) Deployed in "eoapi" namespace - # 3) If using different release name, update to: http://YOUR_RELEASE_NAME-prometheus-server.YOUR_NAMESPACE.svc.cluster.local - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - access: proxy - jsonData: - timeInterval: "5s" - isDefault: true - editable: true - version: 1 # This number should be increased when changes are made to update the datasource - - dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/default - - dashboardsConfigMaps: - # NOTE: This must match the ConfigMap name created in templates/dashboard.config.yaml - # The template creates: {{ .Release.Name }}-dashboards - # If release name is "eoapi-support", this should be "eoapi-support-dashboards" - # Update this value to match your actual release name + "-dashboards" - default: "eoapi-support-dashboards" - -metrics-server: - image: - 
registry: docker.io - repository: bitnamilegacy/metrics-server - tag: "0.8.0-debian-12-r4" - apiService: - create: true diff --git a/charts/eoapi/Chart.yaml b/charts/eoapi/Chart.yaml index c94a7b07..8fe33af0 100644 --- a/charts/eoapi/Chart.yaml +++ b/charts/eoapi/Chart.yaml @@ -57,3 +57,27 @@ dependencies: version: 0.0.7 repository: "oci://ghcr.io/developmentseed/charts" condition: eoapi-notifier.enabled + + # Optional monitoring components for metrics collection and autoscaling + # These are disabled by default to keep deployments lightweight + # Enable via: monitoring.prometheus.enabled=true, monitoring.metricsServer.enabled=true + - name: metrics-server + version: 7.2.8 + repository: https://charts.bitnami.com/bitnami + condition: monitoring.metricsServer.enabled + + - name: prometheus + version: 25.3.1 + repository: https://prometheus-community.github.io/helm-charts + condition: monitoring.prometheus.enabled + + - name: prometheus-adapter + version: 4.7.1 + repository: https://prometheus-community.github.io/helm-charts + condition: monitoring.prometheusAdapter.enabled + + # Observability components - Grafana dashboards and visualization + - name: grafana + version: 7.3.3 + repository: https://grafana.github.io/helm-charts + condition: observability.grafana.enabled diff --git a/charts/eoapi/README.md b/charts/eoapi/README.md index a7a95cdf..a76520ed 100644 --- a/charts/eoapi/README.md +++ b/charts/eoapi/README.md @@ -14,6 +14,8 @@ A Helm chart for deploying Earth Observation APIs with integrated STAC, raster, - Flexible database configuration - Real-time PostgreSQL notifications for STAC item changes - Unified ingress system +- Autoscaling +- Integrated observability (Prometheus & Grafana) ## TL;DR diff --git a/charts/eoapi-support/dashboards/eoAPI-Dashboard.json b/charts/eoapi/dashboards/eoAPI-Dashboard.json similarity index 100% rename from charts/eoapi-support/dashboards/eoAPI-Dashboard.json rename to charts/eoapi/dashboards/eoAPI-Dashboard.json diff --git 
a/charts/eoapi/templates/_monitoring.yaml b/charts/eoapi/templates/_monitoring.yaml new file mode 100644 index 00000000..7f3bb1e3 --- /dev/null +++ b/charts/eoapi/templates/_monitoring.yaml @@ -0,0 +1,80 @@ +{{/* +Common monitoring configurations to avoid duplication across values files +*/}} + +{{/* +Basic monitoring stack configuration +*/}} +{{- define "eoapi.monitoring.basic" -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" . | nindent 6 }} + server: + service: + type: ClusterIP +{{- end -}} + +{{/* +Production monitoring with persistence +*/}} +{{- define "eoapi.monitoring.production" -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: true + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" . | nindent 6 }} + server: + service: + type: ClusterIP + persistentVolume: + enabled: true + size: 10Gi +{{- end -}} + +{{/* +Testing monitoring with minimal resources +*/}} +{{- define "eoapi.monitoring.testing" -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" . 
| nindent 6 }} + server: + service: + type: ClusterIP + persistentVolume: + enabled: false +{{- end -}} diff --git a/charts/eoapi/templates/_resources.yaml b/charts/eoapi/templates/_resources.yaml new file mode 100644 index 00000000..2c4ab5b7 --- /dev/null +++ b/charts/eoapi/templates/_resources.yaml @@ -0,0 +1,51 @@ +{{/* +Common resource definitions to avoid duplication across values files +*/}} + +{{/* +Small resource allocation for lightweight components +*/}} +{{- define "eoapi.resources.small" -}} +limits: + cpu: 10m + memory: 30Mi +requests: + cpu: 10m + memory: 30Mi +{{- end -}} + +{{/* +Medium resource allocation for standard services +*/}} +{{- define "eoapi.resources.medium" -}} +limits: + cpu: 100m + memory: 128Mi +requests: + cpu: 50m + memory: 64Mi +{{- end -}} + +{{/* +Large resource allocation for heavy workloads +*/}} +{{- define "eoapi.resources.large" -}} +limits: + cpu: 500m + memory: 512Mi +requests: + cpu: 250m + memory: 256Mi +{{- end -}} + +{{/* +Grafana specific resources based on observed usage patterns +*/}} +{{- define "eoapi.resources.grafana" -}} +limits: + cpu: 100m + memory: 200Mi +requests: + cpu: 50m + memory: 100Mi +{{- end -}} diff --git a/charts/eoapi-support/templates/dashboard.config.yaml b/charts/eoapi/templates/observability.yaml similarity index 77% rename from charts/eoapi-support/templates/dashboard.config.yaml rename to charts/eoapi/templates/observability.yaml index 6c0f2382..fdf132a2 100644 --- a/charts/eoapi-support/templates/dashboard.config.yaml +++ b/charts/eoapi/templates/observability.yaml @@ -1,3 +1,4 @@ +{{- if .Values.observability.grafana.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -7,3 +8,4 @@ metadata: data: kubernetes.json: |- {{ .Files.Get "dashboards/eoAPI-Dashboard.json" | indent 4 }} +{{- end }} diff --git a/charts/eoapi/tests/autoscaling_tests.yaml b/charts/eoapi/tests/autoscaling_tests.yaml new file mode 100644 index 00000000..18cd9452 --- /dev/null +++ 
b/charts/eoapi/tests/autoscaling_tests.yaml @@ -0,0 +1,241 @@ +suite: autoscaling tests +templates: + - templates/services/stac/hpa.yaml + - templates/services/raster/hpa.yaml + - templates/services/vector/hpa.yaml + - templates/services/multidim/hpa.yaml +tests: + - it: "autoscaling disabled by default" + set: + stac.autoscaling.enabled: false + raster.autoscaling.enabled: false + vector.autoscaling.enabled: false + multidim.autoscaling.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: "stac hpa not created when autoscaling disabled" + set: + stac.enabled: true + stac.autoscaling.enabled: false + template: templates/services/stac/hpa.yaml + asserts: + - hasDocuments: + count: 0 + + - it: "stac hpa created with cpu autoscaling" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + stac.autoscaling.targets.cpu: 70 + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: metadata.name + value: "RELEASE-NAME-stac-hpa" + - equal: + path: spec.minReplicas + value: 1 + - isNotEmpty: + path: spec.maxReplicas + - equal: + path: spec.metrics[0].type + value: "Resource" + - equal: + path: spec.metrics[0].resource.name + value: "cpu" + - equal: + path: spec.metrics[0].resource.target.averageUtilization + value: 70 + + - it: "stac hpa created with request rate autoscaling" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "requestRate" + stac.autoscaling.targets.requestRate: "50000m" + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.minReplicas + value: 1 + - isNotEmpty: + path: spec.maxReplicas + - equal: + path: spec.metrics[0].type + value: "Pods" + - equal: + path: spec.metrics[0].pods.metric.name + value: "nginx_ingress_controller_requests" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "50000m" + + - it: "stac hpa created with 
both cpu and request rate autoscaling" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "both" + stac.autoscaling.targets.cpu: 70 + stac.autoscaling.targets.requestRate: "50000m" + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].type + value: "Resource" + - equal: + path: spec.metrics[0].resource.name + value: "cpu" + - equal: + path: spec.metrics[1].type + value: "Pods" + - equal: + path: spec.metrics[1].pods.metric.name + value: "nginx_ingress_controller_requests" + + - it: "raster hpa created with request rate autoscaling" + set: + raster.enabled: true + raster.autoscaling.enabled: true + raster.autoscaling.type: "requestRate" + raster.autoscaling.targets.requestRate: "30000m" + template: templates/services/raster/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].pods.metric.name + value: "nginx_ingress_controller_requests" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "30000m" + + - it: "vector hpa created with request rate autoscaling" + set: + vector.enabled: true + vector.autoscaling.enabled: true + vector.autoscaling.type: "requestRate" + vector.autoscaling.targets.requestRate: "40000m" + template: templates/services/vector/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].pods.metric.name + value: "nginx_ingress_controller_requests" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "40000m" + + - it: "multidim hpa not created when service disabled" + set: + multidim.enabled: false + multidim.autoscaling.enabled: true + template: templates/services/multidim/hpa.yaml + asserts: + - hasDocuments: + count: 0 + + - it: "multidim hpa created when enabled" + set: + multidim.enabled: true + multidim.autoscaling.enabled: true + multidim.autoscaling.type: "cpu" + multidim.autoscaling.targets.cpu: 80 + template: 
templates/services/multidim/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].resource.target.averageUtilization + value: 80 + + - it: "hpa scaleTargetRef points to correct deployment" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: spec.scaleTargetRef.name + value: "RELEASE-NAME-stac" + - equal: + path: spec.scaleTargetRef.kind + value: "Deployment" + + - it: "hpa custom replica configuration" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + stac.autoscaling.minReplicas: 2 + stac.autoscaling.maxReplicas: 20 + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: spec.minReplicas + value: 2 + - equal: + path: spec.maxReplicas + value: 20 + + - it: "hpa includes proper labels" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: metadata.labels.app + value: "RELEASE-NAME-stac" + + - it: "hpa behavior configuration applied when set" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + stac.autoscaling.behavior.scaleUp.stabilizationWindowSeconds: 120 + stac.autoscaling.behavior.scaleDown.stabilizationWindowSeconds: 300 + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: spec.behavior.scaleUp.stabilizationWindowSeconds + value: 120 + - equal: + path: spec.behavior.scaleDown.stabilizationWindowSeconds + value: 300 + + - it: "stac hpa production configuration with higher minReplicas" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.minReplicas: 2 + stac.autoscaling.maxReplicas: 20 + stac.autoscaling.type: "requestRate" + stac.autoscaling.targets.requestRate: "50000m" + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: 
HorizontalPodAutoscaler + - equal: + path: spec.minReplicas + value: 2 + - equal: + path: spec.maxReplicas + value: 20 + - equal: + path: spec.metrics[0].type + value: "Pods" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "50000m" diff --git a/charts/eoapi/values.yaml b/charts/eoapi/values.yaml index dfcfd615..b07e70b4 100644 --- a/charts/eoapi/values.yaml +++ b/charts/eoapi/values.yaml @@ -202,7 +202,7 @@ raster: enabled: true # Control ingress specifically for raster service path: "/raster" # Configurable path prefix for the raster service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -274,7 +274,7 @@ multidim: enabled: true # Control ingress specifically for multidim service path: "/multidim" # Configurable path prefix for the multidim service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -346,7 +346,7 @@ stac: enabled: true # Control ingress specifically for stac service path: "/stac" # Configurable path prefix for the stac service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -406,7 +406,7 @@ vector: enabled: true # Control ingress specifically for vector service path: "/vector" # Configurable path prefix for the vector service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see 
../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -522,9 +522,114 @@ eoapi-notifier: namespace: serverless # For HTTP endpoints, use: endpoint: https://webhook.example.com +###################### +# MONITORING +###################### +# Core monitoring components for metrics collection and autoscaling +monitoring: + # Metrics server - essential for HPA functionality + metricsServer: + enabled: false + apiService: + create: true + + # Prometheus - core metrics collection for autoscaling + prometheus: + enabled: false + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + server: + service: + type: ClusterIP # Internal service, no external exposure by default + + # Prometheus adapter - enables custom HPA metrics + prometheusAdapter: + enabled: false + prometheus: + # URL to Prometheus server - will be auto-configured for same-release Prometheus + # If using external Prometheus, set this to your Prometheus URL + # Example: http://my-prometheus-server.monitoring.svc.cluster.local + url: http://eoapi-prometheus-server.eoapi.svc.cluster.local + port: 80 + path: "" + rules: + default: false + # Custom metrics for eoapi service autoscaling + custom: + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_vector_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_raster_eoapi" + metricsQuery: 
round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_stac_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + +###################### +# OBSERVABILITY +###################### +# Grafana dashboards and visualization (requires monitoring.prometheus.enabled=true) +observability: + grafana: + enabled: false + persistence: + enabled: false + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "false" + resources: + limits: + cpu: 100m + memory: 200Mi + requests: + cpu: 50m + memory: 100Mi + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: "http://{{ .Release.Name }}-prometheus-server" + access: proxy + isDefault: true + dashboardsConfigMaps: + default: "{{ .Release.Name }}-dashboards" + # Version being upgraded from, used for migration purposes # Dont set the value in the values.yaml file -# prefer to set it in the command line +# Instead, set it during upgrade using --set previousVersion= # helm upgrade --set previousVersion=$PREVIOUS_VERSION # or in the CI/CD pipeline previousVersion: "" diff --git a/charts/eoapi/values/monitoring.yaml b/charts/eoapi/values/monitoring.yaml new file mode 100644 index 00000000..519d34bd --- /dev/null +++ b/charts/eoapi/values/monitoring.yaml @@ -0,0 +1,59 @@ +###################### +# MONITORING BASE CONFIG +###################### +# Base monitoring configuration - import in values files with: +# monitoring: !include values/monitoring.yaml + +monitoring: + enabled: true + + # Metrics server for HPA + metricsServer: + 
enabled: true + apiService: + create: true + resources: &small_resources + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + + # Prometheus stack + prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + + kube-state-metrics: + enabled: true + resources: *small_resources + + prometheus-node-exporter: + enabled: true + resources: *small_resources + + server: + service: + type: ClusterIP + persistentVolume: + enabled: false + size: 8Gi + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 200m + memory: 256Mi + +# Autoscaling defaults +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 diff --git a/docs/examples/values-autoscaling.yaml b/docs/examples/values-autoscaling.yaml new file mode 100644 index 00000000..8abdeab4 --- /dev/null +++ b/docs/examples/values-autoscaling.yaml @@ -0,0 +1,212 @@ +# Example values for eoAPI with core monitoring and autoscaling enabled + +gitSha: "latest" + +###################### +# INGRESS +###################### +ingress: + enabled: true + className: "nginx" + # IMPORTANT: Set a proper hostname for metrics collection + # nginx ingress controller requires a specific host (not wildcard) to expose metrics + host: "your-eoapi.example.com" # Replace with your domain + tls: + enabled: true + secretName: eoapi-tls + +###################### +# DATABASE +###################### +# Using default PostgreSQL cluster configuration +postgrescluster: + enabled: true + instances: + - name: eoapi + replicas: 1 + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "50Gi" # Increased for production workloads + cpu: "2048m" # More CPU for database under load + memory: "4096Mi" # More memory for database performance + +###################### +# MONITORING & AUTOSCALING +###################### +# Essential monitoring components for 
autoscaling +monitoring: + metricsServer: + enabled: true + apiService: + create: true + prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + server: + service: + type: ClusterIP + +# Custom metrics for request-rate based autoscaling +prometheusAdapter: + enabled: true + +###################### +# SERVICE CONFIGURATION WITH AUTOSCALING +###################### + +# STAC API Service +stac: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 # Start with 2 replicas for availability + maxReplicas: 20 # Scale up to handle high loads + type: "requestRate" # Scale based on request rate + behavior: + scaleDown: + stabilizationWindowSeconds: 300 # Wait 5 minutes before scaling down + scaleUp: + stabilizationWindowSeconds: 30 # Scale up quickly (30 seconds) + targets: + requestRate: 50000m # Scale when average > 50 requests/second + settings: + resources: + limits: + cpu: "1000m" + memory: "2048Mi" + requests: + cpu: "500m" # Higher baseline for autoscaling + memory: "1024Mi" + +# Raster Service (TiTiler) +raster: + enabled: true + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 15 + type: "requestRate" + behavior: + scaleDown: + stabilizationWindowSeconds: 180 # Scale down slower for raster (3 min) + scaleUp: + stabilizationWindowSeconds: 60 # Scale up moderately fast + targets: + requestRate: 30000m # Scale when average > 30 requests/second (raster is more resource intensive) + settings: + resources: + limits: + cpu: "1536m" # Raster processing needs more CPU + memory: "6144Mi" # Raster processing needs more memory + requests: + cpu: "768m" + memory: "3072Mi" + envVars: + # Optimized GDAL settings for autoscaling + GDAL_CACHEMAX: "512" # Increased cache for better performance + WEB_CONCURRENCY: "8" # More workers for higher
throughput + +# Vector Service (TIPG) +vector: + enabled: true + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 10 + type: "requestRate" + behavior: + scaleDown: + stabilizationWindowSeconds: 240 + scaleUp: + stabilizationWindowSeconds: 45 + targets: + requestRate: 75000m # Vector is typically lighter, can handle more requests + settings: + resources: + limits: + cpu: "1000m" + memory: "2048Mi" + requests: + cpu: "512m" + memory: "1024Mi" + +# Multidimensional Service (optional) +multidim: + enabled: false # Disabled by default + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 8 + type: "requestRate" + targets: + requestRate: 25000m # Conservative scaling for multidim + settings: + resources: + limits: + cpu: "2048m" # Multidim can be very CPU intensive + memory: "8192Mi" # Large memory requirements for multidim data + requests: + cpu: "1024m" + memory: "4096Mi" + +###################### +# STAC BROWSER +###################### +browser: + enabled: true + replicaCount: 2 # Static replicas (browser is just static files) + +###################### +# PGSTAC BOOTSTRAP +###################### +pgstacBootstrap: + enabled: true + settings: + loadSamples: false # Disable sample data for production + resources: + requests: + cpu: "1024m" + memory: "2048Mi" + limits: + cpu: "1024m" + memory: "2048Mi" + +###################### +# ADDITIONAL NOTES +###################### +# +# To use this configuration: +# +# 1. Update the ingress.host to your actual domain +# 2. Adjust scaling targets based on your load testing results +# 3. Monitor resource usage and adjust requests/limits accordingly +# 4. Consider enabling TLS for production deployments +# +# IMPORTANT: This configuration enables monitoring components that are +# disabled by default. This is required for autoscaling to work.
+# +# For observability and dashboards, enable the integrated Grafana (observability.grafana.enabled=true): +# helm upgrade --install eoapi eoapi/eoapi --set observability.grafana.enabled=true +# +# Load testing recommendations: +# - Test each service endpoint individually +# - Monitor HPA metrics: kubectl get hpa -n eoapi -w +# - Check custom metrics: kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" +# - Review Prometheus targets to ensure metrics collection is working diff --git a/docs/examples/values-full-observability.yaml b/docs/examples/values-full-observability.yaml new file mode 100644 index 00000000..993ca9f4 --- /dev/null +++ b/docs/examples/values-full-observability.yaml @@ -0,0 +1,303 @@ +# Example values for eoAPI with full observability stack +# This configuration includes both core monitoring (in main chart) and observability tools +# Deploy the main chart with these values; Grafana is enabled via observability.grafana.enabled + +# Git SHA for deployments (set via CI/CD or command line) +gitSha: "latest" + +###################### +# INGRESS +###################### +ingress: + enabled: true + className: "nginx" + # IMPORTANT: Set a proper hostname for metrics collection + host: "eoapi.example.com" # Replace with your domain + tls: + enabled: true + secretName: eoapi-tls + +###################### +# DATABASE +###################### +postgrescluster: + enabled: true + monitoring: true # Enable PostgreSQL monitoring + instances: + - name: eoapi + replicas: 2 # HA setup for production + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "100Gi" + cpu: "2048m" + memory: "8192Mi" + +###################### +# COMPREHENSIVE MONITORING +###################### +monitoring: + # Essential components + metricsServer: + enabled: true + apiService: + create: true + + # Full Prometheus setup with all collectors + prometheus: + enabled: true + # Keep alertmanager disabled - we'll use Grafana alerting instead + alertmanager: + enabled: false + # Enable pushgateway
for advanced metrics + prometheus-pushgateway: + enabled: true + # Full metrics collection + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + # Production-ready resource allocation + resources: + limits: + cpu: 50m + memory: 64Mi + requests: + cpu: 50m + memory: 64Mi + # Prometheus server configuration + server: + # Expose Prometheus for external access (optional) + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "true" + # Persistent storage for metrics + persistentVolume: + enabled: true + size: 50Gi + storageClass: "gp3" # Adjust for your cloud provider + # Retention and performance settings + retention: "30d" # Keep 30 days of metrics + resources: + limits: + cpu: "2000m" + memory: "4096Mi" + requests: + cpu: "1000m" + memory: "2048Mi" + + # Advanced prometheus-adapter configuration + prometheusAdapter: + enabled: true + # Enhanced resource allocation + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + +###################### +# SERVICES WITH ADVANCED AUTOSCALING +###################### + +stac: + enabled: true + autoscaling: + enabled: true + minReplicas: 3 # Higher minimum for HA + maxReplicas: 30 + type: "both" # Scale on both CPU and request rate + behavior: + scaleDown: + stabilizationWindowSeconds: 600 # 10 minutes + policies: + - type: Percent + value: 50 + periodSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Percent + value: 100 + periodSeconds: 60 + targets: + cpu: 70 + requestRate: 40000m + settings: + resources: + limits: + cpu: "1500m" + memory: "3072Mi" + requests: + cpu: "750m" + memory: "1536Mi" + +raster: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 25 + type: "both" + behavior: + scaleDown: + stabilizationWindowSeconds: 900 # 15 minutes - raster workloads are bursty + scaleUp: +
stabilizationWindowSeconds: 120 # 2 minutes + targets: + cpu: 60 # Lower CPU target due to intensive processing + requestRate: 20000m + settings: + resources: + limits: + cpu: "2048m" + memory: "8192Mi" + requests: + cpu: "1024m" + memory: "4096Mi" + envVars: + GDAL_CACHEMAX: "1024" # 1GB cache + WEB_CONCURRENCY: "4" # Conservative for memory usage + GDAL_HTTP_MAX_RETRY: "3" + GDAL_HTTP_RETRY_DELAY: "1" + +vector: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 15 + type: "both" + targets: + cpu: 75 + requestRate: 60000m + settings: + resources: + limits: + cpu: "1200m" + memory: "2560Mi" + requests: + cpu: "600m" + memory: "1280Mi" + +multidim: + enabled: true # Enable for comprehensive setup + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 10 + type: "cpu" # CPU-based scaling for multidim workloads + targets: + cpu: 50 # Very conservative due to resource intensity + settings: + resources: + limits: + cpu: "4096m" + memory: "16384Mi" # 16GB for large multidim datasets + requests: + cpu: "2048m" + memory: "8192Mi" + +###################### +# STAC BROWSER +###################### +browser: + enabled: true + replicaCount: 3 # HA setup + +###################### +# PGSTAC BOOTSTRAP +###################### +pgstacBootstrap: + enabled: true + settings: + loadSamples: false # No samples in production + waitConfig: + timeout: 1800 # 30 minutes timeout for large migrations + resources: + requests: + cpu: "1024m" + memory: "2048Mi" + limits: + cpu: "2048m" + memory: "4096Mi" + +###################### +# INTEGRATED OBSERVABILITY +###################### +# Grafana dashboards integrated with main chart (replaces separate eoapi-observability chart) +observability: + grafana: + enabled: true + persistence: + enabled: true + size: 10Gi + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "false" + resources: + limits: + cpu: 
100m + memory: 200Mi + requests: + cpu: 50m + memory: 100Mi + +###################### +# ADDITIONAL PRODUCTION SETTINGS +###################### + +# Service account with monitoring permissions +serviceAccount: + create: true + annotations: + # Add cloud provider annotations if needed + # eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT:role/eoapi-monitoring-role + +###################### +# DEPLOYMENT NOTES +###################### +# +# This configuration provides comprehensive observability including: +# - Core metrics collection and autoscaling (included in main chart) +# - Persistent Prometheus storage with 30-day retention +# - Advanced HPA policies with both CPU and request-rate scaling +# - Production-ready resource allocations +# - High availability setup with multiple replicas +# +# To deploy the full stack: +# +# 1. Deploy main chart with monitoring: +# helm install eoapi eoapi/eoapi -f values-full-observability.yaml --namespace eoapi --create-namespace +# +# 2. Deploy observability chart separately: +# helm install eoapi-obs eoapi/eoapi-observability --namespace eoapi +# +# 3. Optional: Configure external integrations +# - DataDog: Set up prometheus scraping +# - New Relic: Deploy NR Kubernetes integration +# - External Grafana: Point to the exposed Prometheus service +# +# Monitoring endpoints (if LoadBalancer is used): +# - Prometheus: http://:9090 +# - Grafana: http:// (from observability chart) +# +# Security considerations: +# - Use internal LoadBalancers for Prometheus in production +# - Set up proper RBAC for service accounts +# - Configure network policies to restrict access +# - Enable TLS for all external endpoints +# +# Performance tuning: +# - Monitor actual resource usage and adjust requests/limits +# - Tune HPA scaling policies based on traffic patterns +# - Adjust Prometheus retention based on storage costs +# - Consider using remote storage for Prometheus (S3, GCS, etc.) 
diff --git a/docs/index.md b/docs/index.md index 1e5ca6c1..41ea94f6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,11 +27,12 @@ Please refer to our [quick start guide](./installation/quick-start.md) 2. Install the PostgreSQL Operator dependency 3. Configure your deployment using the [Configuration Options](./installation/configuration.md) 4. Deploy using [Helm Installation](./installation/helm-install.md) instructions -5. Set up monitoring with [Autoscaling & Monitoring](./operations/autoscaling.md) +5. Set up monitoring with [Autoscaling](./operations/autoscaling.md) & [Observability](./operations/observability.md) ## Detailed documenation ### Cloud Provider Guides + - **[AWS EKS Setup](./installation/providers/aws-eks.md)** - Complete EKS cluster setup with OIDC, node autoscaling, EBS CSI, and NGINX ingress - **[GCP GKE Setup](./installation/providers/gcp-gke.md)** - GKE cluster creation with CSI driver, NGINX ingress, and cert-manager - **[Azure AKS Setup](./installation/providers/azure.md)** - Azure configuration with managed PostgreSQL, Key Vault integration, and Workload Identity @@ -42,13 +43,11 @@ Please refer to our [quick start guide](./installation/quick-start.md) - **[Manual Helm Installation](./installation/helm-install.md)** - Step-by-step Helm deployment process with custom configurations - **[Unified Ingress Configuration](./installation/unified-ingress.md)** - NGINX and Traefik ingress setup with TLS and cert-manager integration -## Database Management - -- **[Data Management](./operations/manage-data.md)** - Loading STAC collections and items into PostgreSQL using pypgstac - ## Operations & Monitoring -- **[Autoscaling & Monitoring](./operations/autoscaling.md)** - HPA setup with custom metrics, Grafana dashboards, Prometheus configuration, and load testing +- **[Autoscaling](./operations/autoscaling.md)** - Horizontal Pod Autoscaler configuration with CPU and request-rate metrics, scaling policies, and load testing strategies +- 
**[Observability](./operations/observability.md)** - Monitoring stack with Prometheus, Grafana dashboards, metrics collection, and custom metrics API integration +- **[Data Management](./operations/manage-data.md)** - Loading STAC collections and items into PostgreSQL using pypgstac ## Advanced Features diff --git a/docs/operations/autoscaling.md b/docs/operations/autoscaling.md index 2ad25aa2..5c119abd 100644 --- a/docs/operations/autoscaling.md +++ b/docs/operations/autoscaling.md @@ -1,25 +1,40 @@ -# Autoscaling / Monitoring / Observability +# Autoscaling -Autoscaling is both art and science. To test out your application's autoscaling requirements you often need to consider -your data volume, data usage patterns, bottlenecks (such as the database) among many, many other things. Load testing, -metrics, monitoring and observability will help you explore what those needs are. +Horizontal Pod Autoscaler (HPA) configuration for eoAPI services. Autoscaling requires monitoring components to be enabled in the main chart. +## Prerequisites -> ⓘ The `eoapi-support` chart in this repository (see `../charts/eoapi-support`) is required to be installed to -enable any of the eoAPI service autoscaling. It cannot be listed as a dependecy of `eoapi` chart -b/c of the limitations in `prometheus-adapter` and `grafana` for constructing the Prometheus internal -service domains dynamically. +Enable monitoring in your main eoapi installation: -If you are comfortable with k8s you probably only need to `helm install` the support chart and be on your way. Other folks -might want to read through the verbose walkthrough material below to familiarize yourself with how things work. 
+```yaml +monitoring: + prometheus: + enabled: true + prometheusAdapter: + enabled: true # Required for request-rate scaling + metricsServer: + enabled: true # Required for CPU scaling +``` ---- +## Configuration -## Helm Install `eoapi-support` +### Basic Autoscaling The following instructions assume you've gone through the [AWS](../installation/providers/aws-eks.md) or [GCP](../installation/providers/gcp-gke.md) cluster set up and installed the `eoapi` chart. +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + type: "requestRate" # Options: "cpu", "requestRate", "both" + targets: + requestRate: 50000m # 50 requests/second +``` + +### Scaling Policies 1. Go to the [releases section](https://github.com/developmentseed/eoapi-k8s/releases) of this repository and find the latest `eoapi-support-` version to install, or use the following command to get the latest version: @@ -29,361 +44,397 @@ and installed the `eoapi` chart. export SUPPORT_VERSION=$(helm search repo eoapi/eoapi-support --versions | head -2 | tail -1 | awk '{print $2}') ``` +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + type: "both" + behaviour: + scaleDown: + stabilizationWindowSeconds: 300 # 5min cooldown + policies: + - type: Percent + value: 50 # Max 50% pods removed per period + periodSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 60 # 1min cooldown + policies: + - type: Percent + value: 100 # Max 100% pods added per period + periodSeconds: 60 + targets: + cpu: 70 + requestRate: 50000m +``` + +## Metrics Types + +### CPU-based Scaling +```yaml +type: "cpu" +targets: + cpu: 70 +``` + +### Request Rate Scaling +```yaml +type: "requestRate" +targets: + requestRate: 50000m # 50 requests/second +``` + + +### Combined Scaling +```yaml +type: "both" +targets: + cpu: 70 + requestRate: 100000m # 100 requests/second +``` + +## Custom Metrics Configuration + +When using request rate scaling, the prometheus-adapter needs to be 
configured to expose custom metrics. This is handled automatically when you enable monitoring in the main chart: + +```yaml +# In your main eoapi values file +ingress: + host: your-domain.com + +monitoring: + prometheusAdapter: + enabled: true + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi +``` + +## Service-Specific Examples + +### STAC (High throughput) +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 20 + type: "requestRate" + targets: + requestRate: 40000m +``` + +### Raster (Resource intensive) +```yaml +raster: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 8 + type: "cpu" + behaviour: + scaleDown: + stabilizationWindowSeconds: 300 + targets: + cpu: 75 +``` + +### Vector (Balanced) +```yaml +vector: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 12 + type: "both" + targets: + cpu: 70 + requestRate: 75000m +``` + +## Configuration Examples + +For complete configuration examples, see the [examples directory](../examples/). + +## Resource Requirements + +### Autoscaling Components +- **metrics-server**: ~100m CPU, ~300Mi memory per node +- **prometheus-adapter**: ~250m CPU, ~256Mi memory +- **prometheus-server**: ~500m CPU, ~512Mi memory (varies with retention) + +## Verification + +### Check HPA Status + +```bash +# Check HPA status for all services +kubectl get hpa -n eoapi + +# Get detailed HPA information +kubectl describe hpa eoapi-stac -n eoapi +``` + +### Verify Custom Metrics API + +```bash +# Check if custom metrics API is available +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq . + +# Check specific request rate metrics +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/eoapi/ingresses/*/requests_per_second" | jq . +``` + +### Check Prometheus Adapter + +```bash +# Check prometheus-adapter logs +kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi +``` -2. 
Decide on a release name and `namespace` for your support chart. The next steps assume we've -chosen a release name of `eoapi-support` and a similar namespace of `eoapi-support` +## Load Testing +For load testing your autoscaling setup: -3. Then do a normal `helm install` but you'll want to parameterize and pass overrides for the prometheus URL to include -the release name and namespace chosen above. This allows other third-party dependencies used in the chart -(`prometheus-adpater` and `grafana`) know where to find the prometheus service internally. This is unfortunately a -manual step that cannot be automated +```yaml +ingress: + host: your-test-domain.com +``` +3. Check ingress configuration: ```bash - helm upgrade --install -n eoapi-support \ - --create-namespace eoapi-support eoapi/eoapi-support --version $SUPPORT_VERSION \ - --set prometheus-adapter.prometheus.url='http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local' \ - --set grafana.datasources.datasources\\.yaml.datasources[0].url='http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local' - ``` - - -4. 
verify that everything is set up correctly and no deployments are not failing: - - ```sh - watch -n 1 "kubectl -n eoapi-support get deploy,pod,svc" - NAME READY STATUS RESTARTS AGE - pod/eoapi-support-grafana-7fdc9688dd-wkw7p 1/1 Running 0 79s - pod/eoapi-support-kube-state-metrics-54d75784db-ghgbd 1/1 Running 0 79s - pod/eoapi-support-prometheus-adapter-668b6bd89c-kb25q 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-6f96z 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-fr96x 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-pdvvp 1/1 Running 0 79s - pod/eoapi-support-prometheus-server-76dcfc684b-wmk5c 2/2 Running 0 79s - - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - service/eoapi-support-grafana LoadBalancer 10.123.248.75 104.154.59.180 80:30821/TCP 79s - service/eoapi-support-kube-state-metrics ClusterIP 10.123.241.247 8080/TCP 79s - service/eoapi-support-prometheus-adapter ClusterIP 10.123.249.21 443/TCP 79s - service/eoapi-support-prometheus-node-exporter ClusterIP 10.123.249.90 9100/TCP 79s - service/eoapi-support-prometheus-server ClusterIP 10.123.247.255 80/TCP 79s - ``` - - -5. 
If anything in steps 1 through 3 seems confusing then here is a quick bash script to clear it up: - - ```shell - export RELEASE_NAME=eoapi - export RELEASE_NS=eoapi - export SUPPORT_RELEASE_NAME=eoapi-support - export SUPPORT_RELEASE_NS=eoapi-support - - # Get latest chart versions - export SUPPORT_VERSION=$(helm search repo eoapi/eoapi-support --versions | head -2 | tail -1 | awk '{print $2}') - export EOAPI_VERSION=$(helm search repo eoapi/eoapi --versions | head -2 | tail -1 | awk '{print $2}') - - PROMETHEUS_URL="http://${SUPPORT_RELEASE_NAME}-prometheus-server.${SUPPORT_RELEASE_NS}.svc.cluster.local" - - helm upgrade --install \ - -n $SUPPORT_RELEASE_NS --create-namespace $SUPPORT_RELEASE_NAME \ - eoapi/eoapi-support --version $SUPPORT_VERSION \ - --set prometheus-adapter.prometheus.url=$PROMETHEUS_URL \ - --set grafana.datasources.datasources\\.yaml.datasources[0].url=$PROMETHEUS_URL \ - -f /tmp/values-overrides.yaml - - helm upgrade --install \ - -n $RELEASE_NS --create-namespace $RELEASE_NAME \ - eoapi/eoapi --version $EOAPI_VERSION \ - -f /tmp/support-values-overrides.yaml - ``` - - ---- - -### Review [Default Configuration and Options](../installation/configuration.md) - -[This document](../installation/configuration.md) will explain the differences in the `autoscaling` block for each service: - - ```yaml - autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 10 - # `type`: "cpu" || "requestRate" || "both" - type: "requestRate" - behaviour: {} - scaleDown: - stabilizationWindowSeconds: 60 - scaleUp: - stabilizationWindowSeconds: 0 - targets: - # matches `type` value above unless `type: "both"` is selected - cpu: 85 - requestRate: 15000 - ``` - ---- - -### How Autoscaling Works - -If you grok the default `eoapi-support` values in `values.yaml` you'll see we use custom metrics and prometheus queries -based on the nginx ingress controller's request rate under the `prometheus-adpater.prometheus:` key: - - ```yaml - prometheus-adapter: - prometheus: - 
# NOTE: the `url` below make some assumptions about the namespace where you released eoapi and prometheus - # 1) that you didn't change the default name of the `prometheus-server` or the port and installed in eoapi namespace - # 2) namely that you ran `helm install eoapi --create-namespace=eoapi` with the `eoapi` namespace - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - port: 80 - path: "" - rules: - default: false - # NOTE: the `name.as` values below make some assumptions about your release name - # namely that you have run `helm install eoapi eoapi/eoapi --create-namespace=eoapi` - custom: - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_vector_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_raster_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_stac_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - ``` - -Prometheus adapter is a bridge for metrics between Prometheus (which scrapes nginx) and the k8s metrics server so it can autoscale deployments using these custom metrics. 
-If you've chosen `both` or `requestRate` as a autoscaling `type:` for those values then these custom metrics are used to template an `hpa.yaml` for each service - -### Log into Grafana - -When you `helm install` the support chart you by default get a Grafana dashboard set up with different default metrics charts -to help you load test and explore your service autoscaling. Grafana creates a new username `admin` and password for you -that you'll have to retrieve to login. - -> ⓘ Note that the `service/eoapi-support-grafana` has an EXTERNAL-IP that we can use to view it. -This is just a quick way to work with it. You'll want to set it up with an ingress in the future - - -1. To log into Grafana you'll need to export the default username/password it came installed with. Note that secret names are prefixed -with the `release` name we installed the chart with below `-grafana`: - - ```sh - kubectl get secret eoapi-support-grafana --template='{{index .data "admin-user"}}' -n eoapi | base64 -d - # - kubectl get secret eoapi-support-grafana --template='{{index .data "admin-password"}}' -n eoapi | base64 -d - # - ``` - -2. To find the URL for the load balancer for where to log in with Grafana you can query the services: - - ```sh - kubectl get svc -n eoapi-support + kubectl get ingress -n eoapi ``` -3. Login and you should be default be able to see the eoapi-k8s grafana dashboard. The Prometheus datasource will already be configured for you: - - ![Grafana Datasource Configuration](../images/datasource.png) - - You can then view the main eoAPI dashboard: - - ![](../images/gfdashboard.png) +## Troubleshooting - To add additional custom dashboards, you can use the dashboard import functionality: +### HPA Shows "Unknown" Metrics - ![Adding Custom Grafana Dashboards](../images/add-grafana-dashboard.png) +If HPA shows "unknown" for custom metrics: -### Install or Upgrade Autoscaling Changes to `eoapi` Chart - -1. 
If you haven't already decide which services (`vector` || `raster` || `stac`) you want to enable `autoscaling` on change your values yaml for these and redeploy - - ```yaml - stac: - enabled: true - autoscaling: - enabled: true - type: "requestRate" - targets: - requestRate: 50000m - settings: - resources: - limits: - cpu: "1280m" - memory: "1536Mi" - requests: - cpu: "512m" - memory: "1024Mi" - vector: - enabled: true - autoscaling: - enabled: true - type: "requestRate" - targets: - requestRate: 50000m - settings: - resources: - limits: - cpu: "768m" - memory: "1536Mi" - requests: - cpu: "256m" - memory: "1024Mi" +1. Verify prometheus-adapter is running: + ```bash + kubectl get pods -l app.kubernetes.io/name=prometheus-adapter -n eoapi ``` -2. Review what the heck the unit `m` means for your [autoscaling values in the k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#quantities) - - -3. Then `helm install` the eoapi chart with these changes - - ```sh - helm upgrade --install -n eoapi... +2. Check prometheus-adapter logs: + ```bash + kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi ``` ---- - -### Add Load Balancer Host as a Host to Your Ingress - -Unfortunately, nginx will not expose metrics for ingresses without hosts or hosts with wildcards. You'll either need to deploy -`eoapi-k8s` chart again with `ingress.tls.enabled` or need to find the `EXTERNAL-IP` for your `ingress-nginx-controller` and use that -to set up a simple host - -1. Find the IP that your `ingress-nginx-controller` service load balancer: - - ```sh - kubectl -n ingress-nginx get svc/ingress-nginx-controller -o=jsonpath='{.status.loadBalancer.ingress[0].hostname}' - http://abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com/ +3. 
Verify metrics are available in Prometheus: + ```bash + # Port forward to access Prometheus + kubectl port-forward service/eoapi-prometheus-server 9090:80 -n eoapi + # Then check metrics at http://localhost:9090 ``` -2. Then live edit your shared ingress for eoapi services to add the host: +### Review [Default Configuration and Options](../installation/configuration.md) - ```sh - kubectl edit ingress nginx-service-ingress-shared-eoapi -n eoapi +Default autoscaling configuration: + +```yaml +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + # Type can be "cpu", "requestRate", or "both" + type: "cpu" + # Custom scaling behavior (optional) + behaviour: {} + # Scaling targets + targets: + # CPU target percentage (when type is "cpu" or "both") + cpu: 80 + # Request rate target in millirequests per second (when type is "requestRate" or "both") + requestRate: 30000m +``` + +### No Scaling Activity + +If pods aren't scaling: + +1. Check HPA events: + ```bash + kubectl describe hpa eoapi-stac -n eoapi ``` - ```yaml - # BEFORE - spec: - ingressClassName: nginx - rules: - - http: - paths: - ... +2. Verify metrics are being collected: + ```bash + kubectl top pods -n eoapi ``` - ```yaml - # AFTER - spec: - ingressClassName: nginx - rules: - - host: abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com - http: - paths: - ... +3. Check resource requests are set: + ```bash + kubectl describe pod eoapi-stac-xxx -n eoapi | grep -A 10 "Requests" ``` -And then finally roll out the deployment. 
- - ```sh - kubectl rollout restart deploy/ingress-nginx-controller -n ingress-nginx - - ``` +### Install or Upgrade Autoscaling Changes to `eoapi` Chart ---- +When enabling autoscaling, ensure monitoring is also enabled: + +```yaml +# Enable monitoring first +monitoring: + prometheus: + enabled: true + prometheusAdapter: + enabled: true + +# Then enable autoscaling +stac: + autoscaling: + enabled: true + type: "requestRate" + targets: + requestRate: 50000m + +# Configure resources for proper scaling metrics +stac: + settings: + resources: + limits: + cpu: 1000m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi +``` + +### Custom Metrics Not Working + +If request rate metrics aren't working: + +1. Verify nginx ingress controller has metrics enabled +2. Check prometheus is scraping ingress metrics +3. Confirm prometheus-adapter configuration +4. Validate ingress annotations for metrics + +### Scaling Too Aggressive/Slow + +Adjust scaling behavior: + +```yaml +autoscaling: + behaviour: + scaleUp: + stabilizationWindowSeconds: 60 # Faster scaling up + policies: + - type: Percent + value: 100 + periodSeconds: 60 + scaleDown: + stabilizationWindowSeconds: 300 # Slower scaling down + policies: + - type: Percent + value: 25 # More conservative scale down + periodSeconds: 300 +``` + +## Best Practices + +1. **Set appropriate resource requests**: HPA needs resource requests to calculate CPU utilization +2. **Use stabilization windows**: Prevent thrashing with appropriate cooldown periods +3. **Monitor costs**: Autoscaling can increase costs rapidly +4. **Test thoroughly**: Validate scaling behavior under realistic load +5. **Set reasonable limits**: Use `maxReplicas` to prevent runaway scaling +6. 
**Use multiple metrics**: Combine CPU and request rate for better scaling decisions + +Example ingress configuration for load testing: + +```yaml +# For AWS ALB +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: eoapi-ingress + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing +spec: + ingressClassName: nginx + rules: + - host: your-domain.com + http: + paths: [...] + +# For nginx ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: eoapi-ingress +spec: + ingressClassName: nginx + rules: + - host: abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com + http: + paths: [...] +``` ## Load Testing #### Load Testing with `hey` -Everything mentioned below assumes you've already gone through the autoscaling setup above and -that you're deploying using `ingress.className: "nginx"`. +The `hey` tool is a simple HTTP load testing tool. ### Install and Run Load Tests -1. Install `hey` utility locally: - +1. Install hey: ```bash # macOS brew install hey # Linux - wget https://github.com/rakyll/hey/releases/latest/download/hey_linux_amd64 - chmod +x hey_linux_amd64 && sudo mv hey_linux_amd64 /usr/local/bin/hey + go install github.com/rakyll/hey@latest - # Or use Docker - alias hey='docker run --rm rcmorano/hey' + # Or download from releases + wget https://hey-release.s3.us-east-2.amazonaws.com/hey_linux_amd64 + chmod +x hey_linux_amd64 + sudo mv hey_linux_amd64 /usr/local/bin/hey ``` -2. Find the external IP of your shared nginx ingress: - +2. 
Run basic load test: ```bash - # For GKE clusters - export INGRESS_ENDPOINT=$(kubectl -n ingress-nginx get ingress/nginx-service-ingress-shared-eoapi -o=jsonpath='{.spec.rules[0].host}') - # Example output: eoapi-35.234.254.12.nip.io + # Test STAC endpoint + hey -z 5m -c 10 https://your-domain.com/stac/collections - # For EKS clusters - export INGRESS_ENDPOINT=$(kubectl -n ingress-nginx get svc/ingress-nginx-controller -o=jsonpath='{.status.loadBalancer.ingress[0].hostname}') - # Example output: k8s-eoapi-ingressn-404721dbb4-e6dec70321c3eddd.elb.us-west-2.amazonaws.com + # Test with higher concurrency + hey -z 10m -c 50 https://your-domain.com/stac/search ``` -3. Run load tests against different endpoints in separate terminals: - +3. Monitor during load test: ```bash - # Test Vector API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/vector/collections/public.my_data/items?f=geojson" - - # Test STAC API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/stac/" + # Watch HPA scaling + watch kubectl get hpa -n eoapi - # Test Raster API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/raster/collections" + # Monitor pods + watch kubectl get pods -n eoapi ``` - **Load testing parameters:** - - `-n`: Total number of requests (2M for sustained testing) - - `-q`: Rate limit (150 requests/second per worker) - - `-c`: Number of concurrent workers (20) - -4. 
**Monitor autoscaling in Grafana** - Go back to your Grafana dashboard and watch your services autoscale for the endpoints you're hitting: - - ![Grafana Autoscaling Dashboard](../images/grafanaautoscale.png) - ### Load Testing Best Practices -- **Start small**: Begin with lower request rates and gradually increase -- **Monitor resources**: Watch CPU, memory, and request rate metrics -- **Test realistic scenarios**: Use actual data access patterns when possible -- **Verify autoscaling**: Ensure HPA triggers and pods scale up/down appropriately -- **Database bottlenecks**: Monitor PostgreSQL performance under load -- **Clean up**: Stop load tests gracefully to avoid overwhelming services +1. **Start small**: Begin with low concurrency and short duration +2. **Monitor resources**: Watch CPU, memory, and network usage +3. **Test realistic scenarios**: Use actual API endpoints and payloads +4. **Gradual increase**: Slowly increase load to find breaking points +5. **Test different endpoints**: Each service may have different characteristics ### Troubleshooting Load Tests -If autoscaling isn't triggering: -- Verify HPA is configured: `kubectl get hpa -n eoapi` -- Check custom metrics: `kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq .` -- Ensure prometheus-adapter is running: `kubectl get pods -n eoapi-support` -- Validate ingress metrics: Check Grafana for nginx request rates +- **High response times**: May indicate need for more replicas or resources +- **Error rates**: Could suggest database bottlenecks or resource limits +- **No scaling**: Check HPA metrics and thresholds ### Advanced Load Testing -For more sophisticated testing consider: -- **[k6](https://k6.io/)** - JavaScript-based load testing with scenarios -- **[Artillery](https://artillery.io/)** - Node.js load testing toolkit -- **[JMeter](https://jmeter.apache.org/)** - GUI-based load testing with complex scenarios +For more comprehensive testing, consider: +- 
**[Artillery](https://artillery.io/)** - Feature-rich load testing toolkit +- **[k6](https://k6.io/)** - Developer-centric performance testing - **[Locust](https://locust.io/)** - Python-based distributed load testing + +For monitoring and observability setup, see [observability.md](observability.md). diff --git a/docs/operations/observability.md b/docs/operations/observability.md new file mode 100644 index 00000000..06f89ea8 --- /dev/null +++ b/docs/operations/observability.md @@ -0,0 +1,334 @@ + # Observability & Monitoring + +This guide covers metrics collection, monitoring, and visualization for eoAPI deployments. All monitoring components are optional and disabled by default. + +## Overview + +eoAPI observability is implemented through conditional dependencies in the main `eoapi` chart: + +### Core Monitoring +Essential metrics collection infrastructure including Prometheus server, metrics-server, kube-state-metrics, node-exporter, and prometheus-adapter. + +### Integrated Observability +Grafana dashboards and visualization tools are available as conditional dependencies within the main chart, eliminating the need for separate deployments. + +## Configuration + +**Prerequisites**: Kubernetes cluster with Helm 3 installed. 
+ +### Quick Deployment + +```bash +# Deploy with monitoring and observability enabled +helm install eoapi eoapi/eoapi \ + --set monitoring.prometheus.enabled=true \ + --set observability.grafana.enabled=true + +# Access Grafana (get password) +kubectl get secret eoapi-grafana -n eoapi \ + -o jsonpath="{.data.admin-password}" | base64 -d +``` + +### Using Configuration Files + +For production deployments, use configuration files instead of command-line flags: + +```bash +# Deploy with integrated monitoring and observability +helm install eoapi eoapi/eoapi -f values-full-observability.yaml +``` + +**For a complete example**: See [examples/values-full-observability.yaml](../examples/values-full-observability.yaml) + +## Architecture & Components + +**Component Responsibilities:** + +- **Prometheus Server**: Central metrics storage and querying engine +- **metrics-server**: Provides resource metrics for `kubectl top` and HPA +- **kube-state-metrics**: Exposes Kubernetes object state as metrics +- **prometheus-node-exporter**: Collects hardware and OS metrics from nodes +- **prometheus-adapter**: Enables custom metrics for Horizontal Pod Autoscaler +- **Grafana**: Dashboards and visualization of collected metrics + +**Data Flow**: Exporters expose metrics → Prometheus scrapes and stores → Grafana/kubectl query via PromQL → Dashboards visualize data + +### Detailed Configuration + +#### Basic Monitoring Setup + +```yaml +# values.yaml - Enable core monitoring in main eoapi chart +monitoring: + metricsServer: + enabled: true + prometheus: + enabled: true + server: + persistentVolume: + enabled: true + size: 50Gi + retention: "30d" + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true +``` + +#### Observability Chart Configuration + +```yaml +# Basic Grafana setup +grafana: + enabled: true + service: + type: LoadBalancer + +# Connect to external Prometheus (if not using eoapi's Prometheus) +prometheusUrl: 
"http://prometheus.monitoring.svc.cluster.local" + +# Production Grafana configuration +grafana: + persistence: + enabled: true + size: 10Gi + resources: + limits: + cpu: 200m + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi +``` + +#### PostgreSQL Monitoring + +Enable PostgreSQL metrics collection: + +```yaml +postgrescluster: + monitoring: true # Enables postgres_exporter sidecar +``` + +## Available Metrics + +### Core Infrastructure Metrics +- **Container resources**: CPU, memory, network usage +- **Kubernetes state**: Pods, services, deployments status +- **Node metrics**: Hardware utilization, filesystem usage +- **PostgreSQL**: Database connections, query performance (when enabled) + +### Custom Application Metrics + +When prometheus-adapter and nginx ingress are both enabled, these custom metrics become available: +- `nginx_ingress_controller_requests_rate_stac_eoapi` +- `nginx_ingress_controller_requests_rate_raster_eoapi` +- `nginx_ingress_controller_requests_rate_vector_eoapi` +- `nginx_ingress_controller_requests_rate_multidim_eoapi` + +**Requirements**: +- nginx ingress controller with prometheus metrics enabled +- Ingress must use specific hostnames (not wildcard patterns) +- prometheus-adapter must be configured to expose these metrics + +## Pre-built Dashboards + +The `eoapi-observability` chart provides ready-to-use dashboards: + +### eoAPI Services Dashboard +- Request rates per service +- Response times and error rates +- Traffic patterns by endpoint + +### Infrastructure Dashboard +- CPU usage rate by pod +- CPU throttling metrics +- Memory usage and limits +- Pod count tracking + +### Container Resources Dashboard +- Resource consumption by container +- Resource quotas and limits +- Performance bottlenecks + +### PostgreSQL Dashboard (when enabled) +- Database connections +- Query performance +- Storage utilization + +#### Production Configuration + +```yaml +monitoring: + prometheus: + server: + # Persistent storage + persistentVolume: + 
enabled: true + size: 100Gi + storageClass: "gp3" + # Retention policy + retention: "30d" + # Resource allocation + resources: + limits: + cpu: "2000m" + memory: "4096Mi" + requests: + cpu: "1000m" + memory: "2048Mi" + # Security - internal access only + service: + type: ClusterIP +``` + +### Resource Requirements + +#### Core Monitoring Components + +Minimum resource requirements (actual usage varies by cluster size and metrics volume): + +| Component | CPU | Memory | Purpose | +|-----------|-----|---------|----------| +| prometheus-server | 500m | 1Gi | Metrics storage | +| metrics-server | 100m | 200Mi | Resource metrics | +| kube-state-metrics | 50m | 150Mi | K8s state | +| prometheus-node-exporter | 50m | 50Mi | Node metrics | +| prometheus-adapter | 100m | 128Mi | Custom metrics API | +| **Total** | **~800m** | **~1.5Gi** | | + +#### Observability Components + +| Component | CPU | Memory | Purpose | +|-----------|-----|---------|----------| +| grafana | 100m | 200Mi | Visualization | + +## Operations + +### Verification Commands + +```bash +# Check Prometheus is running +kubectl get pods -n eoapi -l app.kubernetes.io/name=prometheus + +# Verify metrics-server +kubectl get apiservice v1beta1.metrics.k8s.io + +# List available custom metrics +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq '.resources[].name' + +# Test metrics collection +kubectl port-forward svc/eoapi-prometheus-server 9090:80 -n eoapi +# Visit http://localhost:9090/targets +``` + +### Monitoring Health + +```bash +# Check Prometheus targets +curl -X GET 'http://localhost:9090/api/v1/query?query=up' + +# Verify Grafana datasource connectivity +kubectl exec -it deployment/eoapi-obs-grafana -n eoapi -- \ + wget -O- http://eoapi-prometheus-server/api/v1/label/__name__/values +``` + +## Advanced Features + +### Alerting Setup + +Enable alertmanager for alert management: + +```yaml +prometheus: + enabled: true + alertmanager: + enabled: true + config: + global: + # Configure with your 
SMTP server details + smtp_smarthost: 'your-smtp-server:587' + smtp_from: 'alertmanager@yourdomain.com' + route: + receiver: 'default-receiver' + receivers: + - name: 'default-receiver' + webhook_configs: + - url: 'http://your-webhook-endpoint:5001/' +``` + +**Note**: Replace example values with your actual SMTP server and webhook endpoints. + +### Batch Job Metrics + +Enable pushgateway for batch job metrics: + +```yaml +prometheus: + enabled: true + prometheus-pushgateway: + enabled: true # For batch job metrics collection +``` + +### Custom Dashboards + +Add custom dashboards by creating ConfigMaps with the appropriate label: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: custom-dashboard + namespace: eoapi + labels: + eoapi_dashboard: "1" +data: + custom.json: | + { + "dashboard": { + "id": null, + "title": "Custom eoAPI Dashboard", + "tags": ["eoapi"], + "panels": [] + } + } +``` + +The ConfigMap must be in the same namespace as the Grafana deployment and include the `eoapi_dashboard: "1"` label. + +## Troubleshooting + +### Common Issues + +**Missing Metrics** +1. Check Prometheus service discovery: + ```bash + kubectl port-forward svc/eoapi-prometheus-server 9090:80 -n eoapi + # Visit http://localhost:9090/service-discovery + ``` + +2. Verify target endpoints: + ```bash + kubectl get endpoints -n eoapi + ``` + +**Grafana Connection Issues** +1. Check datasource connectivity in Grafana UI → Configuration → Data Sources +2. 
Verify Prometheus URL accessibility from Grafana pod + +**Resource Issues** +- Monitor current usage: `kubectl top pods -n eoapi` +- Check for OOMKilled containers: `kubectl describe pods -n eoapi | grep -A 5 "Last State"` +- Verify resource limits are appropriate for your workload size +- Consider reducing Prometheus retention or increasing storage size if storage is full + +## Security Considerations + +- **Network Security**: Use `ClusterIP` services for Prometheus in production +- **Access Control**: Configure network policies to restrict metrics access +- **Authentication**: Enable authentication for Grafana (LDAP, OAuth, etc.) +- **Data Privacy**: Consider metrics data sensitivity and retention policies + +## Related Documentation + +- For autoscaling configuration using these metrics: [autoscaling.md](autoscaling.md) From 42d92cab31f956ba493f2d0e158c84a8f8a769d9 Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Fri, 24 Oct 2025 15:25:29 +0200 Subject: [PATCH 3/3] Added tests for autoscaling and observability. 
--- .github/workflows/helm-tests.yml | 119 ++++ .github/workflows/tests/test_autoscaling.py | 654 ++++++++++++++++++ .github/workflows/tests/test_observability.py | 522 ++++++++++++++ scripts/README.md | 53 +- scripts/lib/common.sh | 4 + scripts/lib/observability.sh | 530 ++++++++++++++ scripts/test.sh | 216 +++++- 7 files changed, 2092 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/tests/test_autoscaling.py create mode 100644 .github/workflows/tests/test_observability.py create mode 100644 scripts/lib/observability.sh diff --git a/.github/workflows/helm-tests.yml b/.github/workflows/helm-tests.yml index e18fef38..7c452968 100644 --- a/.github/workflows/helm-tests.yml +++ b/.github/workflows/helm-tests.yml @@ -217,3 +217,122 @@ jobs: if: always() run: | helm uninstall "$RELEASE_NAME" || true + + observability-tests: + name: Observability Tests + if: github.event.pull_request.head.repo.full_name == github.repository + permissions: + contents: 'read' + id-token: 'write' + needs: k3s-integration-tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + + - name: Start K3s cluster + uses: jupyterhub/action-k3s-helm@v4 + with: + k3s-channel: latest + helm-version: ${{ env.HELM_VERSION }} + metrics-enabled: false + docker-enabled: true + + - name: Set release name + run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV" + + - name: Wait for K3s to be fully ready + run: | + echo "=== Waiting for K3s to be fully ready ===" + kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s + kubectl get nodes + kubectl get pods --all-namespaces + sleep 10 + echo "✅ K3s is ready" + + - name: Deploy eoAPI with monitoring + run: | + echo "=== Deploying eoAPI with monitoring stack ===" + export RELEASE_NAME="$RELEASE_NAME" + export PGO_VERSION="${{ env.PGO_VERSION }}" + export 
GITHUB_SHA="${{ github.sha }}" + ./scripts/deploy.sh --ci + + # Enable monitoring components + helm upgrade "$RELEASE_NAME" ./charts/eoapi \ + --set monitoring.prometheus.enabled=true \ + --set monitoring.prometheusAdapter.enabled=true \ + --set monitoring.kube-state-metrics.enabled=true \ + --set monitoring.prometheus-node-exporter.enabled=true \ + --set observability.grafana.enabled=true \ + --set stac.autoscaling.enabled=true \ + --set raster.autoscaling.enabled=true \ + --set vector.autoscaling.enabled=true \ + --namespace eoapi \ + --wait --timeout=10m + + - name: Wait for monitoring stack to be ready + run: | + echo "=== Waiting for monitoring components ===" + + # Wait for Prometheus + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus -n eoapi --timeout=300s || echo "Prometheus not ready" + + # Wait for Grafana + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=300s || echo "Grafana not ready" + + # Wait for prometheus-adapter + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=300s || echo "prometheus-adapter not ready" + + # Wait for HPA to be created + sleep 30 + + echo "=== Final monitoring stack status ===" + kubectl get pods -n eoapi -l 'app.kubernetes.io/component in (server,grafana,prometheus-adapter)' || true + kubectl get hpa -n eoapi || true + + - name: Run observability tests + run: | + echo "=== Running observability test suite ===" + export RELEASE_NAME="$RELEASE_NAME" + export NAMESPACE="eoapi" + + # Install python dependencies for testing + python -m pip install --upgrade pip + pip install pytest requests + + # Run observability tests + python -m pytest .github/workflows/tests/test_observability.py -v --tb=short + + # Run autoscaling tests + python -m pytest .github/workflows/tests/test_autoscaling.py -v --tb=short -m "not slow" + + - name: Debug observability stack on failure + if: failure() + run: | + echo "=== 
Observability Debug Information ===" + + echo "=== Monitoring Pods Status ===" + kubectl get pods -n eoapi -l 'app.kubernetes.io/name in (prometheus,grafana,prometheus-adapter)' -o wide || true + + echo "=== HPA Status ===" + kubectl get hpa -n eoapi -o wide || true + kubectl describe hpa -n eoapi || true + + echo "=== Custom Metrics API ===" + kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" || true + + echo "=== Pod Metrics ===" + kubectl top pods -n eoapi || true + + echo "=== Recent Events ===" + kubectl get events -n eoapi --sort-by='.lastTimestamp' | tail -20 || true + + echo "=== Component Logs ===" + kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi --tail=50 || true + kubectl logs -l app.kubernetes.io/name=grafana -n eoapi --tail=30 || true + + - name: Cleanup observability test + if: always() + run: | + helm uninstall "$RELEASE_NAME" || true diff --git a/.github/workflows/tests/test_autoscaling.py b/.github/workflows/tests/test_autoscaling.py new file mode 100644 index 00000000..593125b0 --- /dev/null +++ b/.github/workflows/tests/test_autoscaling.py @@ -0,0 +1,654 @@ +"""Test autoscaling behavior and HPA functionality.""" + +import json +import os +import subprocess +import threading +import time + +import pytest +import requests + + +def get_namespace(): + """Get the target namespace from environment or default.""" + return os.environ.get("NAMESPACE", "eoapi") + + +def get_release_name(): + """Get the release name from environment or default.""" + return os.environ.get("RELEASE_NAME", "eoapi") + + +def get_base_url(): + """Get the base URL for API endpoints.""" + # Try to detect ingress or use port-forward + namespace = get_namespace() + + # Check if we have an ingress + result = subprocess.run( + ["kubectl", "get", "ingress", "-n", namespace, "-o", "json"], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + ingress_data = json.loads(result.stdout) + if ingress_data["items"]: + ingress = 
ingress_data["items"][0] + rules = ingress.get("spec", {}).get("rules", []) + if rules: + host = rules[0].get("host", "localhost") + # Check if host is accessible + try: + response = requests.get( + f"http://{host}/stac/collections", timeout=5 + ) + if response.status_code == 200: + return f"http://{host}" + except requests.RequestException: + pass + + # Fallback to localhost (assuming port-forward) + return "http://localhost:8080" + + +def kubectl_get(resource, namespace=None, label_selector=None, output="json"): + """Execute kubectl get command with optional parameters.""" + cmd = ["kubectl", "get", resource] + + if namespace: + cmd.extend(["-n", namespace]) + + if label_selector: + cmd.extend(["-l", label_selector]) + + if output: + cmd.extend(["-o", output]) + + result = subprocess.run(cmd, capture_output=True, text=True) + return result + + +def get_pod_metrics(namespace, service_name): + """Get current CPU and memory metrics for service pods.""" + result = subprocess.run( + [ + "kubectl", + "top", + "pods", + "-n", + namespace, + "-l", + f"app=eoapi-{service_name}", + "--no-headers", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return [] + + metrics = [] + for line in result.stdout.strip().split("\n"): + if line.strip(): + parts = line.split() + if len(parts) >= 3: + pod_name = parts[0] + cpu = parts[1] # e.g., "25m" + memory = parts[2] # e.g., "128Mi" + metrics.append({"pod": pod_name, "cpu": cpu, "memory": memory}) + + return metrics + + +def get_hpa_status(namespace, hpa_name): + """Get current HPA status and metrics.""" + result = kubectl_get("hpa", namespace=namespace, output="json") + if result.returncode != 0: + return None + + hpas = json.loads(result.stdout) + for hpa in hpas["items"]: + if hpa["metadata"]["name"] == hpa_name: + return hpa + + return None + + +def get_pod_count(namespace, service_name): + """Get current number of running pods for a service.""" + result = kubectl_get( + "pods", namespace=namespace, 
label_selector=f"app=eoapi-{service_name}" + ) + + if result.returncode != 0: + return 0 + + pods = json.loads(result.stdout) + running_pods = [ + pod for pod in pods["items"] if pod["status"]["phase"] == "Running" + ] + + return len(running_pods) + + +def make_request(url, timeout=10): + """Make a single HTTP request and return success status.""" + try: + response = requests.get(url, timeout=timeout) + return response.status_code == 200 + except requests.RequestException: + return False + + +def generate_load( + base_url, endpoints, duration=60, concurrent_requests=5, delay=0.1 +): + """Generate HTTP load against specified endpoints.""" + end_time = time.time() + duration + success_count = 0 + error_count = 0 + + def worker(): + nonlocal success_count, error_count + while time.time() < end_time: + for endpoint in endpoints: + url = f"{base_url}{endpoint}" + if make_request(url): + success_count += 1 + else: + error_count += 1 + time.sleep(delay) + + # Start concurrent workers + threads = [] + for _ in range(concurrent_requests): + thread = threading.Thread(target=worker) + thread.start() + threads.append(thread) + + # Wait for all threads to complete + for thread in threads: + thread.join() + + return { + "total_requests": success_count + error_count, + "successful_requests": success_count, + "failed_requests": error_count, + "success_rate": success_count / (success_count + error_count) + if (success_count + error_count) > 0 + else 0, + } + + +class TestHPAConfiguration: + """Test HPA resource configuration and basic functionality.""" + + def test_hpa_resources_properly_configured(self): + """Verify HPA resources have correct configuration.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found - autoscaling not enabled") + + hpas = json.loads(result.stdout) + assert len(hpas["items"]) > 0, "No HPA resources configured" + + for hpa in hpas["items"]: + spec = 
hpa["spec"] + hpa_name = hpa["metadata"]["name"] + + # Check required fields + assert "scaleTargetRef" in spec, ( + f"HPA {hpa_name} missing scaleTargetRef" + ) + assert "minReplicas" in spec, f"HPA {hpa_name} missing minReplicas" + assert "maxReplicas" in spec, f"HPA {hpa_name} missing maxReplicas" + assert "metrics" in spec, ( + f"HPA {hpa_name} missing metrics configuration" + ) + + # Validate replica bounds + min_replicas = spec["minReplicas"] + max_replicas = spec["maxReplicas"] + assert min_replicas > 0, f"HPA {hpa_name} minReplicas must be > 0" + assert max_replicas > min_replicas, ( + f"HPA {hpa_name} maxReplicas must be > minReplicas" + ) + + # Check metrics configuration + metrics = spec["metrics"] + assert len(metrics) > 0, f"HPA {hpa_name} has no metrics configured" + + # Verify at least one metric is CPU + cpu_metrics = [ + m + for m in metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + assert len(cpu_metrics) > 0, ( + f"HPA {hpa_name} must have CPU metric configured" + ) + + print( + f"✅ HPA {hpa_name}: {min_replicas}-{max_replicas} replicas, {len(metrics)} metrics" + ) + + def test_target_deployments_exist(self): + """Verify HPA target deployments exist and are ready.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + target_ref = hpa["spec"]["scaleTargetRef"] + target_name = target_ref["name"] + hpa_name = hpa["metadata"]["name"] + + # Check target deployment exists + deploy_result = kubectl_get( + "deployment", namespace=namespace, output="json" + ) + assert deploy_result.returncode == 0, "Cannot list deployments" + + deployments = json.loads(deploy_result.stdout) + target_deployment = next( + ( + d + for d in deployments["items"] + if d["metadata"]["name"] == target_name + ), + None, + ) + + assert target_deployment is not None, ( + 
f"HPA {hpa_name} target deployment {target_name} not found" + ) + + # Check deployment has ready replicas + status = target_deployment.get("status", {}) + ready_replicas = status.get("readyReplicas", 0) + assert ready_replicas > 0, ( + f"Target deployment {target_name} has no ready replicas" + ) + + print( + f"✅ HPA {hpa_name} target deployment {target_name} is ready ({ready_replicas} replicas)" + ) + + +class TestCPUScaling: + """Test CPU-based autoscaling functionality.""" + + def test_cpu_metrics_collection(self): + """Verify CPU metrics are being collected for HPA targets.""" + namespace = get_namespace() + services = ["stac", "raster", "vector"] + + metrics_available = [] + + for service in services: + try: + pod_metrics = get_pod_metrics(namespace, service) + if pod_metrics: + metrics_available.append(service) + for metric in pod_metrics: + print( + f"✅ {service} pod {metric['pod']}: CPU={metric['cpu']}, Memory={metric['memory']}" + ) + except Exception as e: + print(f"⚠️ Cannot get metrics for {service}: {e}") + + assert len(metrics_available) > 0, ( + "No CPU metrics available for any service" + ) + + def test_hpa_cpu_utilization_calculation(self): + """Verify HPA calculates CPU utilization correctly.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + + # Check if HPA has current metrics + current_metrics = status.get("currentMetrics", []) + cpu_metrics = [ + m + for m in current_metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + + if cpu_metrics: + cpu_utilization = cpu_metrics[0]["resource"]["current"].get( + "averageUtilization" + ) + if cpu_utilization is not None: + assert 0 <= cpu_utilization <= 1000, ( + f"Invalid CPU utilization: {cpu_utilization}%" + ) + print( + 
f"✅ HPA {hpa_name} CPU utilization: {cpu_utilization}%" + ) + else: + print( + f"⚠️ HPA {hpa_name} CPU metric exists but no utilization value" + ) + else: + # Check conditions for why metrics might not be available + conditions = status.get("conditions", []) + for condition in conditions: + if ( + condition["type"] == "ScalingActive" + and condition["status"] == "False" + ): + print( + f"⚠️ HPA {hpa_name} scaling not active: {condition.get('message', 'Unknown reason')}" + ) + break + else: + print(f"⚠️ HPA {hpa_name} no CPU metrics available yet") + + def test_cpu_resource_requests_alignment(self): + """Verify CPU resource requests are properly set for percentage calculations.""" + namespace = get_namespace() + services = ["stac", "raster", "vector"] + + for service in services: + result = kubectl_get( + "pods", + namespace=namespace, + label_selector=f"app=eoapi-{service}", + ) + + if result.returncode != 0: + continue + + pods = json.loads(result.stdout) + running_pods = [ + p for p in pods["items"] if p["status"]["phase"] == "Running" + ] + + if not running_pods: + continue + + pod = running_pods[0] # Check first running pod + containers = pod["spec"]["containers"] + + main_container = next( + (c for c in containers if c["name"] == service), None + ) + if not main_container: + continue + + resources = main_container.get("resources", {}) + requests = resources.get("requests", {}) + + if "cpu" not in requests: + print( + f"⚠️ Service {service} missing CPU requests - HPA percentage calculation may be inaccurate" + ) + continue + + cpu_request = requests["cpu"] + print(f"✅ Service {service} CPU request: {cpu_request}") + + # Parse CPU request to verify it's reasonable + if cpu_request.endswith("m"): + cpu_millicores = int(cpu_request[:-1]) + assert cpu_millicores > 0, ( + f"Service {service} has zero CPU request" + ) + assert cpu_millicores <= 2000, ( + f"Service {service} has very high CPU request: {cpu_millicores}m" + ) + + +class TestScalingBehavior: + """Test 
actual scaling behavior under load.""" + + @pytest.mark.slow + def test_load_response_scaling(self): + """Generate load and verify scaling response (when possible).""" + namespace = get_namespace() + base_url = get_base_url() + + # Test endpoints that should generate CPU load + load_endpoints = [ + "/stac/collections", + "/stac/search?collections=noaa-emergency-response&limit=50", + "/raster/collections", + "/vector/collections", + ] + + # Check initial state + initial_pod_counts = {} + services = ["stac", "raster", "vector"] + + for service in services: + initial_pod_counts[service] = get_pod_count(namespace, service) + + print(f"Initial pod counts: {initial_pod_counts}") + + # Skip test if we can't connect to services + try: + response = requests.get(f"{base_url}/stac/collections", timeout=5) + if response.status_code != 200: + pytest.skip("Cannot access API endpoints for load testing") + except requests.RequestException: + pytest.skip("API endpoints not accessible for load testing") + + # Generate moderate load for limited time (suitable for CI) + load_duration = 90 # 1.5 minutes + concurrent_requests = 8 + + print( + f"Generating load: {concurrent_requests} concurrent requests for {load_duration}s" + ) + + # Start load generation + load_stats = generate_load( + base_url=base_url, + endpoints=load_endpoints, + duration=load_duration, + concurrent_requests=concurrent_requests, + delay=0.05, # 20 requests/second per worker + ) + + print(f"Load test completed: {load_stats}") + + # Wait a bit for metrics to propagate and scaling to potentially occur + print("Waiting for metrics to propagate and potential scaling...") + time.sleep(30) + + # Check final state + final_pod_counts = {} + for service in services: + final_pod_counts[service] = get_pod_count(namespace, service) + + print(f"Final pod counts: {final_pod_counts}") + + # Check HPA metrics after load + result = kubectl_get("hpa", namespace=namespace) + if result.returncode == 0: + hpas = 
json.loads(result.stdout) + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + current_metrics = status.get("currentMetrics", []) + + cpu_metrics = [ + m + for m in current_metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + + if cpu_metrics: + cpu_utilization = cpu_metrics[0]["resource"]["current"].get( + "averageUtilization" + ) + print(f"Post-load HPA {hpa_name} CPU: {cpu_utilization}%") + + # Verify load test was successful + assert load_stats["success_rate"] > 0.8, ( + f"Load test had low success rate: {load_stats['success_rate']:.2%}" + ) + assert load_stats["total_requests"] > 100, ( + "Load test generated insufficient requests" + ) + + # Note: In CI environments with limited resources, actual scaling may not occur + # The important thing is that the system handled the load successfully + scaling_occurred = any( + final_pod_counts[svc] > initial_pod_counts[svc] + for svc in services + if svc in initial_pod_counts and svc in final_pod_counts + ) + + if scaling_occurred: + print("✅ Scaling occurred during load test") + else: + print( + "⚠️ No scaling occurred - may be due to CI resource constraints or low load thresholds" + ) + + def test_scaling_stabilization_windows(self): + """Verify HPA respects stabilization windows in configuration.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + spec = hpa["spec"] + + # Check if behavior is configured + behavior = spec.get("behavior", {}) + if not behavior: + print(f"⚠️ HPA {hpa_name} has no scaling behavior configured") + continue + + # Check scale up behavior + scale_up = behavior.get("scaleUp", {}) + if scale_up: + stabilization = scale_up.get("stabilizationWindowSeconds", 0) + policies = scale_up.get("policies", []) + 
print( + f"✅ HPA {hpa_name} scale-up: {stabilization}s stabilization, {len(policies)} policies" + ) + + # Check scale down behavior + scale_down = behavior.get("scaleDown", {}) + if scale_down: + stabilization = scale_down.get("stabilizationWindowSeconds", 0) + policies = scale_down.get("policies", []) + print( + f"✅ HPA {hpa_name} scale-down: {stabilization}s stabilization, {len(policies)} policies" + ) + + +class TestRequestRateScaling: + """Test request rate-based autoscaling (when available).""" + + def test_custom_metrics_for_request_rate(self): + """Check if custom metrics for request rate scaling are available.""" + namespace = get_namespace() + + # Check if custom metrics API has request rate metrics + result = subprocess.run( + ["kubectl", "get", "--raw", "/apis/custom.metrics.k8s.io/v1beta1"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip("Custom metrics API not available") + + api_response = json.loads(result.stdout) + resources = api_response.get("resources", []) + + # Look for nginx ingress controller metrics + request_rate_metrics = [ + r + for r in resources + if "nginx_ingress_controller" in r.get("name", "") + and "requests" in r.get("name", "") + ] + + if request_rate_metrics: + print(f"✅ Found {len(request_rate_metrics)} request rate metrics") + for metric in request_rate_metrics: + print(f" - {metric['name']}") + else: + print( + "⚠️ No request rate metrics available - may require ingress controller metrics configuration" + ) + + def test_hpa_request_rate_metrics(self): + """Verify HPA can access request rate metrics (when configured).""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + current_metrics = status.get("currentMetrics", []) + + # Look for custom metrics 
(request rate) + custom_metrics = [ + m + for m in current_metrics + if m.get("type") in ["Pods", "Object"] + and "nginx_ingress_controller" in str(m) + ] + + if custom_metrics: + print(f"✅ HPA {hpa_name} has custom metrics available") + for metric in custom_metrics: + print(f" - {metric}") + else: + # Check if it's configured but not yet available + spec_metrics = hpa["spec"]["metrics"] + configured_custom = [ + m + for m in spec_metrics + if m.get("type") in ["Pods", "Object"] + ] + + if configured_custom: + print( + f"⚠️ HPA {hpa_name} has custom metrics configured but not available yet" + ) + else: + print( + f"ℹ️ HPA {hpa_name} uses only CPU metrics (no request rate scaling)" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/.github/workflows/tests/test_observability.py b/.github/workflows/tests/test_observability.py new file mode 100644 index 00000000..7edca73c --- /dev/null +++ b/.github/workflows/tests/test_observability.py @@ -0,0 +1,522 @@ +"""Test observability stack deployment and functionality.""" + +import json +import os +import subprocess +import time + +import pytest +import requests + + +def get_namespace(): + """Get the target namespace from environment or default.""" + return os.environ.get("NAMESPACE", "eoapi") + + +def get_release_name(): + """Get the release name from environment or default.""" + return os.environ.get("RELEASE_NAME", "eoapi") + + +def kubectl_get(resource, namespace=None, label_selector=None, output="json"): + """Execute kubectl get command with optional parameters.""" + cmd = ["kubectl", "get", resource] + + if namespace: + cmd.extend(["-n", namespace]) + + if label_selector: + cmd.extend(["-l", label_selector]) + + if output: + cmd.extend(["-o", output]) + + result = subprocess.run(cmd, capture_output=True, text=True) + return result + + +def kubectl_port_forward(service, local_port, remote_port, namespace): + """Start kubectl port-forward in background.""" + cmd = [ + "kubectl", + 
"port-forward", + f"svc/{service}", + f"{local_port}:{remote_port}", + "-n", + namespace, + ] + + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + # Give it time to establish connection + time.sleep(3) + return process + + +def wait_for_url(url, timeout=30, interval=2): + """Wait for URL to become available.""" + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(url, timeout=5) + if response.status_code == 200: + return True + except (requests.RequestException, requests.ConnectionError): + pass + time.sleep(interval) + return False + + +class TestMonitoringStackDeployment: + """Test core monitoring components deployment.""" + + def test_prometheus_server_deployment(self): + """Verify Prometheus server is deployed and running.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server", + ) + + if result.returncode != 0: + pytest.skip("Prometheus server not deployed - monitoring disabled") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No Prometheus server pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"Prometheus pod {pod['metadata']['name']} not running" + ) + + # Check readiness + conditions = pod["status"].get("conditions", []) + ready_condition = next( + (c for c in conditions if c["type"] == "Ready"), None + ) + assert ready_condition and ready_condition["status"] == "True", ( + "Prometheus pod not ready" + ) + + def test_grafana_deployment(self): + """Verify Grafana is deployed and running.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + if result.returncode != 0: + pytest.skip("Grafana not deployed - observability disabled") + + pods = 
json.loads(result.stdout) + assert len(pods["items"]) > 0, "No Grafana pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"Grafana pod {pod['metadata']['name']} not running" + ) + + def test_prometheus_adapter_deployment(self): + """Verify prometheus-adapter is deployed and provides custom metrics API.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus-adapter", + ) + + if result.returncode != 0: + pytest.skip("prometheus-adapter not deployed") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No prometheus-adapter pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"prometheus-adapter pod {pod['metadata']['name']} not running" + ) + + def test_kube_state_metrics_deployment(self): + """Verify kube-state-metrics is collecting Kubernetes object metrics.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=kube-state-metrics", + ) + + if result.returncode != 0: + pytest.skip("kube-state-metrics not deployed") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No kube-state-metrics pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"kube-state-metrics pod {pod['metadata']['name']} not running" + ) + + def test_node_exporter_deployment(self): + """Verify node-exporter is collecting node metrics.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus-node-exporter", + ) + + if result.returncode != 0: + pytest.skip("prometheus-node-exporter not deployed") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No prometheus-node-exporter pods found" + + # Check pods are 
running (should be one per node in DaemonSet) + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"node-exporter pod {pod['metadata']['name']} not running" + ) + + +class TestMetricsCollection: + """Test metrics collection functionality.""" + + def test_custom_metrics_api_available(self): + """Verify custom metrics API is available.""" + result = subprocess.run( + ["kubectl", "get", "--raw", "/apis/custom.metrics.k8s.io/v1beta1"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip( + "Custom metrics API not available - prometheus-adapter may not be configured" + ) + + api_response = json.loads(result.stdout) + assert api_response["kind"] == "APIResourceList", ( + "Invalid custom metrics API response" + ) + assert ( + api_response["groupVersion"] == "custom.metrics.k8s.io/v1beta1" + ), "Wrong API version" + + def test_metrics_server_integration(self): + """Verify metrics-server is working for resource metrics.""" + # Test if we can get pod metrics + result = subprocess.run( + ["kubectl", "top", "pods", "-n", get_namespace(), "--no-headers"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip("metrics-server not available or not ready") + + # Should have some metrics output + lines = result.stdout.strip().split("\n") + assert len(lines) > 0, "No pod metrics available" + + # Check format includes CPU and Memory columns + for line in lines: + if line.strip(): # Skip empty lines + parts = line.split() + assert len(parts) >= 3, f"Invalid metrics format: {line}" + + def test_prometheus_targets_reachable(self): + """Test that Prometheus can reach its scrape targets (when accessible).""" + namespace = get_namespace() + + # Check if Prometheus service exists + result = kubectl_get( + "svc", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus", + ) + if result.returncode != 0: + pytest.skip("Prometheus service not found") + + services = 
json.loads(result.stdout) + if not services["items"]: + pytest.skip("No Prometheus services found") + + prometheus_service = None + for svc in services["items"]: + if "server" in svc["metadata"]["name"]: + prometheus_service = svc["metadata"]["name"] + break + + if not prometheus_service: + pytest.skip("Prometheus server service not found") + + # Try to port-forward and check targets (with timeout) + port_forward = None + try: + port_forward = kubectl_port_forward( + prometheus_service, 9090, 80, namespace + ) + + if wait_for_url("http://localhost:9090", timeout=15): + # Try to get targets endpoint + try: + response = requests.get( + "http://localhost:9090/api/v1/targets", timeout=10 + ) + if response.status_code == 200: + targets_data = response.json() + assert targets_data["status"] == "success", ( + "Prometheus targets API error" + ) + + # Check we have some targets + targets = targets_data.get("data", {}).get( + "activeTargets", [] + ) + healthy_targets = [ + t for t in targets if t.get("health") == "up" + ] + + # Should have at least some healthy targets + assert len(healthy_targets) > 0, ( + "No healthy Prometheus targets found" + ) + print( + f"✅ Found {len(healthy_targets)}/{len(targets)} healthy targets" + ) + + else: + pytest.skip( + f"Cannot access Prometheus API: {response.status_code}" + ) + + except requests.RequestException: + pytest.skip( + "Cannot connect to Prometheus API via port-forward" + ) + else: + pytest.skip("Cannot establish port-forward to Prometheus") + + finally: + if port_forward: + port_forward.terminate() + port_forward.wait(timeout=5) + + +class TestAutoscalingIntegration: + """Test HPA and autoscaling functionality.""" + + def test_hpa_resources_exist(self): + """Verify HPA resources are configured.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found - autoscaling not enabled") + + hpas = json.loads(result.stdout) + assert 
len(hpas["items"]) > 0, "No HPA resources configured" + + # Check common HPA resources + hpa_names = [hpa["metadata"]["name"] for hpa in hpas["items"]] + expected_hpas = [ + "eoapi-stac-hpa", + "eoapi-raster-hpa", + "eoapi-vector-hpa", + ] + + found_hpas = [ + name + for name in expected_hpas + if any(name in hpa_name for hpa_name in hpa_names) + ] + assert len(found_hpas) > 0, ( + f"No expected HPA resources found. Available: {hpa_names}" + ) + + print(f"✅ Found HPA resources: {found_hpas}") + + def test_hpa_metrics_available(self): + """Verify HPA can read required metrics.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + + # Check HPA status has current metrics + status = hpa.get("status", {}) + current_metrics = status.get("currentMetrics", []) + + # Should have at least CPU metrics + cpu_metrics = [ + m + for m in current_metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + + if not cpu_metrics: + # Check if it's still initializing + conditions = status.get("conditions", []) + scaling_active = next( + (c for c in conditions if c["type"] == "ScalingActive"), + None, + ) + + if scaling_active and scaling_active["status"] == "False": + print( + f"⚠️ HPA {hpa_name} not yet active: {scaling_active.get('message', 'Unknown')}" + ) + else: + print( + f"✅ HPA {hpa_name} is configured but may still be initializing" + ) + else: + cpu_value = cpu_metrics[0]["resource"]["current"][ + "averageUtilization" + ] + print(f"✅ HPA {hpa_name} CPU metric: {cpu_value}%") + + def test_service_resource_requests_configured(self): + """Verify services have resource requests (required for HPA CPU metrics).""" + namespace = get_namespace() + services = ["stac", "raster", "vector"] + + for service in services: + result = kubectl_get( + 
"pods", + namespace=namespace, + label_selector=f"app=eoapi-{service}", + ) + + if result.returncode != 0: + continue + + pods = json.loads(result.stdout) + running_pods = [ + p for p in pods["items"] if p["status"]["phase"] == "Running" + ] + + if not running_pods: + continue + + # Check first running pod for resource requests + pod = running_pods[0] + containers = pod["spec"]["containers"] + + for container in containers: + if container["name"] == service: # Main service container + resources = container.get("resources", {}) + requests = resources.get("requests", {}) + + assert "cpu" in requests, ( + f"Service {service} missing CPU resource requests (required for HPA)" + ) + assert "memory" in requests, ( + f"Service {service} missing memory resource requests" + ) + + print( + f"✅ Service {service} has resource requests: CPU={requests['cpu']}, Memory={requests['memory']}" + ) + break + + +class TestGrafanaDashboards: + """Test Grafana dashboard functionality (when accessible).""" + + def test_grafana_service_accessibility(self): + """Test if Grafana service is accessible.""" + namespace = get_namespace() + result = kubectl_get( + "svc", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + if result.returncode != 0: + pytest.skip("Grafana service not found") + + services = json.loads(result.stdout) + if not services["items"]: + pytest.skip("No Grafana services found") + + grafana_service = services["items"][0]["metadata"]["name"] + + # Try port-forward to test accessibility + port_forward = None + try: + port_forward = kubectl_port_forward( + grafana_service, 3000, 80, namespace + ) + + if wait_for_url("http://localhost:3000", timeout=15): + # Try to access login page + response = requests.get( + "http://localhost:3000/login", timeout=10 + ) + assert response.status_code == 200, ( + "Cannot access Grafana login page" + ) + assert "Grafana" in response.text, "Invalid Grafana response" + print("✅ Grafana service is accessible") + else: + 
pytest.skip("Cannot establish connection to Grafana") + + except requests.RequestException as e: + pytest.skip(f"Cannot access Grafana: {e}") + finally: + if port_forward: + port_forward.terminate() + port_forward.wait(timeout=5) + + def test_grafana_admin_secret_exists(self): + """Verify Grafana admin password secret exists.""" + namespace = get_namespace() + release_name = get_release_name() + + result = kubectl_get("secret", namespace=namespace, output="json") + if result.returncode != 0: + pytest.skip("Cannot list secrets") + + secrets = json.loads(result.stdout) + grafana_secrets = [ + s + for s in secrets["items"] + if "grafana" in s["metadata"]["name"].lower() + ] + + if not grafana_secrets: + pytest.skip("No Grafana secrets found") + + # Check for admin password key + found_password = False + for secret in grafana_secrets: + if "admin-password" in secret.get("data", {}): + found_password = True + print( + f"✅ Found Grafana admin password in secret: {secret['metadata']['name']}" + ) + break + + assert found_password, "Grafana admin password secret not found" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/scripts/README.md b/scripts/README.md index acd72a48..6f36947e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -8,8 +8,9 @@ Automation scripts for deploying, testing, and managing eoAPI on Kubernetes. 
|--------|---------|-------| | **`deploy.sh`** | Deploy eoAPI to Kubernetes | `./deploy.sh [deploy\|setup\|cleanup] [--ci]` | | **`ingest.sh`** | Ingest STAC data into deployed eoAPI | `./ingest.sh [collections.json] [items.json]` | -| **`test.sh`** | Run Helm and integration tests | `./test.sh [helm\|integration\|all] [--debug]` | -| **`lib/`** | Shared utility functions | See [lib/README.md](lib/README.md) | +| **`test.sh`** | Run Helm, integration, and observability tests | `./test.sh [helm\|integration\|observability\|all] [--debug]` | +| **`lib/common.sh`** | Core utility functions and logging | Shared functions for all scripts | +| **`lib/observability.sh`** | Monitoring and autoscaling utilities | Functions for testing observability stack | ## Quick Start @@ -50,6 +51,35 @@ export RASTER_ENDPOINT=http://... # Override Raster API endpoint export VECTOR_ENDPOINT=http://... # Override Vector API endpoint ``` +## Observability Testing + +The test suite includes comprehensive observability validation: + +**Monitoring Stack Tests:** +- Prometheus server deployment and metrics collection +- Grafana dashboard accessibility and data source connectivity +- Custom metrics API availability via prometheus-adapter +- HPA (Horizontal Pod Autoscaler) functionality with CPU metrics +- kube-state-metrics and node-exporter deployment + +**Autoscaling Tests:** +- HPA configuration validation for STAC, Raster, and Vector services +- CPU-based scaling threshold verification +- Request-rate scaling metrics (when ingress metrics available) +- Scaling behavior and stabilization window testing + +**Run observability tests:** +```bash +# Run only observability tests +./scripts/test.sh observability + +# Run with enhanced monitoring output +./scripts/test.sh observability --debug + +# Run all tests including observability +./scripts/test.sh all +``` + ## Common Examples **Deploy with custom namespace:** @@ -67,6 +97,25 @@ NAMESPACE=my-eoapi ./scripts/deploy.sh ./scripts/test.sh all 
--debug ``` +**Run only observability tests:** +```bash +./scripts/test.sh observability --debug +``` + +**Test monitoring stack health:** +```bash +# Source observability functions +source ./scripts/lib/observability.sh + +# Check individual components +check_prometheus_health +check_grafana_health +check_hpa_status + +# Get comprehensive status +get_monitoring_stack_status +``` + **Cleanup deployment:** ```bash ./scripts/deploy.sh cleanup diff --git a/scripts/lib/common.sh b/scripts/lib/common.sh index dcaf18d6..d5015db5 100755 --- a/scripts/lib/common.sh +++ b/scripts/lib/common.sh @@ -225,6 +225,10 @@ preflight_test() { validate_tools kubectl python3 || return 1 validate_cluster || return 1 ;; + observability) + validate_tools kubectl python3 || return 1 + validate_cluster || return 1 + ;; *) log_error "Unknown test type: $test_type" return 1 diff --git a/scripts/lib/observability.sh b/scripts/lib/observability.sh new file mode 100644 index 00000000..2fc1c7d9 --- /dev/null +++ b/scripts/lib/observability.sh @@ -0,0 +1,530 @@ +#!/bin/bash +# Observability utility functions for eoAPI Kubernetes deployments +# Provides monitoring, metrics, and autoscaling validation capabilities + +# Colors for output formatting +readonly OBS_RED='\033[0;31m' +readonly OBS_GREEN='\033[0;32m' +readonly OBS_YELLOW='\033[1;33m' +readonly OBS_BLUE='\033[0;34m' +readonly OBS_NC='\033[0m' # No Color + +# Logging functions +obs_log_info() { + printf "${OBS_BLUE}[OBS-INFO]${OBS_NC} %s\n" "$1" +} + +obs_log_success() { + printf "${OBS_GREEN}[OBS-SUCCESS]${OBS_NC} %s\n" "$1" +} + +obs_log_warning() { + printf "${OBS_YELLOW}[OBS-WARNING]${OBS_NC} %s\n" "$1" +} + +obs_log_error() { + printf "${OBS_RED}[OBS-ERROR]${OBS_NC} %s\n" "$1" +} + +# Check if a command exists +obs_command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Get namespace with fallback +get_obs_namespace() { + echo "${NAMESPACE:-eoapi}" +} + +# Get release name with fallback +get_obs_release_name() { + echo 
"${RELEASE_NAME:-eoapi}" +} + +# Check if monitoring components are deployed +check_monitoring_deployment() { + local namespace + namespace=$(get_obs_namespace) + local component="$1" + local label_selector="$2" + + if [ -z "$component" ] || [ -z "$label_selector" ]; then + obs_log_error "check_monitoring_deployment requires component name and label selector" + return 1 + fi + + local pod_count + pod_count=$(kubectl get pods -n "$namespace" -l "$label_selector" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + + if [ "$pod_count" -gt 0 ]; then + obs_log_success "$component is running ($pod_count pods)" + return 0 + else + # Check if pods exist but not running + local total_pods + total_pods=$(kubectl get pods -n "$namespace" -l "$label_selector" --no-headers 2>/dev/null | wc -l) + if [ "$total_pods" -gt 0 ]; then + obs_log_warning "$component pods exist but not running ($total_pods pods)" + return 1 + else + obs_log_info "$component not deployed" + return 2 + fi + fi +} + +# Check Prometheus deployment and health +check_prometheus_health() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Checking Prometheus health..." + + # Check deployment + if ! check_monitoring_deployment "Prometheus" "app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server"; then + return $? + fi + + # Check service exists + if ! kubectl get svc -n "$namespace" -l "app.kubernetes.io/name=prometheus" >/dev/null 2>&1; then + obs_log_warning "Prometheus service not found" + return 1 + fi + + obs_log_success "Prometheus is healthy" + return 0 +} + +# Check Grafana deployment and health +check_grafana_health() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Checking Grafana health..." + + # Check deployment + if ! check_monitoring_deployment "Grafana" "app.kubernetes.io/name=grafana"; then + return $? + fi + + # Check service exists + if ! 
kubectl get svc -n "$namespace" -l "app.kubernetes.io/name=grafana" >/dev/null 2>&1; then + obs_log_warning "Grafana service not found" + return 1 + fi + + # Check for admin secret + if ! kubectl get secret -n "$namespace" -o name | grep -q grafana; then + obs_log_warning "Grafana admin secret not found" + fi + + obs_log_success "Grafana is healthy" + return 0 +} + +# Check prometheus-adapter health +check_prometheus_adapter_health() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Checking prometheus-adapter health..." + + # Check deployment + if ! check_monitoring_deployment "prometheus-adapter" "app.kubernetes.io/name=prometheus-adapter"; then + return $? + fi + + # Check custom metrics API + if kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" >/dev/null 2>&1; then + obs_log_success "Custom metrics API is available" + else + obs_log_warning "Custom metrics API not accessible" + return 1 + fi + + obs_log_success "prometheus-adapter is healthy" + return 0 +} + +# Check HPA resources and status +check_hpa_status() { + local namespace + namespace=$(get_obs_namespace) + local service_name="$1" # optional: check specific service + + obs_log_info "Checking HPA status..." + + local hpa_selector="" + if [ -n "$service_name" ]; then + hpa_selector="-l app.kubernetes.io/component=$service_name-hpa" + fi + + if ! 
kubectl get hpa -n "$namespace" "$hpa_selector" >/dev/null 2>&1; then + obs_log_info "No HPA resources found" + return 2 + fi + + local hpa_count + hpa_count=$(kubectl get hpa -n "$namespace" "$hpa_selector" --no-headers 2>/dev/null | wc -l) + obs_log_info "Found $hpa_count HPA resource(s)" + + # Check HPA status details + local unhealthy_hpas="" + local unhealthy_count=0 + while IFS= read -r line; do + if [ -n "$line" ]; then + local hpa_name + local targets + hpa_name=$(echo "$line" | awk '{print $1}') + targets=$(echo "$line" | awk '{print $4}') + + if echo "$targets" | grep -q ""; then + unhealthy_hpas="$unhealthy_hpas $hpa_name" + unhealthy_count=$((unhealthy_count + 1)) + obs_log_warning "HPA $hpa_name has unknown metrics" + else + obs_log_success "HPA $hpa_name is reporting metrics: $targets" + fi + fi + done << EOF +$(kubectl get hpa -n "$namespace" "$hpa_selector" --no-headers 2>/dev/null) +EOF + + if [ $unhealthy_count -eq 0 ]; then + obs_log_success "All HPA resources are healthy" + return 0 + else + obs_log_warning "$unhealthy_count HPA resource(s) have issues:$unhealthy_hpas" + return 1 + fi +} + +# Get pod resource metrics +get_pod_metrics() { + local namespace + namespace=$(get_obs_namespace) + local service_name="$1" + + if [ -z "$service_name" ]; then + obs_log_error "get_pod_metrics requires service name" + return 1 + fi + + obs_log_info "Getting metrics for $service_name..." + + if ! obs_command_exists kubectl; then + obs_log_error "kubectl not found" + return 1 + fi + + if ! kubectl top pods -n "$namespace" -l "app=eoapi-$service_name" --no-headers 2>/dev/null; then + obs_log_warning "Cannot get pod metrics for $service_name (metrics-server may not be ready)" + return 1 + fi + + return 0 +} + +# Validate custom metrics API +validate_custom_metrics_api() { + obs_log_info "Validating custom metrics API..." + + if ! 
kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" >/dev/null 2>&1; then + obs_log_error "Custom metrics API not available" + return 1 + fi + + # Get available metrics + local metrics_json + metrics_json=$(kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" 2>/dev/null) + if [ -n "$metrics_json" ]; then + local metric_count + metric_count=$(echo "$metrics_json" | grep -o '"name"' | wc -l) + obs_log_success "Custom metrics API is available with $metric_count metric types" + else + obs_log_warning "Custom metrics API available but no metrics registered" + fi + + return 0 +} + +# Setup port forwarding for monitoring services +setup_monitoring_port_forward() { + local service="$1" + local local_port="$2" + local remote_port="$3" + local namespace + namespace=$(get_obs_namespace) + + if [ -z "$service" ] || [ -z "$local_port" ] || [ -z "$remote_port" ]; then + obs_log_error "setup_monitoring_port_forward requires service, local_port, and remote_port" + return 1 + fi + + obs_log_info "Setting up port forward for $service ($local_port:$remote_port)..." + + # Check if service exists + if ! kubectl get svc "$service" -n "$namespace" >/dev/null 2>&1; then + obs_log_error "Service $service not found in namespace $namespace" + return 1 + fi + + # Start port forwarding in background + kubectl port-forward "svc/$service" "$local_port:$remote_port" -n "$namespace" >/dev/null 2>&1 & + local pf_pid=$! + + # Give it time to establish + sleep 3 + + # Check if port forward is working + if kill -0 $pf_pid 2>/dev/null; then + obs_log_success "Port forward established (PID: $pf_pid)" + echo $pf_pid # Return PID for cleanup + return 0 + else + obs_log_error "Failed to establish port forward" + return 1 + fi +} + +# Wait for monitoring stack to be ready +wait_for_monitoring_stack() { + local timeout="${1:-300}" # 5 minutes default + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Waiting for monitoring stack to be ready (timeout: ${timeout}s)..." 
+ + local start_time + start_time=$(date +%s) + local components="prometheus:app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server grafana:app.kubernetes.io/name=grafana prometheus-adapter:app.kubernetes.io/name=prometheus-adapter" + + while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do + local all_ready=true + + for component_spec in $components; do + local component_name=${component_spec%%:*} + local selector=${component_spec#*:} + + if ! kubectl wait --for=condition=Ready pod -l "$selector" -n "$namespace" --timeout=10s >/dev/null 2>&1; then + obs_log_info "Waiting for $component_name to be ready..." + all_ready=false + break + fi + done + + if [ "$all_ready" = true ]; then + obs_log_success "Monitoring stack is ready" + return 0 + fi + + sleep 10 + done + + obs_log_error "Timeout waiting for monitoring stack to be ready" + return 1 +} + +# Generate synthetic load for testing autoscaling +generate_synthetic_load() { + local base_url="$1" + local duration="${2:-60}" + local concurrent_requests="${3:-5}" + local delay="${4:-0.1}" + + if [ -z "$base_url" ]; then + obs_log_error "generate_synthetic_load requires base_url" + return 1 + fi + + obs_log_info "Generating synthetic load..." + obs_log_info "URL: $base_url, Duration: ${duration}s, Concurrent: $concurrent_requests, Delay: ${delay}s" + + if ! 
obs_command_exists curl; then + obs_log_error "curl not found" + return 1 + fi + + # Test endpoints for load generation + local endpoints="/stac/collections /stac/search?collections=noaa-emergency-response&limit=50 /raster/collections /vector/collections" + + local success_count=0 + local error_count=0 + local pids="" + + # Worker function for load generation + load_worker() { + local end_time=$(($(date +%s) + duration)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + for endpoint in $endpoints; do + local url="$base_url$endpoint" + if curl -s -f "$url" >/dev/null 2>&1; then + success_count=$((success_count + 1)) + else + error_count=$((error_count + 1)) + fi + sleep "$delay" + done + done + } + + # Start concurrent workers + local i=1 + while [ "$i" -le "$concurrent_requests" ]; do + load_worker $i & + pids="$pids $!" + i=$((i + 1)) + done + + # Wait for all workers to complete + for pid in $pids; do + wait "$pid" + done + + local total_requests=$((success_count + error_count)) + local success_rate=0 + if [ $total_requests -gt 0 ]; then + success_rate=$(( (success_count * 100) / total_requests )) + fi + + obs_log_info "Load test completed: $total_requests requests ($success_count success, $error_count errors, ${success_rate}% success rate)" + + return 0 +} + +# Get comprehensive monitoring stack status +get_monitoring_stack_status() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "=== Monitoring Stack Status ===" + + # Check each component + local components="Prometheus:app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server Grafana:app.kubernetes.io/name=grafana prometheus-adapter:app.kubernetes.io/name=prometheus-adapter kube-state-metrics:app.kubernetes.io/name=kube-state-metrics node-exporter:app.kubernetes.io/name=prometheus-node-exporter" + + local healthy_count=0 + local total_count=5 + + for component_spec in $components; do + local component_name=${component_spec%%:*} + local selector=${component_spec#*:} + + if 
check_monitoring_deployment "$component_name" "$selector"; then + healthy_count=$((healthy_count + 1)) + fi + done + + obs_log_info "Healthy components: $healthy_count/$total_count" + + # Check HPA status + check_hpa_status "$@" + + # Check custom metrics API + validate_custom_metrics_api + + obs_log_info "=== End Monitoring Stack Status ===" + + return 0 +} + +# Cleanup monitoring port forwards +cleanup_monitoring_port_forwards() { + obs_log_info "Cleaning up monitoring port forwards..." + + # Kill any kubectl port-forward processes + pkill -f "kubectl port-forward.*prometheus" 2>/dev/null || true + pkill -f "kubectl port-forward.*grafana" 2>/dev/null || true + + obs_log_info "Port forward cleanup completed" +} + +# Test Prometheus connectivity (with port forward) +test_prometheus_connectivity() { + local namespace + namespace=$(get_obs_namespace) + local timeout="${1:-30}" + + obs_log_info "Testing Prometheus connectivity..." + + # Find Prometheus service + local prom_service + prom_service=$(kubectl get svc -n "$namespace" -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + + if [ -z "$prom_service" ]; then + obs_log_error "Prometheus service not found" + return 1 + fi + + # Setup port forward + local pf_pid + if ! 
pf_pid=$(setup_monitoring_port_forward "$prom_service" 9090 80); then + return 1 + fi + + # Test connectivity + local connected=false + local start_time + start_time=$(date +%s) + + while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do + if curl -s "http://localhost:9090/api/v1/query?query=up" >/dev/null 2>&1; then + connected=true + break + fi + sleep 2 + done + + # Cleanup port forward + kill "$pf_pid" 2>/dev/null || true + + if [ "$connected" = true ]; then + obs_log_success "Prometheus is accessible and responding" + return 0 + else + obs_log_error "Cannot connect to Prometheus API" + return 1 + fi +} + +# Validate observability prerequisites +validate_observability_prerequisites() { + obs_log_info "Validating observability prerequisites..." + + local missing_deps="" + local missing_count=0 + + # Check required tools + local required_tools="kubectl curl python3" + for tool in $required_tools; do + if ! obs_command_exists "$tool"; then + missing_deps="$missing_deps $tool" + missing_count=$((missing_count + 1)) + fi + done + + if [ $missing_count -gt 0 ]; then + obs_log_error "Missing required tools:$missing_deps" + return 1 + fi + + # Check cluster connectivity + if ! kubectl cluster-info >/dev/null 2>&1; then + obs_log_error "Cannot connect to Kubernetes cluster" + return 1 + fi + + # Check namespace exists + local namespace + namespace=$(get_obs_namespace) + if ! 
kubectl get namespace "$namespace" >/dev/null 2>&1; then + obs_log_error "Namespace $namespace does not exist" + return 1 + fi + + obs_log_success "Observability prerequisites validated" + return 0 +} + +# Functions are available when script is sourced +# Note: Function exports removed for compatibility with different shells diff --git a/scripts/test.sh b/scripts/test.sh index 805c4285..76785af1 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -22,7 +22,7 @@ fi # Show help message show_help() { cat << EOF -eoAPI Test Suite - Combined Helm and Integration Testing +eoAPI Test Suite - Combined Helm, Integration, and Observability Testing USAGE: $(basename "$0") [COMMAND] [OPTIONS] @@ -30,7 +30,8 @@ USAGE: COMMANDS: helm Run Helm tests only (lint, unit tests, template validation) integration Run integration tests only (requires deployed eoAPI) - all Run both Helm and integration tests [default] + observability Run observability and autoscaling tests only + all Run Helm, integration, and observability tests [default] check-deps Check and install dependencies only check-deployment Check eoAPI deployment status only @@ -50,12 +51,19 @@ DESCRIPTION: Integration Tests: - Deployment verification - Service readiness checks + + Observability Tests: + - Monitoring stack deployment verification (Prometheus, Grafana, etc.) 
+ - HPA (Horizontal Pod Autoscaler) configuration validation + - Metrics collection and custom metrics API testing + - Autoscaling behavior validation - API endpoint testing - Comprehensive failure debugging REQUIREMENTS: Helm Tests: helm, helm unittest plugin Integration Tests: kubectl, python/pytest, deployed eoAPI instance + Observability Tests: kubectl, python/pytest, deployed eoAPI with monitoring enabled ENVIRONMENT VARIABLES: RELEASE_NAME Override release name detection @@ -69,10 +77,12 @@ EXAMPLES: $(basename "$0") # Run all tests $(basename "$0") helm # Run only Helm tests $(basename "$0") integration # Run only integration tests + $(basename "$0") observability # Run only observability tests $(basename "$0") check-deps # Check dependencies only $(basename "$0") check-deployment # Check deployment status only $(basename "$0") all --debug # Run all tests with debug output $(basename "$0") integration --debug # Run integration tests with enhanced logging + $(basename "$0") observability --debug # Run observability tests with debug output $(basename "$0") --help # Show this help EOF @@ -82,7 +92,7 @@ EOF parse_args() { while [[ $# -gt 0 ]]; do case $1 in - helm|integration|all|check-deps|check-deployment) + helm|integration|observability|all|check-deps|check-deployment) COMMAND="$1"; shift ;; --debug) DEBUG_MODE=true; shift ;; @@ -118,6 +128,11 @@ check_integration_dependencies() { preflight_test "integration" || exit 1 } +# Check dependencies for observability tests +check_observability_dependencies() { + preflight_test "observability" || exit 1 +} + # Install Python test dependencies install_test_deps() { log_info "Installing Python test dependencies..." @@ -749,6 +764,176 @@ EOF fi } +# Run observability tests +run_observability_tests() { + log_info "=== Running Observability Tests ===" + + local python_cmd="python" + if command_exists python3; then + python_cmd="python3" + fi + + local test_dir=".github/workflows/tests" + if [ ! 
-d "$test_dir" ]; then + log_error "Test directory not found: $test_dir" + log_info "Expected observability test files:" + log_info " - $test_dir/test_observability.py" + log_info " - $test_dir/test_autoscaling.py" + return 1 + fi + + # Check if observability test files exist + local obs_tests=("test_observability.py" "test_autoscaling.py") + local available_tests=() + + for test_file in "${obs_tests[@]}"; do + if [ -f "$test_dir/$test_file" ]; then + available_tests+=("$test_dir/$test_file") + else + log_warning "Test file not found: $test_dir/$test_file" + fi + done + + if [ ${#available_tests[@]} -eq 0 ]; then + log_error "No observability test files found in $test_dir" + return 1 + fi + + # Install test dependencies + log_info "Installing Python test dependencies..." + $python_cmd -m pip install --upgrade pip >/dev/null 2>&1 || log_warning "Could not upgrade pip" + $python_cmd -m pip install pytest requests >/dev/null 2>&1 || { + log_error "Failed to install pytest and requests" + return 1 + } + + # Check monitoring stack health first + check_monitoring_stack_health + + # Set environment variables for tests + export NAMESPACE="${NAMESPACE:-eoapi}" + export RELEASE_NAME="${RELEASE_NAME:-eoapi}" + + log_info "Running observability tests..." + log_info "Namespace: $NAMESPACE" + log_info "Release: $RELEASE_NAME" + + local failed_tests=() + + # Run each test file + for test_file in "${available_tests[@]}"; do + local test_name + test_name=$(basename "$test_file" .py) + log_info "Running $test_name..." + + local test_result=0 + if [ "$DEBUG_MODE" = true ]; then + $python_cmd -m pytest "$test_file" -v --tb=short || test_result=$? + else + $python_cmd -m pytest "$test_file" -v --tb=line || test_result=$? 
+ fi + + if [ $test_result -ne 0 ]; then + failed_tests+=("$test_name") + log_error "❌ $test_name failed" + else + log_info "✅ $test_name passed" + fi + done + + # Final results + if [ ${#failed_tests[@]} -eq 0 ]; then + log_info "✅ All observability tests completed successfully!" + else + log_error "Some tests failed: ${failed_tests[*]}" + + if [ "$DEBUG_MODE" = true ]; then + show_observability_debug_info + fi + + return 1 + fi +} + +# Check monitoring stack health +check_monitoring_stack_health() { + log_info "Checking monitoring stack health..." + + local components="prometheus:app.kubernetes.io/name=prometheus grafana:app.kubernetes.io/name=grafana prometheus-adapter:app.kubernetes.io/name=prometheus-adapter kube-state-metrics:app.kubernetes.io/name=kube-state-metrics" + + local healthy_components="" + local healthy_count=0 + local unhealthy_components="" + + for component_spec in $components; do + local component_name=${component_spec%%:*} + local selector=${component_spec#*:} + + if kubectl get pods -n "$NAMESPACE" -l "$selector" >/dev/null 2>&1; then + local running_pods + running_pods=$(kubectl get pods -n "$NAMESPACE" -l "$selector" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + if [ "$running_pods" -gt 0 ]; then + healthy_components="$healthy_components $component_name" + healthy_count=$((healthy_count + 1)) + log_info "✅ $component_name is running ($running_pods pods)" + else + unhealthy_components="$unhealthy_components $component_name" + log_warning "⚠️ $component_name found but not running" + fi + else + log_info "ℹ️ $component_name not deployed (monitoring may be disabled)" + fi + done + + # Check HPA resources + if kubectl get hpa -n "$NAMESPACE" >/dev/null 2>&1; then + local hpa_count + hpa_count=$(kubectl get hpa -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l) + log_info "✅ Found $hpa_count HPA resources" + else + log_info "ℹ️ No HPA resources found (autoscaling may be disabled)" + fi + + # Check custom metrics API + 
if kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" >/dev/null 2>&1; then + log_info "✅ Custom metrics API is available" + else + log_warning "⚠️ Custom metrics API not available" + fi + + if [ $healthy_count -gt 0 ]; then + log_info "Monitoring stack health check completed" + return 0 + else + log_warning "No monitoring components found - some tests may be skipped" + return 0 # Don't fail, just warn + fi +} + +# Show observability debug information +show_observability_debug_info() { + log_info "=== Observability Debug Information ===" + + log_info "=== Monitoring Pods Status ===" + kubectl get pods -n "$NAMESPACE" -l 'app.kubernetes.io/name in (prometheus,grafana,prometheus-adapter,kube-state-metrics)' -o wide 2>/dev/null || true + + log_info "=== HPA Status ===" + kubectl get hpa -n "$NAMESPACE" -o wide 2>/dev/null || true + kubectl describe hpa -n "$NAMESPACE" 2>/dev/null || true + + log_info "=== Custom Metrics API ===" + kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" 2>/dev/null || true + + log_info "=== Pod Resource Metrics ===" + kubectl top pods -n "$NAMESPACE" 2>/dev/null || true + + log_info "=== Monitoring Services ===" + kubectl get services -n "$NAMESPACE" -l 'app.kubernetes.io/name in (prometheus,grafana)' 2>/dev/null || true + + log_info "=== Recent Events ===" + kubectl get events -n "$NAMESPACE" --sort-by='.lastTimestamp' 2>/dev/null | tail -20 || true +} + # Main function main() { parse_args "$@" @@ -799,8 +984,25 @@ main() { run_integration_tests ;; + observability) + log_info "Running observability and autoscaling tests" + + check_observability_dependencies + check_cluster + install_test_deps + detect_deployment + + # Show enhanced debugging in debug mode + if [ "$DEBUG_MODE" = true ]; then + show_debug_info + fi + + check_eoapi_deployment + + run_observability_tests + ;; all) - log_info "Running comprehensive test suite (Helm + Integration tests)" + log_info "Running comprehensive test suite (Helm + Integration + Observability 
tests)" # Run Helm tests first log_info "=== Phase 1: Helm Tests ===" @@ -825,6 +1027,12 @@ main() { setup_test_environment run_integration_tests + + # Run Observability tests third + log_info "=== Phase 3: Observability Tests ===" + check_observability_dependencies + + run_observability_tests ;; *) log_error "Unknown command: $COMMAND"