From 695abb7ef8e263642bbde1a6495b387e76de60df Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Fri, 24 Oct 2025 12:00:49 +0200 Subject: [PATCH 1/3] feat: add Knative integration for notifications - Add CloudEvents sink deployment for eoapi-notifier integration - Configure dynamic secret name for PostgreSQL connection - Add local development configuration with reduced resources - Support both CI and local test environments --- .github/workflows/helm-tests.yml | 210 ++++++++++++++------- .gitignore | 2 + CHANGELOG.md | 2 + charts/eoapi/test-k3s-unittest-values.yaml | 2 +- charts/eoapi/test-local-values.yaml | 109 +++++++++++ scripts/deploy.sh | 87 ++++++++- scripts/test.sh | 6 +- 7 files changed, 340 insertions(+), 78 deletions(-) create mode 100644 charts/eoapi/test-local-values.yaml diff --git a/.github/workflows/helm-tests.yml b/.github/workflows/helm-tests.yml index 2af833fc..e18fef38 100644 --- a/.github/workflows/helm-tests.yml +++ b/.github/workflows/helm-tests.yml @@ -25,8 +25,8 @@ jobs: - name: Run Helm unit tests run: make tests - integration: - name: Integration Tests (K3s) + k3s-integration-tests: + name: K3s Integration Tests if: github.event.pull_request.head.repo.full_name == github.repository permissions: contents: 'read' @@ -47,106 +47,170 @@ jobs: - name: Set release name run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV" - - name: Deploy eoAPI - id: deploy - continue-on-error: true + - name: Wait for K3s to be fully ready run: | - echo "=== Starting eoAPI deployment ===" - export RELEASE_NAME="$RELEASE_NAME" - export PGO_VERSION="${{ env.PGO_VERSION }}" - export GITHUB_SHA="${{ github.sha }}" - ./scripts/deploy.sh --ci + echo "=== Waiting for K3s to be fully ready ===" - - name: Check deployment status - id: check - if: steps.deploy.outcome == 'success' - run: | - echo "=== Checking deployment status ===" - export RELEASE_NAME="$RELEASE_NAME" - ./scripts/test.sh check-deployment --debug + # Wait for core K3s 
components to be ready + echo "Waiting for kube-system pods to be ready..." + kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s - - name: Debug pgstac jobs if deployment failed - if: steps.deploy.outcome == 'failure' - continue-on-error: true - run: | - echo "=== Debugging pgstac job failures ===" - - # Check pgstac-migrate job - echo "===== pgstac-migrate Job Status =====" - kubectl get jobs -l app.kubernetes.io/name=pgstac-migrate -o wide || echo "No pgstac-migrate jobs found" - - MIGRATE_PODS=$(kubectl get pods -l app.kubernetes.io/name=pgstac-migrate -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) - if [ -n "$MIGRATE_PODS" ]; then - for POD in $MIGRATE_PODS; do - echo "--- Logs from migrate pod $POD ---" - kubectl logs "$POD" --tail=100 || true - echo "--- Description of migrate pod $POD ---" - kubectl describe pod "$POD" - done - fi - - # Check pgstac-load-samples job - echo "===== pgstac-load-samples Job Status =====" - kubectl get jobs -l app.kubernetes.io/name=pgstac-load-samples -o wide || echo "No pgstac-load-samples jobs found" - - SAMPLES_PODS=$(kubectl get pods -l app.kubernetes.io/name=pgstac-load-samples -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) - if [ -n "$SAMPLES_PODS" ]; then - for POD in $SAMPLES_PODS; do - echo "--- Logs from samples pod $POD ---" - kubectl logs "$POD" --tail=100 || true - echo "--- Description of samples pod $POD ---" - kubectl describe pod "$POD" - done - fi + # Wait for API server to be fully responsive + echo "Checking API server responsiveness..." 
+ kubectl get nodes + kubectl get pods --all-namespaces - # Check database status - echo "===== Database Pod Status =====" - kubectl get pods -l postgres-operator.crunchydata.com/cluster -o wide - kubectl get postgrescluster -o wide + # Give K3s a moment to initialize all CRDs + echo "Waiting for K3s initialization to complete..." + sleep 10 - # Check ConfigMaps - echo "===== Relevant ConfigMaps =====" - kubectl get configmaps | grep -E "initdb|pgstac" || echo "No relevant configmaps found" + echo "✅ K3s is ready" - # Check for any related events - echo "===== Related Kubernetes Events =====" - kubectl get events | grep -E "pgstac|initdb" || echo "No relevant events found" + - name: Install Knative Serving + run: | + echo "=== Installing Knative Serving ===" + # Install Knative Serving CRDs + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-crds.yaml + # Install Knative Serving core components + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-core.yaml + # Install Kourier networking layer for Knative + kubectl apply -f https://github.com/knative/net-kourier/releases/download/knative-v1.17.0/kourier.yaml + # Configure Knative to use Kourier + kubectl patch configmap/config-network \ + --namespace knative-serving \ + --type merge \ + --patch '{"data":{"ingress-class":"kourier.ingress.networking.knative.dev"}}' + # Wait for Knative Serving to be ready + echo "Waiting for Knative Serving to be ready..." 
+ kubectl wait --for=condition=Ready pod -l app=controller -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=webhook -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=3scale-kourier-gateway -n kourier-system --timeout=300s + + - name: Install Knative Eventing + run: | + echo "=== Installing Knative Eventing ===" + # Install Knative Eventing CRDs (includes SinkBinding) + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-crds.yaml + # Install Knative Eventing core components + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-core.yaml + # Wait for Knative Eventing to be ready + echo "Waiting for Knative Eventing to be ready..." + kubectl wait --for=condition=Ready pod -l app=eventing-controller -n knative-eventing --timeout=300s + kubectl wait --for=condition=Ready pod -l app=eventing-webhook -n knative-eventing --timeout=300s + + - name: Deploy CloudEvents sink for eoapi-notifier + run: | + echo "=== Deploying CloudEvents sink ===" + # Create the namespace first + kubectl create namespace eoapi || true + # Deploy the CloudEvents sink service + kubectl apply -f charts/eoapi/samples/cloudevents-sink.yaml + # Wait for the Knative service to be ready + echo "Waiting for CloudEvents sink to be ready..." + kubectl wait --for=condition=Ready ksvc/eoapi-cloudevents-sink -n eoapi --timeout=300s + + - name: Wait for Traefik to be ready + run: | + echo "=== Waiting for Traefik to be ready ===" + + # Wait for Traefik pods to be ready first + echo "Waiting for Traefik controller to be ready..." + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s + + # Wait for essential Traefik CRDs to be available + echo "Checking for Traefik CRDs..." 
+ timeout=300 + counter=0 + required_crds=("middlewares.traefik.io" "ingressroutes.traefik.io") + + for crd in "${required_crds[@]}"; do + echo "Checking for CRD: $crd" + counter=0 + while [ $counter -lt $timeout ]; do + if kubectl get crd "$crd" &>/dev/null; then + echo "✅ $crd is available" + break + fi + echo "⏳ Waiting for $crd... ($counter/$timeout)" + sleep 3 + counter=$((counter + 3)) + done - # Check notification system status - echo "===== Notification System Status =====" - kubectl get deployments -l app.kubernetes.io/name=eoapi-notifier -o wide || echo "No eoapi-notifier deployment found" - kubectl get ksvc -l app.kubernetes.io/component=cloudevents-sink -o wide || echo "No Knative CloudEvents sink found" + if [ $counter -ge $timeout ]; then + echo "❌ Timeout waiting for $crd" + echo "Available Traefik CRDs:" + kubectl get crd | grep traefik || echo "No Traefik CRDs found" + echo "All CRDs:" + kubectl get crd + exit 1 + fi + done - exit 1 + echo "✅ All required Traefik CRDs are ready" + + - name: Deploy eoAPI + id: deploy + run: | + echo "=== Starting eoAPI deployment ===" + export RELEASE_NAME="$RELEASE_NAME" + export PGO_VERSION="${{ env.PGO_VERSION }}" + export GITHUB_SHA="${{ github.sha }}" + ./scripts/deploy.sh --ci - name: Run integration tests - if: steps.deploy.outcome == 'success' run: | echo "=== Running integration tests ===" export RELEASE_NAME="$RELEASE_NAME" ./scripts/test.sh integration --debug - - name: Debug deployment status - if: always() + - name: Debug failed deployment + if: failure() run: | - echo "=== Final Deployment Status ===" + echo "=== Deployment failed - collecting debug information ===" kubectl get pods -o wide kubectl get jobs -o wide kubectl get services -o wide - kubectl get ingress + kubectl get events --sort-by='.lastTimestamp' | tail -20 || true + + # Check Knative installation status + echo "=== Knative Installation Status ===" + kubectl get pods -n knative-serving -o wide || echo "Knative Serving not installed" + 
kubectl get pods -n knative-eventing -o wide || echo "Knative Eventing not installed" + kubectl get pods -n kourier-system -o wide || echo "Kourier not installed" + # Check Knative CRDs + echo "=== Knative CRDs Status ===" + kubectl get crd | grep knative || echo "No Knative CRDs found" + kubectl get crd sinkbindings.sources.knative.dev || echo "SinkBinding CRD not found" + + # Check Traefik status + echo "=== Traefik Status ===" + kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik -o wide || echo "No Traefik pods found" + kubectl get crd | grep traefik || echo "No Traefik CRDs found" + kubectl get crd middlewares.traefik.io || echo "Middleware CRD not found" + kubectl get crd ingressroutes.traefik.io || echo "IngressRoute CRD not found" # Check notification system final status echo "=== Notification System Final Status ===" kubectl get deployments -l app.kubernetes.io/name=eoapi-notifier -o wide || echo "No eoapi-notifier deployment" kubectl get pods -l app.kubernetes.io/name=eoapi-notifier -o wide || echo "No eoapi-notifier pods" - kubectl get ksvc -l app.kubernetes.io/component=cloudevents-sink -o wide || echo "No Knative CloudEvents sink" - kubectl get pods -l serving.knative.dev/service -o wide || echo "No Knative CloudEvents sink pods" + kubectl get ksvc -n eoapi -o wide || echo "No Knative services in eoapi namespace" + kubectl get ksvc eoapi-cloudevents-sink -n eoapi -o wide || echo "No eoapi-cloudevents-sink Knative service" + kubectl get pods -l serving.knative.dev/service=eoapi-cloudevents-sink -n eoapi -o wide || echo "No CloudEvents sink pods" + # Check SinkBinding resources + echo "=== SinkBinding Resources ===" + kubectl get sinkbindings -A -o wide || echo "No SinkBinding resources found" # Show notification logs if they exist echo "=== eoapi-notifier Logs ===" kubectl logs -l app.kubernetes.io/name=eoapi-notifier --tail=20 || echo "No eoapi-notifier logs" echo "=== Knative CloudEvents Sink Logs ===" - kubectl logs -l 
serving.knative.dev/service --tail=20 || echo "No Knative CloudEvents sink logs" + kubectl logs -l serving.knative.dev/service=eoapi-cloudevents-sink -n eoapi --tail=20 || echo "No CloudEvents sink logs" + # Show Knative system logs if there are issues + echo "=== Knative Serving Controller Logs ===" + kubectl logs -n knative-serving -l app=controller --tail=20 || echo "No Knative Serving controller logs" + echo "=== Knative Eventing Controller Logs ===" + kubectl logs -n knative-eventing -l app=eventing-controller --tail=20 || echo "No Knative Eventing controller logs" - name: Cleanup diff --git a/.gitignore b/.gitignore index 35f7b4e8..469ec3dc 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ charts/config.yaml charts/eoapi/charts/*.tgz config_ingress.yaml __pycache__ + +CLAUDE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a09ab58..cf7a966f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Added knative in CI to test eoapi-notifier. 
+ ## [0.7.12] - 2025-10-17 - Bumped eoapi-notifier dependency version to 0.0.7 diff --git a/charts/eoapi/test-k3s-unittest-values.yaml b/charts/eoapi/test-k3s-unittest-values.yaml index 33a4ec56..a1e83a16 100644 --- a/charts/eoapi/test-k3s-unittest-values.yaml +++ b/charts/eoapi/test-k3s-unittest-values.yaml @@ -62,7 +62,7 @@ eoapi-notifier: channel: pgstac_items_change connection: existingSecret: - name: "eoapi-test-pguser-eoapi" + name: "" keys: username: "user" password: "password" diff --git a/charts/eoapi/test-local-values.yaml b/charts/eoapi/test-local-values.yaml new file mode 100644 index 00000000..59e139dd --- /dev/null +++ b/charts/eoapi/test-local-values.yaml @@ -0,0 +1,109 @@ +# Local test configuration for minikube/local development +# Based on test-k3s-unittest-values.yaml with minimal changes for local environment + +testing: true +ingress: + enabled: true + className: "nginx" # Changed from "traefik" for minikube + pathType: "Prefix" + host: "eoapi.local" + +pgstacBootstrap: + enabled: true + settings: + resources: + requests: + cpu: "256m" + memory: "1024Mi" + limits: + cpu: "512m" + memory: "1024Mi" + +raster: + enabled: true + settings: + resources: + limits: + cpu: "768m" + memory: "2048Mi" # Reduced from 4096Mi for local + requests: + cpu: "256m" + memory: "1024Mi" + +stac: + enabled: true + settings: + resources: + limits: + cpu: "1280m" + memory: "1536Mi" + requests: + cpu: "512m" + memory: "1024Mi" + +vector: + enabled: true + settings: + resources: + limits: + cpu: "768m" + memory: "1536Mi" + requests: + cpu: "256m" + memory: "1024Mi" + envVars: + TIPG_DEBUG: "True" + +eoapi-notifier: + enabled: true + config: + logLevel: DEBUG + sources: + - type: pgstac + config: + channel: pgstac_items_change + connection: + existingSecret: + name: "" # Set dynamically by deploy script + keys: + username: "user" + password: "password" + host: "host" + port: "port" + database: "dbname" + outputs: + - type: cloudevents + config: + source: /eoapi/pgstac + 
event_type: org.eoapi.stac.item + destination: + ref: + apiVersion: serving.knative.dev/v1 + kind: Service + name: eoapi-cloudevents-sink + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + +# Reduce PostgreSQL resources for local development +postgrescluster: + instances: + - name: "postgres" + replicas: 1 + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "1Gi" # Reduced for local + resources: + requests: + cpu: "100m" # Reduced for local + memory: "512Mi" # Reduced for local + limits: + cpu: "500m" # Reduced for local + memory: "1Gi" # Reduced for local diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 0b356109..2eca31dd 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -81,6 +81,55 @@ install_pgo() { kubectl get pods -l postgres-operator.crunchydata.com/control-plane=postgres-operator } +# Setup Knative for local development +setup_knative() { + log_info "Setting up Knative for local development..." + + if kubectl get namespace knative-serving &>/dev/null && kubectl get namespace knative-eventing &>/dev/null; then + log_info "Knative already installed, skipping installation" + return 0 + fi + + log_info "Installing Knative Serving..." + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-crds.yaml + kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.17.0/serving-core.yaml + kubectl apply -f https://github.com/knative/net-kourier/releases/download/knative-v1.17.0/kourier.yaml + # Configure Knative to use Kourier + kubectl patch configmap/config-network \ + --namespace knative-serving \ + --type merge \ + --patch '{"data":{"ingress-class":"kourier.ingress.networking.knative.dev"}}' + + log_info "Installing Knative Eventing..." 
+ # Install Knative Eventing CRDs (includes SinkBinding) + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-crds.yaml + # Install Knative Eventing core components + kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.17.0/eventing-core.yaml + + log_info "Waiting for Knative components to be ready..." + kubectl wait --for=condition=Ready pod -l app=controller -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=webhook -n knative-serving --timeout=300s + kubectl wait --for=condition=Ready pod -l app=3scale-kourier-gateway -n kourier-system --timeout=300s + kubectl wait --for=condition=Ready pod -l app=eventing-controller -n knative-eventing --timeout=300s + kubectl wait --for=condition=Ready pod -l app=eventing-webhook -n knative-eventing --timeout=300s + + log_info "✅ Knative installation complete" +} + +deploy_cloudevents_sink() { + log_info "Deploying CloudEvents sink for notifications..." + + kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f - + + if kubectl apply -f charts/eoapi/samples/cloudevents-sink.yaml; then + log_info "Waiting for CloudEvents sink to be ready..." + kubectl wait --for=condition=Ready ksvc/eoapi-cloudevents-sink -n "$NAMESPACE" --timeout=300s + log_info "✅ CloudEvents sink deployed successfully" + else + log_warn "Failed to deploy CloudEvents sink, continuing without it" + fi +} + # Integrated Helm dependency setup setup_helm_dependencies() { log_info "Setting up Helm dependencies..." @@ -140,10 +189,17 @@ deploy_eoapi() { HELM_CMD="$HELM_CMD -f ./eoapi/values.yaml" fi - # CI-specific configuration + # Environment-specific configuration if [ "$CI_MODE" = true ] && [ -f "./eoapi/test-k3s-unittest-values.yaml" ]; then log_info "Using CI test configuration..." 
HELM_CMD="$HELM_CMD -f ./eoapi/test-k3s-unittest-values.yaml" + # Fix eoapi-notifier secret name dynamically + HELM_CMD="$HELM_CMD --set eoapi-notifier.config.sources[0].config.connection.existingSecret.name=$RELEASE_NAME-pguser-eoapi" + elif [ -f "./eoapi/test-local-values.yaml" ]; then + log_info "Using local test configuration..." + HELM_CMD="$HELM_CMD -f ./eoapi/test-local-values.yaml" + # Fix eoapi-notifier secret name dynamically for local mode too + HELM_CMD="$HELM_CMD --set eoapi-notifier.config.sources[0].config.connection.existingSecret.name=$RELEASE_NAME-pguser-eoapi" fi # Set git SHA if available @@ -160,6 +216,31 @@ deploy_eoapi() { cd .. || exit + # Wait for pgstac jobs to complete first + if kubectl get job -n "$NAMESPACE" -l "app=$RELEASE_NAME-pgstac-migrate" >/dev/null 2>&1; then + log_info "Waiting for pgstac-migrate job to complete..." + if ! kubectl wait --for=condition=complete job -l "app=$RELEASE_NAME-pgstac-migrate" -n "$NAMESPACE" --timeout=600s; then + log_error "pgstac-migrate job failed to complete" + kubectl describe job -l "app=$RELEASE_NAME-pgstac-migrate" -n "$NAMESPACE" + kubectl logs -l "app=$RELEASE_NAME-pgstac-migrate" -n "$NAMESPACE" --tail=50 || true + exit 1 + fi + fi + + if kubectl get job -n "$NAMESPACE" -l "app=$RELEASE_NAME-pgstac-load-samples" >/dev/null 2>&1; then + log_info "Waiting for pgstac-load-samples job to complete..." + if ! kubectl wait --for=condition=complete job -l "app=$RELEASE_NAME-pgstac-load-samples" -n "$NAMESPACE" --timeout=600s; then + log_error "pgstac-load-samples job failed to complete" + kubectl describe job -l "app=$RELEASE_NAME-pgstac-load-samples" -n "$NAMESPACE" + kubectl logs -l "app=$RELEASE_NAME-pgstac-load-samples" -n "$NAMESPACE" --tail=50 || true + exit 1 + fi + fi + + if [ "$CI_MODE" != true ]; then + deploy_cloudevents_sink + fi + # Verify deployment log_info "Verifying deployment..." 
kubectl get pods -n "$NAMESPACE" -o wide @@ -225,6 +306,10 @@ case $COMMAND in ;; deploy) install_pgo + + if [ "$CI_MODE" != true ]; then + setup_knative + fi setup_helm_dependencies deploy_eoapi ;; diff --git a/scripts/test.sh b/scripts/test.sh index 9fea0df8..805c4285 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -127,9 +127,9 @@ install_test_deps() { python_cmd="python3" fi - if ! $python_cmd -m pip install --quiet pytest httpx >/dev/null 2>&1; then - log_error "Failed to install test dependencies (pytest, httpx)" - log_error "Please install manually: pip install pytest httpx" + if ! $python_cmd -m pip install --quiet pytest httpx psycopg2-binary >/dev/null 2>&1; then + log_error "Failed to install test dependencies (pytest, httpx, psycopg2-binary)" + log_error "Please install manually: pip install pytest httpx psycopg2-binary" exit 1 fi From 5829f02e9088a1aabf50e9ce6e1d2a87c5fd1f8d Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Thu, 23 Oct 2025 15:47:32 +0200 Subject: [PATCH 2/3] feat: integrate observability into main eoapi chart - Remove separate eoapi-support chart and integrate all observability features into main chart - Add optional monitoring dependencies: metrics-server, prometheus, prometheus-adapter, grafana - Add monitoring helper templates (_monitoring.yaml, _resources.yaml, observability.yaml) - Add comprehensive observability documentation with deployment examples - Add reusable monitoring configuration base (values/monitoring.yaml) - Add autoscaling test suite for HPA validation - Move Grafana dashboard from eoapi-support to main chart - Update main values.yaml with observability configuration options This consolidation simplifies deployment by eliminating the need for a separate observability chart while maintaining full flexibility for enabling monitoring features. 
--- CHANGELOG.md | 3 +- charts/eoapi-support/.gitignore | 2 - charts/eoapi-support/.helmignore | 30 - charts/eoapi-support/Chart.yaml | 33 - charts/eoapi-support/README.md | 5 - charts/eoapi-support/values.yaml | 178 ----- charts/eoapi/Chart.yaml | 24 + charts/eoapi/README.md | 2 + .../dashboards/eoAPI-Dashboard.json | 0 charts/eoapi/templates/_monitoring.yaml | 80 +++ charts/eoapi/templates/_resources.yaml | 51 ++ .../templates/observability.yaml} | 2 + charts/eoapi/tests/autoscaling_tests.yaml | 241 +++++++ charts/eoapi/values.yaml | 115 ++- charts/eoapi/values/monitoring.yaml | 59 ++ docs/examples/values-autoscaling.yaml | 212 ++++++ docs/examples/values-full-observability.yaml | 303 ++++++++ docs/index.md | 11 +- docs/operations/autoscaling.md | 677 ++++++++++-------- docs/operations/observability.md | 334 +++++++++ 20 files changed, 1789 insertions(+), 573 deletions(-) delete mode 100644 charts/eoapi-support/.gitignore delete mode 100644 charts/eoapi-support/.helmignore delete mode 100644 charts/eoapi-support/Chart.yaml delete mode 100644 charts/eoapi-support/README.md delete mode 100644 charts/eoapi-support/values.yaml rename charts/{eoapi-support => eoapi}/dashboards/eoAPI-Dashboard.json (100%) create mode 100644 charts/eoapi/templates/_monitoring.yaml create mode 100644 charts/eoapi/templates/_resources.yaml rename charts/{eoapi-support/templates/dashboard.config.yaml => eoapi/templates/observability.yaml} (77%) create mode 100644 charts/eoapi/tests/autoscaling_tests.yaml create mode 100644 charts/eoapi/values/monitoring.yaml create mode 100644 docs/examples/values-autoscaling.yaml create mode 100644 docs/examples/values-full-observability.yaml create mode 100644 docs/operations/observability.md diff --git a/CHANGELOG.md b/CHANGELOG.md index cf7a966f..309770a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,7 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Excluded renovate.json from CHANGELOG.md 
edits [#301](https://github.com/developmentseed/eoapi-k8s/pull/301) +- Refactores eoapi-support into core eoapi chart [#262](https://github.com/developmentseed/eoapi-k8s/pull/262) + ## [0.7.8] - 2025-09-10 diff --git a/charts/eoapi-support/.gitignore b/charts/eoapi-support/.gitignore deleted file mode 100644 index 082a7414..00000000 --- a/charts/eoapi-support/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -Chart.lock -/charts diff --git a/charts/eoapi-support/.helmignore b/charts/eoapi-support/.helmignore deleted file mode 100644 index ada987c3..00000000 --- a/charts/eoapi-support/.helmignore +++ /dev/null @@ -1,30 +0,0 @@ -# Non default entries manually added by support developers - -# Ignore the .yaml that generates the .json, only the .json is relevant to -# bundle with the Helm chart when it is packaged or "helm dep up" is used to -# copy it over to another location where it is referenced. -values.schema.yaml - -# ----------------------------------------------------------------------------- - -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj diff --git a/charts/eoapi-support/Chart.yaml b/charts/eoapi-support/Chart.yaml deleted file mode 100644 index 38bcca10..00000000 --- a/charts/eoapi-support/Chart.yaml +++ /dev/null @@ -1,33 +0,0 @@ -apiVersion: v2 -name: eoapi-support - -appVersion: "0.1.7" -version: "0.1.7" - -dependencies: - - name: metrics-server - version: 7.4.12 - repository: https://charts.bitnami.com/bitnami - - # Prometheus for collection of metrics. 
- # https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus - # - - name: prometheus - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.prometheus` values - version: 27.41.2 - repository: https://prometheus-community.github.io/helm-charts - - # used to create custom metrics to autoscale on - # - - name: prometheus-adapter - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.prometheus-adapter` values - version: 5.2.0 - repository: https://prometheus-community.github.io/helm-charts - - # Grafana for dashboarding of metrics - # https://github.com/grafana/helm-charts/tree/main/charts/grafana - # - - name: grafana - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.grafana` values - version: 10.1.2 - repository: https://grafana.github.io/helm-charts diff --git a/charts/eoapi-support/README.md b/charts/eoapi-support/README.md deleted file mode 100644 index b218eb69..00000000 --- a/charts/eoapi-support/README.md +++ /dev/null @@ -1,5 +0,0 @@ -#### eoAPI Support - -observability, monitoring and some custom metrics for autoscaling - -(please see documentation about `helm install` and configuration at ../../docs/autoscaling.md) diff --git a/charts/eoapi-support/values.yaml b/charts/eoapi-support/values.yaml deleted file mode 100644 index febe6af3..00000000 --- a/charts/eoapi-support/values.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# most of this was cribbed from https://github.com/2i2c-org/infrastructure/blob/master/helm-charts/support/ -# so giving props where props are due to Yuvi Panda :sparkles: -prometheus-adapter: - prometheus: - # NOTE: the `url` below makes assumptions about release name and namespace: - # 1) Release name is "eoapi-support" (follows RELEASE_NAME-prometheus-server pattern) - # 2) Deployed in "eoapi" namespace - # 3) If using different release name, update to: http://YOUR_RELEASE_NAME-prometheus-server.YOUR_NAMESPACE.svc.cluster.local 
- url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - port: 80 - path: "" - rules: - default: false - # NOTE: the `name.as` values below make some assumptions about your release name - # namely that you have run `helm install eoapi eoapi/eoapi --create-namespace=eoapi` - custom: - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_vector_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_raster_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_stac_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - -prometheus: - # alertmanager is an optional prometheus chart dependency that we opt-out from - # as we favor Grafana for this functionality. Grafana provides alerts and does - # so with a better UI that we expose publicly behind auth anyhow. - # - alertmanager: - enabled: false - - # prometheus-pushgateway is an optional prometheus chart dependency that we - # opt-out from. pushgateway provides a way to complement prometheus server's - # behavior of scraping metrics from services by allowing services to push - # metrics to prometheus. 
- # - prometheus-pushgateway: - enabled: false - - # kube-state-metrics is deployed by default but listing here just so we know it is - kube-state-metrics: - enabled: true - - # prometheus-node-exporter is an optional prometheus chart dependency that we - # rely on to collect metrics about the nodes - # - # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml - # - prometheus-node-exporter: - # resources for the node-exporter was set after inspecting cpu and memory - # use via prometheus and grafana. - # - # node-exporter is typically found using between 0-3m CPU and 2-22Mi memory, - # but we've seen it fail to report cpu/memory use metrics from time to time - # when requesting and limiting to 5m, so we've increased requests/limit it - # to 10m. - # - # PromQL queries for CPU and memory use: - # - CPU: sum(rate(container_cpu_usage_seconds_total{container="node-exporter", namespace="support"}[5m])) by (pod) - # - Memory: sum(container_memory_usage_bytes{container="node-exporter", namespace="support"}) by (pod) - # - resources: - limits: - cpu: 10m - memory: 30Mi - requests: - cpu: 10m - memory: 30Mi - server: - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: "nlb" - service.beta.kubernetes.io/aws-load-balancer-internal: "false" - type: LoadBalancer - -grafana: - persistence: - enabled: false - deploymentStrategy: - type: Recreate - service: - type: LoadBalancer - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: "nlb" - service.beta.kubernetes.io/aws-load-balancer-internal: "false" - rbac: - namespaced: true - pspEnabled: false - # initChownData refers to an init container enabled by default that isn't - # needed as we don't reconfigure the linux user the grafana server will run - # as. - initChownData: - enabled: false - - # resources for grafana was set after inspecting cpu and memory use via - # prometheus and grafana. 
- # - # Grafana's memory use seems to increase over time but seems reasonable to - # stay below 200Mi for years to come. Grafana's CPU use seems miniscule with - # peaks at up to 9m CPU from one user is browsing its dashboards. - # - # PromQL queries for CPU and memory use: - # - CPU: sum(rate(container_cpu_usage_seconds_total{container="grafana", namespace="support"}[5m])) by (pod) - # - Memory: sum(container_memory_usage_bytes{container="grafana", namespace="support"}) by (pod) - # - resources: - limits: - cpu: 100m - memory: 200Mi - requests: - cpu: 10m - memory: 200Mi - - datasources: - datasources.yaml: - apiVersion: 1 - datasources: - # Automatically add the prometheus server in the same namespace as the grafana as a datasource - - name: prometheus - orgId: 1 - type: prometheus - # NOTE: the `url` below makes assumptions about release name and namespace: - # 1) Release name is "eoapi-support" (follows RELEASE_NAME-prometheus-server pattern) - # 2) Deployed in "eoapi" namespace - # 3) If using different release name, update to: http://YOUR_RELEASE_NAME-prometheus-server.YOUR_NAMESPACE.svc.cluster.local - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - access: proxy - jsonData: - timeInterval: "5s" - isDefault: true - editable: true - version: 1 # This number should be increased when changes are made to update the datasource - - dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/default - - dashboardsConfigMaps: - # NOTE: This must match the ConfigMap name created in templates/dashboard.config.yaml - # The template creates: {{ .Release.Name }}-dashboards - # If release name is "eoapi-support", this should be "eoapi-support-dashboards" - # Update this value to match your actual release name + "-dashboards" - default: "eoapi-support-dashboards" - -metrics-server: - image: - 
registry: docker.io - repository: bitnamilegacy/metrics-server - tag: "0.8.0-debian-12-r4" - apiService: - create: true diff --git a/charts/eoapi/Chart.yaml b/charts/eoapi/Chart.yaml index c94a7b07..8fe33af0 100644 --- a/charts/eoapi/Chart.yaml +++ b/charts/eoapi/Chart.yaml @@ -57,3 +57,27 @@ dependencies: version: 0.0.7 repository: "oci://ghcr.io/developmentseed/charts" condition: eoapi-notifier.enabled + + # Optional monitoring components for metrics collection and autoscaling + # These are disabled by default to keep deployments lightweight + # Enable via: monitoring.prometheus.enabled=true, monitoring.metricsServer.enabled=true + - name: metrics-server + version: 7.2.8 + repository: https://charts.bitnami.com/bitnami + condition: monitoring.metricsServer.enabled + + - name: prometheus + version: 25.3.1 + repository: https://prometheus-community.github.io/helm-charts + condition: monitoring.prometheus.enabled + + - name: prometheus-adapter + version: 4.7.1 + repository: https://prometheus-community.github.io/helm-charts + condition: monitoring.prometheusAdapter.enabled + + # Observability components - Grafana dashboards and visualization + - name: grafana + version: 7.3.3 + repository: https://grafana.github.io/helm-charts + condition: observability.grafana.enabled diff --git a/charts/eoapi/README.md b/charts/eoapi/README.md index a7a95cdf..a76520ed 100644 --- a/charts/eoapi/README.md +++ b/charts/eoapi/README.md @@ -14,6 +14,8 @@ A Helm chart for deploying Earth Observation APIs with integrated STAC, raster, - Flexible database configuration - Real-time PostgreSQL notifications for STAC item changes - Unified ingress system +- Autoscaling +- Integrated observability (Prometheus & Grafana) ## TL;DR diff --git a/charts/eoapi-support/dashboards/eoAPI-Dashboard.json b/charts/eoapi/dashboards/eoAPI-Dashboard.json similarity index 100% rename from charts/eoapi-support/dashboards/eoAPI-Dashboard.json rename to charts/eoapi/dashboards/eoAPI-Dashboard.json diff --git 
a/charts/eoapi/templates/_monitoring.yaml b/charts/eoapi/templates/_monitoring.yaml new file mode 100644 index 00000000..7f3bb1e3 --- /dev/null +++ b/charts/eoapi/templates/_monitoring.yaml @@ -0,0 +1,80 @@ +{{/* +Common monitoring configurations to avoid duplication across values files +*/}} + +{{/* +Basic monitoring stack configuration +*/}} +{{- define "eoapi.monitoring.basic" -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" . | nindent 6 }} + server: + service: + type: ClusterIP +{{- end -}} + +{{/* +Production monitoring with persistence +*/}} +{{- define "eoapi.monitoring.production" -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: true + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" . | nindent 6 }} + server: + service: + type: ClusterIP + persistentVolume: + enabled: true + size: 10Gi +{{- end -}} + +{{/* +Testing monitoring with minimal resources +*/}} +{{- define "eoapi.monitoring.testing" -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" . 
| nindent 6 }} + server: + service: + type: ClusterIP + persistentVolume: + enabled: false +{{- end -}} diff --git a/charts/eoapi/templates/_resources.yaml b/charts/eoapi/templates/_resources.yaml new file mode 100644 index 00000000..2c4ab5b7 --- /dev/null +++ b/charts/eoapi/templates/_resources.yaml @@ -0,0 +1,51 @@ +{{/* +Common resource definitions to avoid duplication across values files +*/}} + +{{/* +Small resource allocation for lightweight components +*/}} +{{- define "eoapi.resources.small" -}} +limits: + cpu: 10m + memory: 30Mi +requests: + cpu: 10m + memory: 30Mi +{{- end -}} + +{{/* +Medium resource allocation for standard services +*/}} +{{- define "eoapi.resources.medium" -}} +limits: + cpu: 100m + memory: 128Mi +requests: + cpu: 50m + memory: 64Mi +{{- end -}} + +{{/* +Large resource allocation for heavy workloads +*/}} +{{- define "eoapi.resources.large" -}} +limits: + cpu: 500m + memory: 512Mi +requests: + cpu: 250m + memory: 256Mi +{{- end -}} + +{{/* +Grafana specific resources based on observed usage patterns +*/}} +{{- define "eoapi.resources.grafana" -}} +limits: + cpu: 100m + memory: 200Mi +requests: + cpu: 50m + memory: 100Mi +{{- end -}} diff --git a/charts/eoapi-support/templates/dashboard.config.yaml b/charts/eoapi/templates/observability.yaml similarity index 77% rename from charts/eoapi-support/templates/dashboard.config.yaml rename to charts/eoapi/templates/observability.yaml index 6c0f2382..fdf132a2 100644 --- a/charts/eoapi-support/templates/dashboard.config.yaml +++ b/charts/eoapi/templates/observability.yaml @@ -1,3 +1,4 @@ +{{- if .Values.observability.grafana.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -7,3 +8,4 @@ metadata: data: kubernetes.json: |- {{ .Files.Get "dashboards/eoAPI-Dashboard.json" | indent 4 }} +{{- end }} diff --git a/charts/eoapi/tests/autoscaling_tests.yaml b/charts/eoapi/tests/autoscaling_tests.yaml new file mode 100644 index 00000000..18cd9452 --- /dev/null +++ 
b/charts/eoapi/tests/autoscaling_tests.yaml @@ -0,0 +1,241 @@ +suite: autoscaling tests +templates: + - templates/services/stac/hpa.yaml + - templates/services/raster/hpa.yaml + - templates/services/vector/hpa.yaml + - templates/services/multidim/hpa.yaml +tests: + - it: "autoscaling disabled by default" + set: + stac.autoscaling.enabled: false + raster.autoscaling.enabled: false + vector.autoscaling.enabled: false + multidim.autoscaling.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: "stac hpa not created when autoscaling disabled" + set: + stac.enabled: true + stac.autoscaling.enabled: false + template: templates/services/stac/hpa.yaml + asserts: + - hasDocuments: + count: 0 + + - it: "stac hpa created with cpu autoscaling" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + stac.autoscaling.targets.cpu: 70 + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: metadata.name + value: "RELEASE-NAME-stac-hpa" + - equal: + path: spec.minReplicas + value: 1 + - isNotEmpty: + path: spec.maxReplicas + - equal: + path: spec.metrics[0].type + value: "Resource" + - equal: + path: spec.metrics[0].resource.name + value: "cpu" + - equal: + path: spec.metrics[0].resource.target.averageUtilization + value: 70 + + - it: "stac hpa created with request rate autoscaling" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "requestRate" + stac.autoscaling.targets.requestRate: "50000m" + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.minReplicas + value: 1 + - isNotEmpty: + path: spec.maxReplicas + - equal: + path: spec.metrics[0].type + value: "Pods" + - equal: + path: spec.metrics[0].pods.metric.name + value: "nginx_ingress_controller_requests" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "50000m" + + - it: "stac hpa created with 
both cpu and request rate autoscaling" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "both" + stac.autoscaling.targets.cpu: 70 + stac.autoscaling.targets.requestRate: "50000m" + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].type + value: "Resource" + - equal: + path: spec.metrics[0].resource.name + value: "cpu" + - equal: + path: spec.metrics[1].type + value: "Pods" + - equal: + path: spec.metrics[1].pods.metric.name + value: "nginx_ingress_controller_requests" + + - it: "raster hpa created with request rate autoscaling" + set: + raster.enabled: true + raster.autoscaling.enabled: true + raster.autoscaling.type: "requestRate" + raster.autoscaling.targets.requestRate: "30000m" + template: templates/services/raster/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].pods.metric.name + value: "nginx_ingress_controller_requests" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "30000m" + + - it: "vector hpa created with request rate autoscaling" + set: + vector.enabled: true + vector.autoscaling.enabled: true + vector.autoscaling.type: "requestRate" + vector.autoscaling.targets.requestRate: "40000m" + template: templates/services/vector/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].pods.metric.name + value: "nginx_ingress_controller_requests" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "40000m" + + - it: "multidim hpa not created when service disabled" + set: + multidim.enabled: false + multidim.autoscaling.enabled: true + template: templates/services/multidim/hpa.yaml + asserts: + - hasDocuments: + count: 0 + + - it: "multidim hpa created when enabled" + set: + multidim.enabled: true + multidim.autoscaling.enabled: true + multidim.autoscaling.type: "cpu" + multidim.autoscaling.targets.cpu: 80 + template: 
templates/services/multidim/hpa.yaml + asserts: + - isKind: + of: HorizontalPodAutoscaler + - equal: + path: spec.metrics[0].resource.target.averageUtilization + value: 80 + + - it: "hpa scaleTargetRef points to correct deployment" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: spec.scaleTargetRef.name + value: "RELEASE-NAME-stac" + - equal: + path: spec.scaleTargetRef.kind + value: "Deployment" + + - it: "hpa custom replica configuration" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + stac.autoscaling.minReplicas: 2 + stac.autoscaling.maxReplicas: 20 + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: spec.minReplicas + value: 2 + - equal: + path: spec.maxReplicas + value: 20 + + - it: "hpa includes proper labels" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: metadata.labels.app + value: "RELEASE-NAME-stac" + + - it: "hpa behavior configuration applied when set" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.type: "cpu" + stac.autoscaling.behavior.scaleUp.stabilizationWindowSeconds: 120 + stac.autoscaling.behavior.scaleDown.stabilizationWindowSeconds: 300 + template: templates/services/stac/hpa.yaml + asserts: + - equal: + path: spec.behavior.scaleUp.stabilizationWindowSeconds + value: 120 + - equal: + path: spec.behavior.scaleDown.stabilizationWindowSeconds + value: 300 + + - it: "stac hpa production configuration with higher minReplicas" + set: + stac.enabled: true + stac.autoscaling.enabled: true + stac.autoscaling.minReplicas: 2 + stac.autoscaling.maxReplicas: 20 + stac.autoscaling.type: "requestRate" + stac.autoscaling.targets.requestRate: "50000m" + template: templates/services/stac/hpa.yaml + asserts: + - isKind: + of: 
HorizontalPodAutoscaler + - equal: + path: spec.minReplicas + value: 2 + - equal: + path: spec.maxReplicas + value: 20 + - equal: + path: spec.metrics[0].type + value: "Pods" + - equal: + path: spec.metrics[0].pods.target.averageValue + value: "50000m" diff --git a/charts/eoapi/values.yaml b/charts/eoapi/values.yaml index dfcfd615..b07e70b4 100644 --- a/charts/eoapi/values.yaml +++ b/charts/eoapi/values.yaml @@ -202,7 +202,7 @@ raster: enabled: true # Control ingress specifically for raster service path: "/raster" # Configurable path prefix for the raster service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -274,7 +274,7 @@ multidim: enabled: true # Control ingress specifically for multidim service path: "/multidim" # Configurable path prefix for the multidim service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -346,7 +346,7 @@ stac: enabled: true # Control ingress specifically for stac service path: "/stac" # Configurable path prefix for the stac service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -406,7 +406,7 @@ vector: enabled: true # Control ingress specifically for vector service path: "/vector" # Configurable path prefix for the vector service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see 
../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -522,9 +522,114 @@ eoapi-notifier: namespace: serverless # For HTTP endpoints, use: endpoint: https://webhook.example.com +###################### +# MONITORING +###################### +# Core monitoring components for metrics collection and autoscaling +monitoring: + # Metrics server - essential for HPA functionality + metricsServer: + enabled: false + apiService: + create: true + + # Prometheus - core metrics collection for autoscaling + prometheus: + enabled: false + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + server: + service: + type: ClusterIP # Internal service, no external exposure by default + + # Prometheus adapter - enables custom HPA metrics + prometheusAdapter: + enabled: false + prometheus: + # URL to Prometheus server - will be auto-configured for same-release Prometheus + # If using external Prometheus, set this to your Prometheus URL + # Example: http://my-prometheus-server.monitoring.svc.cluster.local + url: http://eoapi-prometheus-server.eoapi.svc.cluster.local + port: 80 + path: "" + rules: + default: false + # Custom metrics for eoapi service autoscaling + custom: + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_vector_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_raster_eoapi" + metricsQuery: 
round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_stac_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + +###################### +# OBSERVABILITY +###################### +# Grafana dashboards and visualization (requires monitoring.prometheus.enabled=true) +observability: + grafana: + enabled: false + persistence: + enabled: false + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "false" + resources: + limits: + cpu: 100m + memory: 200Mi + requests: + cpu: 50m + memory: 100Mi + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: "http://{{ .Release.Name }}-prometheus-server" + access: proxy + isDefault: true + dashboardsConfigMaps: + default: "{{ .Release.Name }}-dashboards" + # Version being upgraded from, used for migration purposes # Dont set the value in the values.yaml file -# prefer to set it in the command line +# Instead, set it during upgrade using --set previousVersion= # helm upgrade --set previousVersion=$PREVIOUS_VERSION # or in the CI/CD pipeline previousVersion: "" diff --git a/charts/eoapi/values/monitoring.yaml b/charts/eoapi/values/monitoring.yaml new file mode 100644 index 00000000..519d34bd --- /dev/null +++ b/charts/eoapi/values/monitoring.yaml @@ -0,0 +1,59 @@ +###################### +# MONITORING BASE CONFIG +###################### +# Base monitoring configuration - import in values files with: +# monitoring: !include values/monitoring.yaml + +monitoring: + enabled: true + + # Metrics server for HPA + metricsServer: + 
enabled: true + apiService: + create: true + resources: &small_resources + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + + # Prometheus stack + prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + + kube-state-metrics: + enabled: true + resources: *small_resources + + prometheus-node-exporter: + enabled: true + resources: *small_resources + + server: + service: + type: ClusterIP + persistentVolume: + enabled: false + size: 8Gi + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 200m + memory: 256Mi + +# Autoscaling defaults +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 diff --git a/docs/examples/values-autoscaling.yaml b/docs/examples/values-autoscaling.yaml new file mode 100644 index 00000000..8abdeab4 --- /dev/null +++ b/docs/examples/values-autoscaling.yaml @@ -0,0 +1,212 @@ +# Example values for eoAPI with core monitoring and autoscaling enabled + +gitSha: "latest" + +###################### +# INGRESS +###################### +ingress: + enabled: true + className: "nginx" + # IMPORTANT: Set a proper hostname for metrics collection + # nginx ingress controller requires a specific host (not wildcard) to expose metrics + host: "your-eoapi.example.com" # Replace with your domain + tls: + enabled: true + secretName: eoapi-tls + +###################### +# DATABASE +###################### +# Using default PostgreSQL cluster configuration +postgrescluster: + enabled: true + instances: + - name: eoapi + replicas: 1 + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "50Gi" # Increased for production workloads + cpu: "2048m" # More CPU for database under load + memory: "4096Mi" # More memory for database performance + +###################### +# MONITORING & AUTOSCALING +###################### +# Essential monitoring components for 
autoscaling +monitoring: + metricsServer: + enabled: true + apiService: + create: true + prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + server: + service: + type: ClusterIP + +# Custom metrics for request-rate based autoscaling +prometheusAdapter: + enabled: true + +###################### +# SERVICE CONFIGURATION WITH AUTOSCALING +###################### + +# STAC API Service +stac: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 # Start with 2 replicas for availability + maxReplicas: 20 # Scale up to handle high loads + type: "requestRate" # Scale based on request rate + behavior: + scaleDown: + stabilizationWindowSeconds: 300 # Wait 5 minutes before scaling down + scaleUp: + stabilizationWindowSeconds: 30 # Scale up quickly (30 seconds) + targets: + requestRate: 50000m # Scale when average > 50 requests/second + settings: + resources: + limits: + cpu: "1000m" + memory: "2048Mi" + requests: + cpu: "500m" # Higher baseline for autoscaling + memory: "1024Mi" + +# Raster Service (TiTiler) +raster: + enabled: true + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 15 + type: "requestRate" + behavior: + scaleDown: + stabilizationWindowSeconds: 180 # Scale down slower for raster (3 min) + scaleUp: + stabilizationWindowSeconds: 60 # Scale up moderately fast + targets: + requestRate: 30000m # Scale when average > 30 requests/second (raster is more resource intensive) + settings: + resources: + limits: + cpu: "1536m" # Raster processing needs more CPU + memory: "6144Mi" # Raster processing needs more memory + requests: + cpu: "768m" + memory: "3072Mi" + envVars: + # Optimized GDAL settings for autoscaling + GDAL_CACHEMAX: "512" # Increased cache for better performance + WEB_CONCURRENCY: "8" # More workers for higher
throughput + +# Vector Service (TIPG) +vector: + enabled: true + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 10 + type: "requestRate" + behavior: + scaleDown: + stabilizationWindowSeconds: 240 + scaleUp: + stabilizationWindowSeconds: 45 + targets: + requestRate: 75000m # Vector is typically lighter, can handle more requests + settings: + resources: + limits: + cpu: "1000m" + memory: "2048Mi" + requests: + cpu: "512m" + memory: "1024Mi" + +# Multidimensional Service (optional) +multidim: + enabled: false # Disabled by default + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 8 + type: "requestRate" + targets: + requestRate: 25000m # Conservative scaling for multidim + settings: + resources: + limits: + cpu: "2048m" # Multidim can be very CPU intensive + memory: "8192Mi" # Large memory requirements for multidim data + requests: + cpu: "1024m" + memory: "4096Mi" + +###################### +# STAC BROWSER +###################### +browser: + enabled: true + replicaCount: 2 # Static replicas (browser is just static files) + +###################### +# PGSTAC BOOTSTRAP +###################### +pgstacBootstrap: + enabled: true + settings: + loadSamples: false # Disable sample data for production + resources: + requests: + cpu: "1024m" + memory: "2048Mi" + limits: + cpu: "1024m" + memory: "2048Mi" + +###################### +# ADDITIONAL NOTES +###################### +# +# To use this configuration: +# +# 1. Update the ingress.host to your actual domain +# 2. Adjust scaling targets based on your load testing results +# 3. Monitor resource usage and adjust requests/limits accordingly +# 4. Consider enabling TLS for production deployments +# +# IMPORTANT: This configuration enables monitoring components that are +# disabled by default. This is required for autoscaling to work.
+# +# For observability and dashboards, enable the integrated Grafana (observability.grafana.enabled=true): +# helm upgrade --install eoapi eoapi/eoapi --set observability.grafana.enabled=true +# +# Load testing recommendations: +# - Test each service endpoint individually +# - Monitor HPA metrics: kubectl get hpa -n eoapi -w +# - Check custom metrics: kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" +# - Review Prometheus targets to ensure metrics collection is working diff --git a/docs/examples/values-full-observability.yaml b/docs/examples/values-full-observability.yaml new file mode 100644 index 00000000..993ca9f4 --- /dev/null +++ b/docs/examples/values-full-observability.yaml @@ -0,0 +1,303 @@ +# Example values for eoAPI with full observability stack +# This configuration includes both core monitoring (in main chart) and observability tools +# Deploy the main chart with these values; Grafana is enabled via observability.grafana.enabled + +# Git SHA for deployments (set via CI/CD or command line) +gitSha: "latest" + +###################### +# INGRESS +###################### +ingress: + enabled: true + className: "nginx" + # IMPORTANT: Set a proper hostname for metrics collection + host: "eoapi.example.com" # Replace with your domain + tls: + enabled: true + secretName: eoapi-tls + +###################### +# DATABASE +###################### +postgrescluster: + enabled: true + monitoring: true # Enable PostgreSQL monitoring + instances: + - name: eoapi + replicas: 2 # HA setup for production + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "100Gi" + cpu: "2048m" + memory: "8192Mi" + +###################### +# COMPREHENSIVE MONITORING +###################### +monitoring: + # Essential components + metricsServer: + enabled: true + apiService: + create: true + + # Full Prometheus setup with all collectors + prometheus: + enabled: true + # Keep alertmanager disabled - we'll use Grafana alerting instead + alertmanager: + enabled: false + # Enable pushgateway
for advanced metrics + prometheus-pushgateway: + enabled: true + # Full metrics collection + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + # Production-ready resource allocation + resources: + limits: + cpu: 50m + memory: 64Mi + requests: + cpu: 50m + memory: 64Mi + # Prometheus server configuration + server: + # Expose Prometheus for external access (optional) + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "true" + # Persistent storage for metrics + persistentVolume: + enabled: true + size: 50Gi + storageClass: "gp3" # Adjust for your cloud provider + # Retention and performance settings + retention: "30d" # Keep 30 days of metrics + resources: + limits: + cpu: "2000m" + memory: "4096Mi" + requests: + cpu: "1000m" + memory: "2048Mi" + + # Advanced prometheus-adapter configuration + prometheusAdapter: + enabled: true + # Enhanced resource allocation + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + +###################### +# SERVICES WITH ADVANCED AUTOSCALING +###################### + +stac: + enabled: true + autoscaling: + enabled: true + minReplicas: 3 # Higher minimum for HA + maxReplicas: 30 + type: "both" # Scale on both CPU and request rate + behavior: + scaleDown: + stabilizationWindowSeconds: 600 # 10 minutes + policies: + - type: Percent + value: 50 + periodSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Percent + value: 100 + periodSeconds: 60 + targets: + cpu: 70 + requestRate: 40000m + settings: + resources: + limits: + cpu: "1500m" + memory: "3072Mi" + requests: + cpu: "750m" + memory: "1536Mi" + +raster: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 25 + type: "both" + behavior: + scaleDown: + stabilizationWindowSeconds: 900 # 15 minutes - raster workloads are bursty + scaleUp: +
stabilizationWindowSeconds: 120 # 2 minutes + targets: + cpu: 60 # Lower CPU target due to intensive processing + requestRate: 20000m + settings: + resources: + limits: + cpu: "2048m" + memory: "8192Mi" + requests: + cpu: "1024m" + memory: "4096Mi" + envVars: + GDAL_CACHEMAX: "1024" # 1GB cache + WEB_CONCURRENCY: "4" # Conservative for memory usage + GDAL_HTTP_MAX_RETRY: "3" + GDAL_HTTP_RETRY_DELAY: "1" + +vector: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 15 + type: "both" + targets: + cpu: 75 + requestRate: 60000m + settings: + resources: + limits: + cpu: "1200m" + memory: "2560Mi" + requests: + cpu: "600m" + memory: "1280Mi" + +multidim: + enabled: true # Enable for comprehensive setup + autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 10 + type: "cpu" # CPU-based scaling for multidim workloads + targets: + cpu: 50 # Very conservative due to resource intensity + settings: + resources: + limits: + cpu: "4096m" + memory: "16384Mi" # 16GB for large multidim datasets + requests: + cpu: "2048m" + memory: "8192Mi" + +###################### +# STAC BROWSER +###################### +browser: + enabled: true + replicaCount: 3 # HA setup + +###################### +# PGSTAC BOOTSTRAP +###################### +pgstacBootstrap: + enabled: true + settings: + loadSamples: false # No samples in production + waitConfig: + timeout: 1800 # 30 minutes timeout for large migrations + resources: + requests: + cpu: "1024m" + memory: "2048Mi" + limits: + cpu: "2048m" + memory: "4096Mi" + +###################### +# INTEGRATED OBSERVABILITY +###################### +# Grafana dashboards integrated with main chart (replaces separate eoapi-observability chart) +observability: + grafana: + enabled: true + persistence: + enabled: true + size: 10Gi + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "false" + resources: + limits: + cpu: 
100m + memory: 200Mi + requests: + cpu: 50m + memory: 100Mi + +###################### +# ADDITIONAL PRODUCTION SETTINGS +###################### + +# Service account with monitoring permissions +serviceAccount: + create: true + annotations: + # Add cloud provider annotations if needed + # eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT:role/eoapi-monitoring-role + +###################### +# DEPLOYMENT NOTES +###################### +# +# This configuration provides comprehensive observability including: +# - Core metrics collection and autoscaling (included in main chart) +# - Persistent Prometheus storage with 30-day retention +# - Advanced HPA policies with both CPU and request-rate scaling +# - Production-ready resource allocations +# - High availability setup with multiple replicas +# +# To deploy the full stack: +# +# 1. Deploy main chart with monitoring: +# helm install eoapi eoapi/eoapi -f values-full-observability.yaml --namespace eoapi --create-namespace +# +# 2. Deploy observability chart separately: +# helm install eoapi-obs eoapi/eoapi-observability --namespace eoapi +# +# 3. Optional: Configure external integrations +# - DataDog: Set up prometheus scraping +# - New Relic: Deploy NR Kubernetes integration +# - External Grafana: Point to the exposed Prometheus service +# +# Monitoring endpoints (if LoadBalancer is used): +# - Prometheus: http://:9090 +# - Grafana: http:// (from observability chart) +# +# Security considerations: +# - Use internal LoadBalancers for Prometheus in production +# - Set up proper RBAC for service accounts +# - Configure network policies to restrict access +# - Enable TLS for all external endpoints +# +# Performance tuning: +# - Monitor actual resource usage and adjust requests/limits +# - Tune HPA scaling policies based on traffic patterns +# - Adjust Prometheus retention based on storage costs +# - Consider using remote storage for Prometheus (S3, GCS, etc.) 
diff --git a/docs/index.md b/docs/index.md index 1e5ca6c1..41ea94f6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,11 +27,12 @@ Please refer to our [quick start guide](./installation/quick-start.md) 2. Install the PostgreSQL Operator dependency 3. Configure your deployment using the [Configuration Options](./installation/configuration.md) 4. Deploy using [Helm Installation](./installation/helm-install.md) instructions -5. Set up monitoring with [Autoscaling & Monitoring](./operations/autoscaling.md) +5. Set up monitoring with [Autoscaling](./operations/autoscaling.md) & [Observability](./operations/observability.md) ## Detailed documenation ### Cloud Provider Guides + - **[AWS EKS Setup](./installation/providers/aws-eks.md)** - Complete EKS cluster setup with OIDC, node autoscaling, EBS CSI, and NGINX ingress - **[GCP GKE Setup](./installation/providers/gcp-gke.md)** - GKE cluster creation with CSI driver, NGINX ingress, and cert-manager - **[Azure AKS Setup](./installation/providers/azure.md)** - Azure configuration with managed PostgreSQL, Key Vault integration, and Workload Identity @@ -42,13 +43,11 @@ Please refer to our [quick start guide](./installation/quick-start.md) - **[Manual Helm Installation](./installation/helm-install.md)** - Step-by-step Helm deployment process with custom configurations - **[Unified Ingress Configuration](./installation/unified-ingress.md)** - NGINX and Traefik ingress setup with TLS and cert-manager integration -## Database Management - -- **[Data Management](./operations/manage-data.md)** - Loading STAC collections and items into PostgreSQL using pypgstac - ## Operations & Monitoring -- **[Autoscaling & Monitoring](./operations/autoscaling.md)** - HPA setup with custom metrics, Grafana dashboards, Prometheus configuration, and load testing +- **[Autoscaling](./operations/autoscaling.md)** - Horizontal Pod Autoscaler configuration with CPU and request-rate metrics, scaling policies, and load testing strategies +- 
**[Observability](./operations/observability.md)** - Monitoring stack with Prometheus, Grafana dashboards, metrics collection, and custom metrics API integration +- **[Data Management](./operations/manage-data.md)** - Loading STAC collections and items into PostgreSQL using pypgstac ## Advanced Features diff --git a/docs/operations/autoscaling.md b/docs/operations/autoscaling.md index 2ad25aa2..5c119abd 100644 --- a/docs/operations/autoscaling.md +++ b/docs/operations/autoscaling.md @@ -1,25 +1,40 @@ -# Autoscaling / Monitoring / Observability +# Autoscaling -Autoscaling is both art and science. To test out your application's autoscaling requirements you often need to consider -your data volume, data usage patterns, bottlenecks (such as the database) among many, many other things. Load testing, -metrics, monitoring and observability will help you explore what those needs are. +Horizontal Pod Autoscaler (HPA) configuration for eoAPI services. Autoscaling requires monitoring components to be enabled in the main chart. +## Prerequisites -> ⓘ The `eoapi-support` chart in this repository (see `../charts/eoapi-support`) is required to be installed to -enable any of the eoAPI service autoscaling. It cannot be listed as a dependecy of `eoapi` chart -b/c of the limitations in `prometheus-adapter` and `grafana` for constructing the Prometheus internal -service domains dynamically. +Enable monitoring in your main eoapi installation: -If you are comfortable with k8s you probably only need to `helm install` the support chart and be on your way. Other folks -might want to read through the verbose walkthrough material below to familiarize yourself with how things work. 
+```yaml +monitoring: + prometheus: + enabled: true + prometheusAdapter: + enabled: true # Required for request-rate scaling + metricsServer: + enabled: true # Required for CPU scaling +``` ---- +## Configuration -## Helm Install `eoapi-support` +### Basic Autoscaling The following instructions assume you've gone through the [AWS](../installation/providers/aws-eks.md) or [GCP](../installation/providers/gcp-gke.md) cluster set up and installed the `eoapi` chart. +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + type: "requestRate" # Options: "cpu", "requestRate", "both" + targets: + requestRate: 50000m # 50 requests/second +``` + +### Scaling Policies 1. Go to the [releases section](https://github.com/developmentseed/eoapi-k8s/releases) of this repository and find the latest `eoapi-support-` version to install, or use the following command to get the latest version: @@ -29,361 +44,397 @@ and installed the `eoapi` chart. export SUPPORT_VERSION=$(helm search repo eoapi/eoapi-support --versions | head -2 | tail -1 | awk '{print $2}') ``` +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + type: "both" + behaviour: + scaleDown: + stabilizationWindowSeconds: 300 # 5min cooldown + policies: + - type: Percent + value: 50 # Max 50% pods removed per period + periodSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 60 # 1min cooldown + policies: + - type: Percent + value: 100 # Max 100% pods added per period + periodSeconds: 60 + targets: + cpu: 70 + requestRate: 50000m +``` + +## Metrics Types + +### CPU-based Scaling +```yaml +type: "cpu" +targets: + cpu: 70 +``` + +### Request Rate Scaling +```yaml +type: "requestRate" +targets: + requestRate: 50000m # 50 requests/second +``` + + +### Combined Scaling +```yaml +type: "both" +targets: + cpu: 70 + requestRate: 100000m # 100 requests/second +``` + +## Custom Metrics Configuration + +When using request rate scaling, the prometheus-adapter needs to be 
configured to expose custom metrics. This is handled automatically when you enable monitoring in the main chart: + +```yaml +# In your main eoapi values file +ingress: + host: your-domain.com + +monitoring: + prometheusAdapter: + enabled: true + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi +``` + +## Service-Specific Examples + +### STAC (High throughput) +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 20 + type: "requestRate" + targets: + requestRate: 40000m +``` + +### Raster (Resource intensive) +```yaml +raster: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 8 + type: "cpu" + behaviour: + scaleDown: + stabilizationWindowSeconds: 300 + targets: + cpu: 75 +``` + +### Vector (Balanced) +```yaml +vector: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 12 + type: "both" + targets: + cpu: 70 + requestRate: 75000m +``` + +## Configuration Examples + +For complete configuration examples, see the [examples directory](../examples/). + +## Resource Requirements + +### Autoscaling Components +- **metrics-server**: ~100m CPU, ~300Mi memory per node +- **prometheus-adapter**: ~250m CPU, ~256Mi memory +- **prometheus-server**: ~500m CPU, ~512Mi memory (varies with retention) + +## Verification + +### Check HPA Status + +```bash +# Check HPA status for all services +kubectl get hpa -n eoapi + +# Get detailed HPA information +kubectl describe hpa eoapi-stac -n eoapi +``` + +### Verify Custom Metrics API + +```bash +# Check if custom metrics API is available +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq . + +# Check specific request rate metrics +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/eoapi/ingresses/*/requests_per_second" | jq . +``` + +### Check Prometheus Adapter + +```bash +# Check prometheus-adapter logs +kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi +``` -2. 
Decide on a release name and `namespace` for your support chart. The next steps assume we've -chosen a release name of `eoapi-support` and a similar namespace of `eoapi-support` +## Load Testing +For load testing your autoscaling setup: -3. Then do a normal `helm install` but you'll want to parameterize and pass overrides for the prometheus URL to include -the release name and namespace chosen above. This allows other third-party dependencies used in the chart -(`prometheus-adpater` and `grafana`) know where to find the prometheus service internally. This is unfortunately a -manual step that cannot be automated +```yaml +ingress: + host: your-test-domain.com +``` +3. Check ingress configuration: ```bash - helm upgrade --install -n eoapi-support \ - --create-namespace eoapi-support eoapi/eoapi-support --version $SUPPORT_VERSION \ - --set prometheus-adapter.prometheus.url='http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local' \ - --set grafana.datasources.datasources\\.yaml.datasources[0].url='http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local' - ``` - - -4. 
verify that everything is set up correctly and no deployments are not failing: - - ```sh - watch -n 1 "kubectl -n eoapi-support get deploy,pod,svc" - NAME READY STATUS RESTARTS AGE - pod/eoapi-support-grafana-7fdc9688dd-wkw7p 1/1 Running 0 79s - pod/eoapi-support-kube-state-metrics-54d75784db-ghgbd 1/1 Running 0 79s - pod/eoapi-support-prometheus-adapter-668b6bd89c-kb25q 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-6f96z 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-fr96x 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-pdvvp 1/1 Running 0 79s - pod/eoapi-support-prometheus-server-76dcfc684b-wmk5c 2/2 Running 0 79s - - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - service/eoapi-support-grafana LoadBalancer 10.123.248.75 104.154.59.180 80:30821/TCP 79s - service/eoapi-support-kube-state-metrics ClusterIP 10.123.241.247 8080/TCP 79s - service/eoapi-support-prometheus-adapter ClusterIP 10.123.249.21 443/TCP 79s - service/eoapi-support-prometheus-node-exporter ClusterIP 10.123.249.90 9100/TCP 79s - service/eoapi-support-prometheus-server ClusterIP 10.123.247.255 80/TCP 79s - ``` - - -5. 
If anything in steps 1 through 3 seems confusing then here is a quick bash script to clear it up: - - ```shell - export RELEASE_NAME=eoapi - export RELEASE_NS=eoapi - export SUPPORT_RELEASE_NAME=eoapi-support - export SUPPORT_RELEASE_NS=eoapi-support - - # Get latest chart versions - export SUPPORT_VERSION=$(helm search repo eoapi/eoapi-support --versions | head -2 | tail -1 | awk '{print $2}') - export EOAPI_VERSION=$(helm search repo eoapi/eoapi --versions | head -2 | tail -1 | awk '{print $2}') - - PROMETHEUS_URL="http://${SUPPORT_RELEASE_NAME}-prometheus-server.${SUPPORT_RELEASE_NS}.svc.cluster.local" - - helm upgrade --install \ - -n $SUPPORT_RELEASE_NS --create-namespace $SUPPORT_RELEASE_NAME \ - eoapi/eoapi-support --version $SUPPORT_VERSION \ - --set prometheus-adapter.prometheus.url=$PROMETHEUS_URL \ - --set grafana.datasources.datasources\\.yaml.datasources[0].url=$PROMETHEUS_URL \ - -f /tmp/values-overrides.yaml - - helm upgrade --install \ - -n $RELEASE_NS --create-namespace $RELEASE_NAME \ - eoapi/eoapi --version $EOAPI_VERSION \ - -f /tmp/support-values-overrides.yaml - ``` - - ---- - -### Review [Default Configuration and Options](../installation/configuration.md) - -[This document](../installation/configuration.md) will explain the differences in the `autoscaling` block for each service: - - ```yaml - autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 10 - # `type`: "cpu" || "requestRate" || "both" - type: "requestRate" - behaviour: {} - scaleDown: - stabilizationWindowSeconds: 60 - scaleUp: - stabilizationWindowSeconds: 0 - targets: - # matches `type` value above unless `type: "both"` is selected - cpu: 85 - requestRate: 15000 - ``` - ---- - -### How Autoscaling Works - -If you grok the default `eoapi-support` values in `values.yaml` you'll see we use custom metrics and prometheus queries -based on the nginx ingress controller's request rate under the `prometheus-adpater.prometheus:` key: - - ```yaml - prometheus-adapter: - prometheus: - 
# NOTE: the `url` below make some assumptions about the namespace where you released eoapi and prometheus - # 1) that you didn't change the default name of the `prometheus-server` or the port and installed in eoapi namespace - # 2) namely that you ran `helm install eoapi --create-namespace=eoapi` with the `eoapi` namespace - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - port: 80 - path: "" - rules: - default: false - # NOTE: the `name.as` values below make some assumptions about your release name - # namely that you have run `helm install eoapi eoapi/eoapi --create-namespace=eoapi` - custom: - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_vector_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_raster_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_stac_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - ``` - -Prometheus adapter is a bridge for metrics between Prometheus (which scrapes nginx) and the k8s metrics server so it can autoscale deployments using these custom metrics. 
-If you've chosen `both` or `requestRate` as a autoscaling `type:` for those values then these custom metrics are used to template an `hpa.yaml` for each service - -### Log into Grafana - -When you `helm install` the support chart you by default get a Grafana dashboard set up with different default metrics charts -to help you load test and explore your service autoscaling. Grafana creates a new username `admin` and password for you -that you'll have to retrieve to login. - -> ⓘ Note that the `service/eoapi-support-grafana` has an EXTERNAL-IP that we can use to view it. -This is just a quick way to work with it. You'll want to set it up with an ingress in the future - - -1. To log into Grafana you'll need to export the default username/password it came installed with. Note that secret names are prefixed -with the `release` name we installed the chart with below `-grafana`: - - ```sh - kubectl get secret eoapi-support-grafana --template='{{index .data "admin-user"}}' -n eoapi | base64 -d - # - kubectl get secret eoapi-support-grafana --template='{{index .data "admin-password"}}' -n eoapi | base64 -d - # - ``` - -2. To find the URL for the load balancer for where to log in with Grafana you can query the services: - - ```sh - kubectl get svc -n eoapi-support + kubectl get ingress -n eoapi ``` -3. Login and you should be default be able to see the eoapi-k8s grafana dashboard. The Prometheus datasource will already be configured for you: - - ![Grafana Datasource Configuration](../images/datasource.png) - - You can then view the main eoAPI dashboard: - - ![](../images/gfdashboard.png) +## Troubleshooting - To add additional custom dashboards, you can use the dashboard import functionality: +### HPA Shows "Unknown" Metrics - ![Adding Custom Grafana Dashboards](../images/add-grafana-dashboard.png) +If HPA shows "unknown" for custom metrics: -### Install or Upgrade Autoscaling Changes to `eoapi` Chart - -1. 
If you haven't already decide which services (`vector` || `raster` || `stac`) you want to enable `autoscaling` on change your values yaml for these and redeploy - - ```yaml - stac: - enabled: true - autoscaling: - enabled: true - type: "requestRate" - targets: - requestRate: 50000m - settings: - resources: - limits: - cpu: "1280m" - memory: "1536Mi" - requests: - cpu: "512m" - memory: "1024Mi" - vector: - enabled: true - autoscaling: - enabled: true - type: "requestRate" - targets: - requestRate: 50000m - settings: - resources: - limits: - cpu: "768m" - memory: "1536Mi" - requests: - cpu: "256m" - memory: "1024Mi" +1. Verify prometheus-adapter is running: + ```bash + kubectl get pods -l app.kubernetes.io/name=prometheus-adapter -n eoapi ``` -2. Review what the heck the unit `m` means for your [autoscaling values in the k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#quantities) - - -3. Then `helm install` the eoapi chart with these changes - - ```sh - helm upgrade --install -n eoapi... +2. Check prometheus-adapter logs: + ```bash + kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi ``` ---- - -### Add Load Balancer Host as a Host to Your Ingress - -Unfortunately, nginx will not expose metrics for ingresses without hosts or hosts with wildcards. You'll either need to deploy -`eoapi-k8s` chart again with `ingress.tls.enabled` or need to find the `EXTERNAL-IP` for your `ingress-nginx-controller` and use that -to set up a simple host - -1. Find the IP that your `ingress-nginx-controller` service load balancer: - - ```sh - kubectl -n ingress-nginx get svc/ingress-nginx-controller -o=jsonpath='{.status.loadBalancer.ingress[0].hostname}' - http://abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com/ +3. 
Verify metrics are available in Prometheus: + ```bash + # Port forward to access Prometheus + kubectl port-forward service/eoapi-prometheus-server 9090:80 -n eoapi + # Then check metrics at http://localhost:9090 ``` -2. Then live edit your shared ingress for eoapi services to add the host: +### Review [Default Configuration and Options](../installation/configuration.md) - ```sh - kubectl edit ingress nginx-service-ingress-shared-eoapi -n eoapi +Default autoscaling configuration: + +```yaml +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + # Type can be "cpu", "requestRate", or "both" + type: "cpu" + # Custom scaling behavior (optional) + behaviour: {} + # Scaling targets + targets: + # CPU target percentage (when type is "cpu" or "both") + cpu: 80 + # Request rate target in millirequests per second (when type is "requestRate" or "both") + requestRate: 30000m +``` + +### No Scaling Activity + +If pods aren't scaling: + +1. Check HPA events: + ```bash + kubectl describe hpa eoapi-stac -n eoapi ``` - ```yaml - # BEFORE - spec: - ingressClassName: nginx - rules: - - http: - paths: - ... +2. Verify metrics are being collected: + ```bash + kubectl top pods -n eoapi ``` - ```yaml - # AFTER - spec: - ingressClassName: nginx - rules: - - host: abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com - http: - paths: - ... +3. Check resource requests are set: + ```bash + kubectl describe pod eoapi-stac-xxx -n eoapi | grep -A 10 "Requests" ``` -And then finally roll out the deployment. 
- - ```sh - kubectl rollout restart deploy/ingress-nginx-controller -n ingress-nginx - - ``` +### Install or Upgrade Autoscaling Changes to `eoapi` Chart ---- +When enabling autoscaling, ensure monitoring is also enabled: + +```yaml +# Enable monitoring first +monitoring: + prometheus: + enabled: true + prometheusAdapter: + enabled: true + +# Then enable autoscaling +stac: + autoscaling: + enabled: true + type: "requestRate" + targets: + requestRate: 50000m + +# Configure resources for proper scaling metrics +stac: + settings: + resources: + limits: + cpu: 1000m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi +``` + +### Custom Metrics Not Working + +If request rate metrics aren't working: + +1. Verify nginx ingress controller has metrics enabled +2. Check prometheus is scraping ingress metrics +3. Confirm prometheus-adapter configuration +4. Validate ingress annotations for metrics + +### Scaling Too Aggressive/Slow + +Adjust scaling behavior: + +```yaml +autoscaling: + behaviour: + scaleUp: + stabilizationWindowSeconds: 60 # Faster scaling up + policies: + - type: Percent + value: 100 + periodSeconds: 60 + scaleDown: + stabilizationWindowSeconds: 300 # Slower scaling down + policies: + - type: Percent + value: 25 # More conservative scale down + periodSeconds: 300 +``` + +## Best Practices + +1. **Set appropriate resource requests**: HPA needs resource requests to calculate CPU utilization +2. **Use stabilization windows**: Prevent thrashing with appropriate cooldown periods +3. **Monitor costs**: Autoscaling can increase costs rapidly +4. **Test thoroughly**: Validate scaling behavior under realistic load +5. **Set reasonable limits**: Use `maxReplicas` to prevent runaway scaling +6. 
**Use multiple metrics**: Combine CPU and request rate for better scaling decisions + +Example ingress configuration for load testing: + +```yaml +# For AWS ALB +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: eoapi-ingress + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing +spec: + ingressClassName: nginx + rules: + - host: your-domain.com + http: + paths: [...] + +# For nginx ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: eoapi-ingress +spec: + ingressClassName: nginx + rules: + - host: abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com + http: + paths: [...] +``` ## Load Testing #### Load Testing with `hey` -Everything mentioned below assumes you've already gone through the autoscaling setup above and -that you're deploying using `ingress.className: "nginx"`. +The `hey` tool is a simple HTTP load testing tool. ### Install and Run Load Tests -1. Install `hey` utility locally: - +1. Install hey: ```bash # macOS brew install hey # Linux - wget https://github.com/rakyll/hey/releases/latest/download/hey_linux_amd64 - chmod +x hey_linux_amd64 && sudo mv hey_linux_amd64 /usr/local/bin/hey + go install github.com/rakyll/hey@latest - # Or use Docker - alias hey='docker run --rm rcmorano/hey' + # Or download from releases + wget https://hey-release.s3.us-east-2.amazonaws.com/hey_linux_amd64 + chmod +x hey_linux_amd64 + sudo mv hey_linux_amd64 /usr/local/bin/hey ``` -2. Find the external IP of your shared nginx ingress: - +2. 
Run basic load test: ```bash - # For GKE clusters - export INGRESS_ENDPOINT=$(kubectl -n ingress-nginx get ingress/nginx-service-ingress-shared-eoapi -o=jsonpath='{.spec.rules[0].host}') - # Example output: eoapi-35.234.254.12.nip.io + # Test STAC endpoint + hey -z 5m -c 10 https://your-domain.com/stac/collections - # For EKS clusters - export INGRESS_ENDPOINT=$(kubectl -n ingress-nginx get svc/ingress-nginx-controller -o=jsonpath='{.status.loadBalancer.ingress[0].hostname}') - # Example output: k8s-eoapi-ingressn-404721dbb4-e6dec70321c3eddd.elb.us-west-2.amazonaws.com + # Test with higher concurrency + hey -z 10m -c 50 https://your-domain.com/stac/search ``` -3. Run load tests against different endpoints in separate terminals: - +3. Monitor during load test: ```bash - # Test Vector API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/vector/collections/public.my_data/items?f=geojson" - - # Test STAC API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/stac/" + # Watch HPA scaling + watch kubectl get hpa -n eoapi - # Test Raster API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/raster/collections" + # Monitor pods + watch kubectl get pods -n eoapi ``` - **Load testing parameters:** - - `-n`: Total number of requests (2M for sustained testing) - - `-q`: Rate limit (150 requests/second per worker) - - `-c`: Number of concurrent workers (20) - -4. 
**Monitor autoscaling in Grafana** - Go back to your Grafana dashboard and watch your services autoscale for the endpoints you're hitting: - - ![Grafana Autoscaling Dashboard](../images/grafanaautoscale.png) - ### Load Testing Best Practices -- **Start small**: Begin with lower request rates and gradually increase -- **Monitor resources**: Watch CPU, memory, and request rate metrics -- **Test realistic scenarios**: Use actual data access patterns when possible -- **Verify autoscaling**: Ensure HPA triggers and pods scale up/down appropriately -- **Database bottlenecks**: Monitor PostgreSQL performance under load -- **Clean up**: Stop load tests gracefully to avoid overwhelming services +1. **Start small**: Begin with low concurrency and short duration +2. **Monitor resources**: Watch CPU, memory, and network usage +3. **Test realistic scenarios**: Use actual API endpoints and payloads +4. **Gradual increase**: Slowly increase load to find breaking points +5. **Test different endpoints**: Each service may have different characteristics ### Troubleshooting Load Tests -If autoscaling isn't triggering: -- Verify HPA is configured: `kubectl get hpa -n eoapi` -- Check custom metrics: `kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq .` -- Ensure prometheus-adapter is running: `kubectl get pods -n eoapi-support` -- Validate ingress metrics: Check Grafana for nginx request rates +- **High response times**: May indicate need for more replicas or resources +- **Error rates**: Could suggest database bottlenecks or resource limits +- **No scaling**: Check HPA metrics and thresholds ### Advanced Load Testing -For more sophisticated testing consider: -- **[k6](https://k6.io/)** - JavaScript-based load testing with scenarios -- **[Artillery](https://artillery.io/)** - Node.js load testing toolkit -- **[JMeter](https://jmeter.apache.org/)** - GUI-based load testing with complex scenarios +For more comprehensive testing, consider: +- 
**[Artillery](https://artillery.io/)** - Feature-rich load testing toolkit +- **[k6](https://k6.io/)** - Developer-centric performance testing - **[Locust](https://locust.io/)** - Python-based distributed load testing + +For monitoring and observability setup, see [observability.md](observability.md). diff --git a/docs/operations/observability.md b/docs/operations/observability.md new file mode 100644 index 00000000..06f89ea8 --- /dev/null +++ b/docs/operations/observability.md @@ -0,0 +1,334 @@ + # Observability & Monitoring + +This guide covers metrics collection, monitoring, and visualization for eoAPI deployments. All monitoring components are optional and disabled by default. + +## Overview + +eoAPI observability is implemented through conditional dependencies in the main `eoapi` chart: + +### Core Monitoring +Essential metrics collection infrastructure including Prometheus server, metrics-server, kube-state-metrics, node-exporter, and prometheus-adapter. + +### Integrated Observability +Grafana dashboards and visualization tools are available as conditional dependencies within the main chart, eliminating the need for separate deployments. + +## Configuration + +**Prerequisites**: Kubernetes cluster with Helm 3 installed. 
+ +### Quick Deployment + +```bash +# Deploy with monitoring and observability enabled +helm install eoapi eoapi/eoapi \ + --set monitoring.prometheus.enabled=true \ + --set observability.grafana.enabled=true + +# Access Grafana (get password) +kubectl get secret eoapi-grafana -n eoapi \ + -o jsonpath="{.data.admin-password}" | base64 -d +``` + +### Using Configuration Files + +For production deployments, use configuration files instead of command-line flags: + +```bash +# Deploy with integrated monitoring and observability +helm install eoapi eoapi/eoapi -f values-full-observability.yaml +``` + +**For a complete example**: See [examples/values-full-observability.yaml](../examples/values-full-observability.yaml) + +## Architecture & Components + +**Component Responsibilities:** + +- **Prometheus Server**: Central metrics storage and querying engine +- **metrics-server**: Provides resource metrics for `kubectl top` and HPA +- **kube-state-metrics**: Exposes Kubernetes object state as metrics +- **prometheus-node-exporter**: Collects hardware and OS metrics from nodes +- **prometheus-adapter**: Enables custom metrics for Horizontal Pod Autoscaler +- **Grafana**: Dashboards and visualization of collected metrics + +**Data Flow**: Exporters expose metrics → Prometheus scrapes and stores → Grafana/kubectl query via PromQL → Dashboards visualize data + +### Detailed Configuration + +#### Basic Monitoring Setup + +```yaml +# values.yaml - Enable core monitoring in main eoapi chart +monitoring: + metricsServer: + enabled: true + prometheus: + enabled: true + server: + persistentVolume: + enabled: true + size: 50Gi + retention: "30d" + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true +``` + +#### Observability Chart Configuration + +```yaml +# Basic Grafana setup +grafana: + enabled: true + service: + type: LoadBalancer + +# Connect to external Prometheus (if not using eoapi's Prometheus) +prometheusUrl: 
"http://prometheus.monitoring.svc.cluster.local" + +# Production Grafana configuration +grafana: + persistence: + enabled: true + size: 10Gi + resources: + limits: + cpu: 200m + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi +``` + +#### PostgreSQL Monitoring + +Enable PostgreSQL metrics collection: + +```yaml +postgrescluster: + monitoring: true # Enables postgres_exporter sidecar +``` + +## Available Metrics + +### Core Infrastructure Metrics +- **Container resources**: CPU, memory, network usage +- **Kubernetes state**: Pods, services, deployments status +- **Node metrics**: Hardware utilization, filesystem usage +- **PostgreSQL**: Database connections, query performance (when enabled) + +### Custom Application Metrics + +When prometheus-adapter and nginx ingress are both enabled, these custom metrics become available: +- `nginx_ingress_controller_requests_rate_stac_eoapi` +- `nginx_ingress_controller_requests_rate_raster_eoapi` +- `nginx_ingress_controller_requests_rate_vector_eoapi` +- `nginx_ingress_controller_requests_rate_multidim_eoapi` + +**Requirements**: +- nginx ingress controller with prometheus metrics enabled +- Ingress must use specific hostnames (not wildcard patterns) +- prometheus-adapter must be configured to expose these metrics + +## Pre-built Dashboards + +The `eoapi-observability` chart provides ready-to-use dashboards: + +### eoAPI Services Dashboard +- Request rates per service +- Response times and error rates +- Traffic patterns by endpoint + +### Infrastructure Dashboard +- CPU usage rate by pod +- CPU throttling metrics +- Memory usage and limits +- Pod count tracking + +### Container Resources Dashboard +- Resource consumption by container +- Resource quotas and limits +- Performance bottlenecks + +### PostgreSQL Dashboard (when enabled) +- Database connections +- Query performance +- Storage utilization + +#### Production Configuration + +```yaml +monitoring: + prometheus: + server: + # Persistent storage + persistentVolume: + 
enabled: true + size: 100Gi + storageClass: "gp3" + # Retention policy + retention: "30d" + # Resource allocation + resources: + limits: + cpu: "2000m" + memory: "4096Mi" + requests: + cpu: "1000m" + memory: "2048Mi" + # Security - internal access only + service: + type: ClusterIP +``` + +### Resource Requirements + +#### Core Monitoring Components + +Minimum resource requirements (actual usage varies by cluster size and metrics volume): + +| Component | CPU | Memory | Purpose | +|-----------|-----|---------|----------| +| prometheus-server | 500m | 1Gi | Metrics storage | +| metrics-server | 100m | 200Mi | Resource metrics | +| kube-state-metrics | 50m | 150Mi | K8s state | +| prometheus-node-exporter | 50m | 50Mi | Node metrics | +| prometheus-adapter | 100m | 128Mi | Custom metrics API | +| **Total** | **~800m** | **~1.5Gi** | | + +#### Observability Components + +| Component | CPU | Memory | Purpose | +|-----------|-----|---------|----------| +| grafana | 100m | 200Mi | Visualization | + +## Operations + +### Verification Commands + +```bash +# Check Prometheus is running +kubectl get pods -n eoapi -l app.kubernetes.io/name=prometheus + +# Verify metrics-server +kubectl get apiservice v1beta1.metrics.k8s.io + +# List available custom metrics +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq '.resources[].name' + +# Test metrics collection +kubectl port-forward svc/eoapi-prometheus-server 9090:80 -n eoapi +# Visit http://localhost:9090/targets +``` + +### Monitoring Health + +```bash +# Check Prometheus targets +curl -X GET 'http://localhost:9090/api/v1/query?query=up' + +# Verify Grafana datasource connectivity +kubectl exec -it deployment/eoapi-obs-grafana -n eoapi -- \ + wget -O- http://eoapi-prometheus-server/api/v1/label/__name__/values +``` + +## Advanced Features + +### Alerting Setup + +Enable alertmanager for alert management: + +```yaml +prometheus: + enabled: true + alertmanager: + enabled: true + config: + global: + # Configure with your 
SMTP server details + smtp_smarthost: 'your-smtp-server:587' + smtp_from: 'alertmanager@yourdomain.com' + route: + receiver: 'default-receiver' + receivers: + - name: 'default-receiver' + webhook_configs: + - url: 'http://your-webhook-endpoint:5001/' +``` + +**Note**: Replace example values with your actual SMTP server and webhook endpoints. + +### Batch Job Metrics + +Enable pushgateway for batch job metrics: + +```yaml +prometheus: + enabled: true + prometheus-pushgateway: + enabled: true # For batch job metrics collection +``` + +### Custom Dashboards + +Add custom dashboards by creating ConfigMaps with the appropriate label: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: custom-dashboard + namespace: eoapi + labels: + eoapi_dashboard: "1" +data: + custom.json: | + { + "dashboard": { + "id": null, + "title": "Custom eoAPI Dashboard", + "tags": ["eoapi"], + "panels": [] + } + } +``` + +The ConfigMap must be in the same namespace as the Grafana deployment and include the `eoapi_dashboard: "1"` label. + +## Troubleshooting + +### Common Issues + +**Missing Metrics** +1. Check Prometheus service discovery: + ```bash + kubectl port-forward svc/eoapi-prometheus-server 9090:80 -n eoapi + # Visit http://localhost:9090/service-discovery + ``` + +2. Verify target endpoints: + ```bash + kubectl get endpoints -n eoapi + ``` + +**Grafana Connection Issues** +1. Check datasource connectivity in Grafana UI → Configuration → Data Sources +2. 
Verify Prometheus URL accessibility from Grafana pod + +**Resource Issues** +- Monitor current usage: `kubectl top pods -n eoapi` +- Check for OOMKilled containers: `kubectl describe pods -n eoapi | grep -A 5 "Last State"` +- Verify resource limits are appropriate for your workload size +- Consider reducing Prometheus retention or increasing storage size if storage is full + +## Security Considerations + +- **Network Security**: Use `ClusterIP` services for Prometheus in production +- **Access Control**: Configure network policies to restrict metrics access +- **Authentication**: Enable authentication for Grafana (LDAP, OAuth, etc.) +- **Data Privacy**: Consider metrics data sensitivity and retention policies + +## Related Documentation + +- For autoscaling configuration using these metrics: [autoscaling.md](autoscaling.md) From 42d92cab31f956ba493f2d0e158c84a8f8a769d9 Mon Sep 17 00:00:00 2001 From: Felix Delattre Date: Fri, 24 Oct 2025 15:25:29 +0200 Subject: [PATCH 3/3] Added tests for autoscaling and observability. 
--- .github/workflows/helm-tests.yml | 119 ++++ .github/workflows/tests/test_autoscaling.py | 654 ++++++++++++++++++ .github/workflows/tests/test_observability.py | 522 ++++++++++++++ scripts/README.md | 53 +- scripts/lib/common.sh | 4 + scripts/lib/observability.sh | 530 ++++++++++++++ scripts/test.sh | 216 +++++- 7 files changed, 2092 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/tests/test_autoscaling.py create mode 100644 .github/workflows/tests/test_observability.py create mode 100644 scripts/lib/observability.sh diff --git a/.github/workflows/helm-tests.yml b/.github/workflows/helm-tests.yml index e18fef38..7c452968 100644 --- a/.github/workflows/helm-tests.yml +++ b/.github/workflows/helm-tests.yml @@ -217,3 +217,122 @@ jobs: if: always() run: | helm uninstall "$RELEASE_NAME" || true + + observability-tests: + name: Observability Tests + if: github.event.pull_request.head.repo.full_name == github.repository + permissions: + contents: 'read' + id-token: 'write' + needs: k3s-integration-tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + + - name: Start K3s cluster + uses: jupyterhub/action-k3s-helm@v4 + with: + k3s-channel: latest + helm-version: ${{ env.HELM_VERSION }} + metrics-enabled: false + docker-enabled: true + + - name: Set release name + run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV" + + - name: Wait for K3s to be fully ready + run: | + echo "=== Waiting for K3s to be fully ready ===" + kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s + kubectl get nodes + kubectl get pods --all-namespaces + sleep 10 + echo "✅ K3s is ready" + + - name: Deploy eoAPI with monitoring + run: | + echo "=== Deploying eoAPI with monitoring stack ===" + export RELEASE_NAME="$RELEASE_NAME" + export PGO_VERSION="${{ env.PGO_VERSION }}" + export 
GITHUB_SHA="${{ github.sha }}" + ./scripts/deploy.sh --ci + + # Enable monitoring components + helm upgrade "$RELEASE_NAME" ./charts/eoapi \ + --set monitoring.prometheus.enabled=true \ + --set monitoring.prometheusAdapter.enabled=true \ + --set monitoring.kube-state-metrics.enabled=true \ + --set monitoring.prometheus-node-exporter.enabled=true \ + --set observability.grafana.enabled=true \ + --set stac.autoscaling.enabled=true \ + --set raster.autoscaling.enabled=true \ + --set vector.autoscaling.enabled=true \ + --namespace eoapi \ + --wait --timeout=10m + + - name: Wait for monitoring stack to be ready + run: | + echo "=== Waiting for monitoring components ===" + + # Wait for Prometheus + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus -n eoapi --timeout=300s || echo "Prometheus not ready" + + # Wait for Grafana + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=300s || echo "Grafana not ready" + + # Wait for prometheus-adapter + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=300s || echo "prometheus-adapter not ready" + + # Wait for HPA to be created + sleep 30 + + echo "=== Final monitoring stack status ===" + kubectl get pods -n eoapi -l 'app.kubernetes.io/component in (server,grafana,prometheus-adapter)' || true + kubectl get hpa -n eoapi || true + + - name: Run observability tests + run: | + echo "=== Running observability test suite ===" + export RELEASE_NAME="$RELEASE_NAME" + export NAMESPACE="eoapi" + + # Install python dependencies for testing + python -m pip install --upgrade pip + pip install pytest requests + + # Run observability tests + python -m pytest .github/workflows/tests/test_observability.py -v --tb=short + + # Run autoscaling tests + python -m pytest .github/workflows/tests/test_autoscaling.py -v --tb=short -m "not slow" + + - name: Debug observability stack on failure + if: failure() + run: | + echo "=== 
Observability Debug Information ===" + + echo "=== Monitoring Pods Status ===" + kubectl get pods -n eoapi -l 'app.kubernetes.io/name in (prometheus,grafana,prometheus-adapter)' -o wide || true + + echo "=== HPA Status ===" + kubectl get hpa -n eoapi -o wide || true + kubectl describe hpa -n eoapi || true + + echo "=== Custom Metrics API ===" + kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" || true + + echo "=== Pod Metrics ===" + kubectl top pods -n eoapi || true + + echo "=== Recent Events ===" + kubectl get events -n eoapi --sort-by='.lastTimestamp' | tail -20 || true + + echo "=== Component Logs ===" + kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi --tail=50 || true + kubectl logs -l app.kubernetes.io/name=grafana -n eoapi --tail=30 || true + + - name: Cleanup observability test + if: always() + run: | + helm uninstall "$RELEASE_NAME" || true diff --git a/.github/workflows/tests/test_autoscaling.py b/.github/workflows/tests/test_autoscaling.py new file mode 100644 index 00000000..593125b0 --- /dev/null +++ b/.github/workflows/tests/test_autoscaling.py @@ -0,0 +1,654 @@ +"""Test autoscaling behavior and HPA functionality.""" + +import json +import os +import subprocess +import threading +import time + +import pytest +import requests + + +def get_namespace(): + """Get the target namespace from environment or default.""" + return os.environ.get("NAMESPACE", "eoapi") + + +def get_release_name(): + """Get the release name from environment or default.""" + return os.environ.get("RELEASE_NAME", "eoapi") + + +def get_base_url(): + """Get the base URL for API endpoints.""" + # Try to detect ingress or use port-forward + namespace = get_namespace() + + # Check if we have an ingress + result = subprocess.run( + ["kubectl", "get", "ingress", "-n", namespace, "-o", "json"], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + ingress_data = json.loads(result.stdout) + if ingress_data["items"]: + ingress = 
ingress_data["items"][0] + rules = ingress.get("spec", {}).get("rules", []) + if rules: + host = rules[0].get("host", "localhost") + # Check if host is accessible + try: + response = requests.get( + f"http://{host}/stac/collections", timeout=5 + ) + if response.status_code == 200: + return f"http://{host}" + except requests.RequestException: + pass + + # Fallback to localhost (assuming port-forward) + return "http://localhost:8080" + + +def kubectl_get(resource, namespace=None, label_selector=None, output="json"): + """Execute kubectl get command with optional parameters.""" + cmd = ["kubectl", "get", resource] + + if namespace: + cmd.extend(["-n", namespace]) + + if label_selector: + cmd.extend(["-l", label_selector]) + + if output: + cmd.extend(["-o", output]) + + result = subprocess.run(cmd, capture_output=True, text=True) + return result + + +def get_pod_metrics(namespace, service_name): + """Get current CPU and memory metrics for service pods.""" + result = subprocess.run( + [ + "kubectl", + "top", + "pods", + "-n", + namespace, + "-l", + f"app=eoapi-{service_name}", + "--no-headers", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return [] + + metrics = [] + for line in result.stdout.strip().split("\n"): + if line.strip(): + parts = line.split() + if len(parts) >= 3: + pod_name = parts[0] + cpu = parts[1] # e.g., "25m" + memory = parts[2] # e.g., "128Mi" + metrics.append({"pod": pod_name, "cpu": cpu, "memory": memory}) + + return metrics + + +def get_hpa_status(namespace, hpa_name): + """Get current HPA status and metrics.""" + result = kubectl_get("hpa", namespace=namespace, output="json") + if result.returncode != 0: + return None + + hpas = json.loads(result.stdout) + for hpa in hpas["items"]: + if hpa["metadata"]["name"] == hpa_name: + return hpa + + return None + + +def get_pod_count(namespace, service_name): + """Get current number of running pods for a service.""" + result = kubectl_get( + "pods", namespace=namespace, 
label_selector=f"app=eoapi-{service_name}" + ) + + if result.returncode != 0: + return 0 + + pods = json.loads(result.stdout) + running_pods = [ + pod for pod in pods["items"] if pod["status"]["phase"] == "Running" + ] + + return len(running_pods) + + +def make_request(url, timeout=10): + """Make a single HTTP request and return success status.""" + try: + response = requests.get(url, timeout=timeout) + return response.status_code == 200 + except requests.RequestException: + return False + + +def generate_load( + base_url, endpoints, duration=60, concurrent_requests=5, delay=0.1 +): + """Generate HTTP load against specified endpoints.""" + end_time = time.time() + duration + success_count = 0 + error_count = 0 + + def worker(): + nonlocal success_count, error_count + while time.time() < end_time: + for endpoint in endpoints: + url = f"{base_url}{endpoint}" + if make_request(url): + success_count += 1 + else: + error_count += 1 + time.sleep(delay) + + # Start concurrent workers + threads = [] + for _ in range(concurrent_requests): + thread = threading.Thread(target=worker) + thread.start() + threads.append(thread) + + # Wait for all threads to complete + for thread in threads: + thread.join() + + return { + "total_requests": success_count + error_count, + "successful_requests": success_count, + "failed_requests": error_count, + "success_rate": success_count / (success_count + error_count) + if (success_count + error_count) > 0 + else 0, + } + + +class TestHPAConfiguration: + """Test HPA resource configuration and basic functionality.""" + + def test_hpa_resources_properly_configured(self): + """Verify HPA resources have correct configuration.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found - autoscaling not enabled") + + hpas = json.loads(result.stdout) + assert len(hpas["items"]) > 0, "No HPA resources configured" + + for hpa in hpas["items"]: + spec = 
hpa["spec"] + hpa_name = hpa["metadata"]["name"] + + # Check required fields + assert "scaleTargetRef" in spec, ( + f"HPA {hpa_name} missing scaleTargetRef" + ) + assert "minReplicas" in spec, f"HPA {hpa_name} missing minReplicas" + assert "maxReplicas" in spec, f"HPA {hpa_name} missing maxReplicas" + assert "metrics" in spec, ( + f"HPA {hpa_name} missing metrics configuration" + ) + + # Validate replica bounds + min_replicas = spec["minReplicas"] + max_replicas = spec["maxReplicas"] + assert min_replicas > 0, f"HPA {hpa_name} minReplicas must be > 0" + assert max_replicas > min_replicas, ( + f"HPA {hpa_name} maxReplicas must be > minReplicas" + ) + + # Check metrics configuration + metrics = spec["metrics"] + assert len(metrics) > 0, f"HPA {hpa_name} has no metrics configured" + + # Verify at least one metric is CPU + cpu_metrics = [ + m + for m in metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + assert len(cpu_metrics) > 0, ( + f"HPA {hpa_name} must have CPU metric configured" + ) + + print( + f"✅ HPA {hpa_name}: {min_replicas}-{max_replicas} replicas, {len(metrics)} metrics" + ) + + def test_target_deployments_exist(self): + """Verify HPA target deployments exist and are ready.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + target_ref = hpa["spec"]["scaleTargetRef"] + target_name = target_ref["name"] + hpa_name = hpa["metadata"]["name"] + + # Check target deployment exists + deploy_result = kubectl_get( + "deployment", namespace=namespace, output="json" + ) + assert deploy_result.returncode == 0, "Cannot list deployments" + + deployments = json.loads(deploy_result.stdout) + target_deployment = next( + ( + d + for d in deployments["items"] + if d["metadata"]["name"] == target_name + ), + None, + ) + + assert target_deployment is not None, ( + 
f"HPA {hpa_name} target deployment {target_name} not found" + ) + + # Check deployment has ready replicas + status = target_deployment.get("status", {}) + ready_replicas = status.get("readyReplicas", 0) + assert ready_replicas > 0, ( + f"Target deployment {target_name} has no ready replicas" + ) + + print( + f"✅ HPA {hpa_name} target deployment {target_name} is ready ({ready_replicas} replicas)" + ) + + +class TestCPUScaling: + """Test CPU-based autoscaling functionality.""" + + def test_cpu_metrics_collection(self): + """Verify CPU metrics are being collected for HPA targets.""" + namespace = get_namespace() + services = ["stac", "raster", "vector"] + + metrics_available = [] + + for service in services: + try: + pod_metrics = get_pod_metrics(namespace, service) + if pod_metrics: + metrics_available.append(service) + for metric in pod_metrics: + print( + f"✅ {service} pod {metric['pod']}: CPU={metric['cpu']}, Memory={metric['memory']}" + ) + except Exception as e: + print(f"⚠️ Cannot get metrics for {service}: {e}") + + assert len(metrics_available) > 0, ( + "No CPU metrics available for any service" + ) + + def test_hpa_cpu_utilization_calculation(self): + """Verify HPA calculates CPU utilization correctly.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + + # Check if HPA has current metrics + current_metrics = status.get("currentMetrics", []) + cpu_metrics = [ + m + for m in current_metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + + if cpu_metrics: + cpu_utilization = cpu_metrics[0]["resource"]["current"].get( + "averageUtilization" + ) + if cpu_utilization is not None: + assert 0 <= cpu_utilization <= 1000, ( + f"Invalid CPU utilization: {cpu_utilization}%" + ) + print( + 
f"✅ HPA {hpa_name} CPU utilization: {cpu_utilization}%" + ) + else: + print( + f"⚠️ HPA {hpa_name} CPU metric exists but no utilization value" + ) + else: + # Check conditions for why metrics might not be available + conditions = status.get("conditions", []) + for condition in conditions: + if ( + condition["type"] == "ScalingActive" + and condition["status"] == "False" + ): + print( + f"⚠️ HPA {hpa_name} scaling not active: {condition.get('message', 'Unknown reason')}" + ) + break + else: + print(f"⚠️ HPA {hpa_name} no CPU metrics available yet") + + def test_cpu_resource_requests_alignment(self): + """Verify CPU resource requests are properly set for percentage calculations.""" + namespace = get_namespace() + services = ["stac", "raster", "vector"] + + for service in services: + result = kubectl_get( + "pods", + namespace=namespace, + label_selector=f"app=eoapi-{service}", + ) + + if result.returncode != 0: + continue + + pods = json.loads(result.stdout) + running_pods = [ + p for p in pods["items"] if p["status"]["phase"] == "Running" + ] + + if not running_pods: + continue + + pod = running_pods[0] # Check first running pod + containers = pod["spec"]["containers"] + + main_container = next( + (c for c in containers if c["name"] == service), None + ) + if not main_container: + continue + + resources = main_container.get("resources", {}) + requests = resources.get("requests", {}) + + if "cpu" not in requests: + print( + f"⚠️ Service {service} missing CPU requests - HPA percentage calculation may be inaccurate" + ) + continue + + cpu_request = requests["cpu"] + print(f"✅ Service {service} CPU request: {cpu_request}") + + # Parse CPU request to verify it's reasonable + if cpu_request.endswith("m"): + cpu_millicores = int(cpu_request[:-1]) + assert cpu_millicores > 0, ( + f"Service {service} has zero CPU request" + ) + assert cpu_millicores <= 2000, ( + f"Service {service} has very high CPU request: {cpu_millicores}m" + ) + + +class TestScalingBehavior: + """Test 
actual scaling behavior under load.""" + + @pytest.mark.slow + def test_load_response_scaling(self): + """Generate load and verify scaling response (when possible).""" + namespace = get_namespace() + base_url = get_base_url() + + # Test endpoints that should generate CPU load + load_endpoints = [ + "/stac/collections", + "/stac/search?collections=noaa-emergency-response&limit=50", + "/raster/collections", + "/vector/collections", + ] + + # Check initial state + initial_pod_counts = {} + services = ["stac", "raster", "vector"] + + for service in services: + initial_pod_counts[service] = get_pod_count(namespace, service) + + print(f"Initial pod counts: {initial_pod_counts}") + + # Skip test if we can't connect to services + try: + response = requests.get(f"{base_url}/stac/collections", timeout=5) + if response.status_code != 200: + pytest.skip("Cannot access API endpoints for load testing") + except requests.RequestException: + pytest.skip("API endpoints not accessible for load testing") + + # Generate moderate load for limited time (suitable for CI) + load_duration = 90 # 1.5 minutes + concurrent_requests = 8 + + print( + f"Generating load: {concurrent_requests} concurrent requests for {load_duration}s" + ) + + # Start load generation + load_stats = generate_load( + base_url=base_url, + endpoints=load_endpoints, + duration=load_duration, + concurrent_requests=concurrent_requests, + delay=0.05, # 20 requests/second per worker + ) + + print(f"Load test completed: {load_stats}") + + # Wait a bit for metrics to propagate and scaling to potentially occur + print("Waiting for metrics to propagate and potential scaling...") + time.sleep(30) + + # Check final state + final_pod_counts = {} + for service in services: + final_pod_counts[service] = get_pod_count(namespace, service) + + print(f"Final pod counts: {final_pod_counts}") + + # Check HPA metrics after load + result = kubectl_get("hpa", namespace=namespace) + if result.returncode == 0: + hpas = 
json.loads(result.stdout) + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + current_metrics = status.get("currentMetrics", []) + + cpu_metrics = [ + m + for m in current_metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + + if cpu_metrics: + cpu_utilization = cpu_metrics[0]["resource"]["current"].get( + "averageUtilization" + ) + print(f"Post-load HPA {hpa_name} CPU: {cpu_utilization}%") + + # Verify load test was successful + assert load_stats["success_rate"] > 0.8, ( + f"Load test had low success rate: {load_stats['success_rate']:.2%}" + ) + assert load_stats["total_requests"] > 100, ( + "Load test generated insufficient requests" + ) + + # Note: In CI environments with limited resources, actual scaling may not occur + # The important thing is that the system handled the load successfully + scaling_occurred = any( + final_pod_counts[svc] > initial_pod_counts[svc] + for svc in services + if svc in initial_pod_counts and svc in final_pod_counts + ) + + if scaling_occurred: + print("✅ Scaling occurred during load test") + else: + print( + "⚠️ No scaling occurred - may be due to CI resource constraints or low load thresholds" + ) + + def test_scaling_stabilization_windows(self): + """Verify HPA respects stabilization windows in configuration.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + spec = hpa["spec"] + + # Check if behavior is configured + behavior = spec.get("behavior", {}) + if not behavior: + print(f"⚠️ HPA {hpa_name} has no scaling behavior configured") + continue + + # Check scale up behavior + scale_up = behavior.get("scaleUp", {}) + if scale_up: + stabilization = scale_up.get("stabilizationWindowSeconds", 0) + policies = scale_up.get("policies", []) + 
print( + f"✅ HPA {hpa_name} scale-up: {stabilization}s stabilization, {len(policies)} policies" + ) + + # Check scale down behavior + scale_down = behavior.get("scaleDown", {}) + if scale_down: + stabilization = scale_down.get("stabilizationWindowSeconds", 0) + policies = scale_down.get("policies", []) + print( + f"✅ HPA {hpa_name} scale-down: {stabilization}s stabilization, {len(policies)} policies" + ) + + +class TestRequestRateScaling: + """Test request rate-based autoscaling (when available).""" + + def test_custom_metrics_for_request_rate(self): + """Check if custom metrics for request rate scaling are available.""" + namespace = get_namespace() + + # Check if custom metrics API has request rate metrics + result = subprocess.run( + ["kubectl", "get", "--raw", "/apis/custom.metrics.k8s.io/v1beta1"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip("Custom metrics API not available") + + api_response = json.loads(result.stdout) + resources = api_response.get("resources", []) + + # Look for nginx ingress controller metrics + request_rate_metrics = [ + r + for r in resources + if "nginx_ingress_controller" in r.get("name", "") + and "requests" in r.get("name", "") + ] + + if request_rate_metrics: + print(f"✅ Found {len(request_rate_metrics)} request rate metrics") + for metric in request_rate_metrics: + print(f" - {metric['name']}") + else: + print( + "⚠️ No request rate metrics available - may require ingress controller metrics configuration" + ) + + def test_hpa_request_rate_metrics(self): + """Verify HPA can access request rate metrics (when configured).""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + current_metrics = status.get("currentMetrics", []) + + # Look for custom metrics 
(request rate) + custom_metrics = [ + m + for m in current_metrics + if m.get("type") in ["Pods", "Object"] + and "nginx_ingress_controller" in str(m) + ] + + if custom_metrics: + print(f"✅ HPA {hpa_name} has custom metrics available") + for metric in custom_metrics: + print(f" - {metric}") + else: + # Check if it's configured but not yet available + spec_metrics = hpa["spec"]["metrics"] + configured_custom = [ + m + for m in spec_metrics + if m.get("type") in ["Pods", "Object"] + ] + + if configured_custom: + print( + f"⚠️ HPA {hpa_name} has custom metrics configured but not available yet" + ) + else: + print( + f"ℹ️ HPA {hpa_name} uses only CPU metrics (no request rate scaling)" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/.github/workflows/tests/test_observability.py b/.github/workflows/tests/test_observability.py new file mode 100644 index 00000000..7edca73c --- /dev/null +++ b/.github/workflows/tests/test_observability.py @@ -0,0 +1,522 @@ +"""Test observability stack deployment and functionality.""" + +import json +import os +import subprocess +import time + +import pytest +import requests + + +def get_namespace(): + """Get the target namespace from environment or default.""" + return os.environ.get("NAMESPACE", "eoapi") + + +def get_release_name(): + """Get the release name from environment or default.""" + return os.environ.get("RELEASE_NAME", "eoapi") + + +def kubectl_get(resource, namespace=None, label_selector=None, output="json"): + """Execute kubectl get command with optional parameters.""" + cmd = ["kubectl", "get", resource] + + if namespace: + cmd.extend(["-n", namespace]) + + if label_selector: + cmd.extend(["-l", label_selector]) + + if output: + cmd.extend(["-o", output]) + + result = subprocess.run(cmd, capture_output=True, text=True) + return result + + +def kubectl_port_forward(service, local_port, remote_port, namespace): + """Start kubectl port-forward in background.""" + cmd = [ + "kubectl", + 
"port-forward", + f"svc/{service}", + f"{local_port}:{remote_port}", + "-n", + namespace, + ] + + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + # Give it time to establish connection + time.sleep(3) + return process + + +def wait_for_url(url, timeout=30, interval=2): + """Wait for URL to become available.""" + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(url, timeout=5) + if response.status_code == 200: + return True + except (requests.RequestException, requests.ConnectionError): + pass + time.sleep(interval) + return False + + +class TestMonitoringStackDeployment: + """Test core monitoring components deployment.""" + + def test_prometheus_server_deployment(self): + """Verify Prometheus server is deployed and running.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server", + ) + + if result.returncode != 0: + pytest.skip("Prometheus server not deployed - monitoring disabled") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No Prometheus server pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"Prometheus pod {pod['metadata']['name']} not running" + ) + + # Check readiness + conditions = pod["status"].get("conditions", []) + ready_condition = next( + (c for c in conditions if c["type"] == "Ready"), None + ) + assert ready_condition and ready_condition["status"] == "True", ( + "Prometheus pod not ready" + ) + + def test_grafana_deployment(self): + """Verify Grafana is deployed and running.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + if result.returncode != 0: + pytest.skip("Grafana not deployed - observability disabled") + + pods = 
json.loads(result.stdout) + assert len(pods["items"]) > 0, "No Grafana pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"Grafana pod {pod['metadata']['name']} not running" + ) + + def test_prometheus_adapter_deployment(self): + """Verify prometheus-adapter is deployed and provides custom metrics API.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus-adapter", + ) + + if result.returncode != 0: + pytest.skip("prometheus-adapter not deployed") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No prometheus-adapter pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"prometheus-adapter pod {pod['metadata']['name']} not running" + ) + + def test_kube_state_metrics_deployment(self): + """Verify kube-state-metrics is collecting Kubernetes object metrics.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=kube-state-metrics", + ) + + if result.returncode != 0: + pytest.skip("kube-state-metrics not deployed") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No kube-state-metrics pods found" + + # Check pod is running + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"kube-state-metrics pod {pod['metadata']['name']} not running" + ) + + def test_node_exporter_deployment(self): + """Verify node-exporter is collecting node metrics.""" + namespace = get_namespace() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus-node-exporter", + ) + + if result.returncode != 0: + pytest.skip("prometheus-node-exporter not deployed") + + pods = json.loads(result.stdout) + assert len(pods["items"]) > 0, "No prometheus-node-exporter pods found" + + # Check pods are 
running (should be one per node in DaemonSet) + for pod in pods["items"]: + assert pod["status"]["phase"] == "Running", ( + f"node-exporter pod {pod['metadata']['name']} not running" + ) + + +class TestMetricsCollection: + """Test metrics collection functionality.""" + + def test_custom_metrics_api_available(self): + """Verify custom metrics API is available.""" + result = subprocess.run( + ["kubectl", "get", "--raw", "/apis/custom.metrics.k8s.io/v1beta1"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip( + "Custom metrics API not available - prometheus-adapter may not be configured" + ) + + api_response = json.loads(result.stdout) + assert api_response["kind"] == "APIResourceList", ( + "Invalid custom metrics API response" + ) + assert ( + api_response["groupVersion"] == "custom.metrics.k8s.io/v1beta1" + ), "Wrong API version" + + def test_metrics_server_integration(self): + """Verify metrics-server is working for resource metrics.""" + # Test if we can get pod metrics + result = subprocess.run( + ["kubectl", "top", "pods", "-n", get_namespace(), "--no-headers"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip("metrics-server not available or not ready") + + # Should have some metrics output + lines = result.stdout.strip().split("\n") + assert len(lines) > 0, "No pod metrics available" + + # Check format includes CPU and Memory columns + for line in lines: + if line.strip(): # Skip empty lines + parts = line.split() + assert len(parts) >= 3, f"Invalid metrics format: {line}" + + def test_prometheus_targets_reachable(self): + """Test that Prometheus can reach its scrape targets (when accessible).""" + namespace = get_namespace() + + # Check if Prometheus service exists + result = kubectl_get( + "svc", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus", + ) + if result.returncode != 0: + pytest.skip("Prometheus service not found") + + services = 
json.loads(result.stdout) + if not services["items"]: + pytest.skip("No Prometheus services found") + + prometheus_service = None + for svc in services["items"]: + if "server" in svc["metadata"]["name"]: + prometheus_service = svc["metadata"]["name"] + break + + if not prometheus_service: + pytest.skip("Prometheus server service not found") + + # Try to port-forward and check targets (with timeout) + port_forward = None + try: + port_forward = kubectl_port_forward( + prometheus_service, 9090, 80, namespace + ) + + if wait_for_url("http://localhost:9090", timeout=15): + # Try to get targets endpoint + try: + response = requests.get( + "http://localhost:9090/api/v1/targets", timeout=10 + ) + if response.status_code == 200: + targets_data = response.json() + assert targets_data["status"] == "success", ( + "Prometheus targets API error" + ) + + # Check we have some targets + targets = targets_data.get("data", {}).get( + "activeTargets", [] + ) + healthy_targets = [ + t for t in targets if t.get("health") == "up" + ] + + # Should have at least some healthy targets + assert len(healthy_targets) > 0, ( + "No healthy Prometheus targets found" + ) + print( + f"✅ Found {len(healthy_targets)}/{len(targets)} healthy targets" + ) + + else: + pytest.skip( + f"Cannot access Prometheus API: {response.status_code}" + ) + + except requests.RequestException: + pytest.skip( + "Cannot connect to Prometheus API via port-forward" + ) + else: + pytest.skip("Cannot establish port-forward to Prometheus") + + finally: + if port_forward: + port_forward.terminate() + port_forward.wait(timeout=5) + + +class TestAutoscalingIntegration: + """Test HPA and autoscaling functionality.""" + + def test_hpa_resources_exist(self): + """Verify HPA resources are configured.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found - autoscaling not enabled") + + hpas = json.loads(result.stdout) + assert 
len(hpas["items"]) > 0, "No HPA resources configured" + + # Check common HPA resources + hpa_names = [hpa["metadata"]["name"] for hpa in hpas["items"]] + expected_hpas = [ + "eoapi-stac-hpa", + "eoapi-raster-hpa", + "eoapi-vector-hpa", + ] + + found_hpas = [ + name + for name in expected_hpas + if any(name in hpa_name for hpa_name in hpa_names) + ] + assert len(found_hpas) > 0, ( + f"No expected HPA resources found. Available: {hpa_names}" + ) + + print(f"✅ Found HPA resources: {found_hpas}") + + def test_hpa_metrics_available(self): + """Verify HPA can read required metrics.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("No HPA resources found") + + hpas = json.loads(result.stdout) + + for hpa in hpas["items"]: + hpa_name = hpa["metadata"]["name"] + + # Check HPA status has current metrics + status = hpa.get("status", {}) + current_metrics = status.get("currentMetrics", []) + + # Should have at least CPU metrics + cpu_metrics = [ + m + for m in current_metrics + if m.get("type") == "Resource" + and m.get("resource", {}).get("name") == "cpu" + ] + + if not cpu_metrics: + # Check if it's still initializing + conditions = status.get("conditions", []) + scaling_active = next( + (c for c in conditions if c["type"] == "ScalingActive"), + None, + ) + + if scaling_active and scaling_active["status"] == "False": + print( + f"⚠️ HPA {hpa_name} not yet active: {scaling_active.get('message', 'Unknown')}" + ) + else: + print( + f"✅ HPA {hpa_name} is configured but may still be initializing" + ) + else: + cpu_value = cpu_metrics[0]["resource"]["current"][ + "averageUtilization" + ] + print(f"✅ HPA {hpa_name} CPU metric: {cpu_value}%") + + def test_service_resource_requests_configured(self): + """Verify services have resource requests (required for HPA CPU metrics).""" + namespace = get_namespace() + services = ["stac", "raster", "vector"] + + for service in services: + result = kubectl_get( + 
"pods", + namespace=namespace, + label_selector=f"app=eoapi-{service}", + ) + + if result.returncode != 0: + continue + + pods = json.loads(result.stdout) + running_pods = [ + p for p in pods["items"] if p["status"]["phase"] == "Running" + ] + + if not running_pods: + continue + + # Check first running pod for resource requests + pod = running_pods[0] + containers = pod["spec"]["containers"] + + for container in containers: + if container["name"] == service: # Main service container + resources = container.get("resources", {}) + requests = resources.get("requests", {}) + + assert "cpu" in requests, ( + f"Service {service} missing CPU resource requests (required for HPA)" + ) + assert "memory" in requests, ( + f"Service {service} missing memory resource requests" + ) + + print( + f"✅ Service {service} has resource requests: CPU={requests['cpu']}, Memory={requests['memory']}" + ) + break + + +class TestGrafanaDashboards: + """Test Grafana dashboard functionality (when accessible).""" + + def test_grafana_service_accessibility(self): + """Test if Grafana service is accessible.""" + namespace = get_namespace() + result = kubectl_get( + "svc", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + if result.returncode != 0: + pytest.skip("Grafana service not found") + + services = json.loads(result.stdout) + if not services["items"]: + pytest.skip("No Grafana services found") + + grafana_service = services["items"][0]["metadata"]["name"] + + # Try port-forward to test accessibility + port_forward = None + try: + port_forward = kubectl_port_forward( + grafana_service, 3000, 80, namespace + ) + + if wait_for_url("http://localhost:3000", timeout=15): + # Try to access login page + response = requests.get( + "http://localhost:3000/login", timeout=10 + ) + assert response.status_code == 200, ( + "Cannot access Grafana login page" + ) + assert "Grafana" in response.text, "Invalid Grafana response" + print("✅ Grafana service is accessible") + else: + 
pytest.skip("Cannot establish connection to Grafana") + + except requests.RequestException as e: + pytest.skip(f"Cannot access Grafana: {e}") + finally: + if port_forward: + port_forward.terminate() + port_forward.wait(timeout=5) + + def test_grafana_admin_secret_exists(self): + """Verify Grafana admin password secret exists.""" + namespace = get_namespace() + release_name = get_release_name() + + result = kubectl_get("secret", namespace=namespace, output="json") + if result.returncode != 0: + pytest.skip("Cannot list secrets") + + secrets = json.loads(result.stdout) + grafana_secrets = [ + s + for s in secrets["items"] + if "grafana" in s["metadata"]["name"].lower() + ] + + if not grafana_secrets: + pytest.skip("No Grafana secrets found") + + # Check for admin password key + found_password = False + for secret in grafana_secrets: + if "admin-password" in secret.get("data", {}): + found_password = True + print( + f"✅ Found Grafana admin password in secret: {secret['metadata']['name']}" + ) + break + + assert found_password, "Grafana admin password secret not found" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/scripts/README.md b/scripts/README.md index acd72a48..6f36947e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -8,8 +8,9 @@ Automation scripts for deploying, testing, and managing eoAPI on Kubernetes. 
|--------|---------|-------| | **`deploy.sh`** | Deploy eoAPI to Kubernetes | `./deploy.sh [deploy\|setup\|cleanup] [--ci]` | | **`ingest.sh`** | Ingest STAC data into deployed eoAPI | `./ingest.sh [collections.json] [items.json]` | -| **`test.sh`** | Run Helm and integration tests | `./test.sh [helm\|integration\|all] [--debug]` | -| **`lib/`** | Shared utility functions | See [lib/README.md](lib/README.md) | +| **`test.sh`** | Run Helm, integration, and observability tests | `./test.sh [helm\|integration\|observability\|all] [--debug]` | +| **`lib/common.sh`** | Core utility functions and logging | Shared functions for all scripts | +| **`lib/observability.sh`** | Monitoring and autoscaling utilities | Functions for testing observability stack | ## Quick Start @@ -50,6 +51,35 @@ export RASTER_ENDPOINT=http://... # Override Raster API endpoint export VECTOR_ENDPOINT=http://... # Override Vector API endpoint ``` +## Observability Testing + +The test suite includes comprehensive observability validation: + +**Monitoring Stack Tests:** +- Prometheus server deployment and metrics collection +- Grafana dashboard accessibility and data source connectivity +- Custom metrics API availability via prometheus-adapter +- HPA (Horizontal Pod Autoscaler) functionality with CPU metrics +- kube-state-metrics and node-exporter deployment + +**Autoscaling Tests:** +- HPA configuration validation for STAC, Raster, and Vector services +- CPU-based scaling threshold verification +- Request-rate scaling metrics (when ingress metrics available) +- Scaling behavior and stabilization window testing + +**Run observability tests:** +```bash +# Run only observability tests +./scripts/test.sh observability + +# Run with enhanced monitoring output +./scripts/test.sh observability --debug + +# Run all tests including observability +./scripts/test.sh all +``` + ## Common Examples **Deploy with custom namespace:** @@ -67,6 +97,25 @@ NAMESPACE=my-eoapi ./scripts/deploy.sh ./scripts/test.sh all 
--debug ``` +**Run only observability tests:** +```bash +./scripts/test.sh observability --debug +``` + +**Test monitoring stack health:** +```bash +# Source observability functions +source ./scripts/lib/observability.sh + +# Check individual components +check_prometheus_health +check_grafana_health +check_hpa_status + +# Get comprehensive status +get_monitoring_stack_status +``` + **Cleanup deployment:** ```bash ./scripts/deploy.sh cleanup diff --git a/scripts/lib/common.sh b/scripts/lib/common.sh index dcaf18d6..d5015db5 100755 --- a/scripts/lib/common.sh +++ b/scripts/lib/common.sh @@ -225,6 +225,10 @@ preflight_test() { validate_tools kubectl python3 || return 1 validate_cluster || return 1 ;; + observability) + validate_tools kubectl python3 || return 1 + validate_cluster || return 1 + ;; *) log_error "Unknown test type: $test_type" return 1 diff --git a/scripts/lib/observability.sh b/scripts/lib/observability.sh new file mode 100644 index 00000000..2fc1c7d9 --- /dev/null +++ b/scripts/lib/observability.sh @@ -0,0 +1,530 @@ +#!/bin/bash +# Observability utility functions for eoAPI Kubernetes deployments +# Provides monitoring, metrics, and autoscaling validation capabilities + +# Colors for output formatting +readonly OBS_RED='\033[0;31m' +readonly OBS_GREEN='\033[0;32m' +readonly OBS_YELLOW='\033[1;33m' +readonly OBS_BLUE='\033[0;34m' +readonly OBS_NC='\033[0m' # No Color + +# Logging functions +obs_log_info() { + printf "${OBS_BLUE}[OBS-INFO]${OBS_NC} %s\n" "$1" +} + +obs_log_success() { + printf "${OBS_GREEN}[OBS-SUCCESS]${OBS_NC} %s\n" "$1" +} + +obs_log_warning() { + printf "${OBS_YELLOW}[OBS-WARNING]${OBS_NC} %s\n" "$1" +} + +obs_log_error() { + printf "${OBS_RED}[OBS-ERROR]${OBS_NC} %s\n" "$1" +} + +# Check if a command exists +obs_command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Get namespace with fallback +get_obs_namespace() { + echo "${NAMESPACE:-eoapi}" +} + +# Get release name with fallback +get_obs_release_name() { + echo 
"${RELEASE_NAME:-eoapi}" +} + +# Check if monitoring components are deployed +check_monitoring_deployment() { + local namespace + namespace=$(get_obs_namespace) + local component="$1" + local label_selector="$2" + + if [ -z "$component" ] || [ -z "$label_selector" ]; then + obs_log_error "check_monitoring_deployment requires component name and label selector" + return 1 + fi + + local pod_count + pod_count=$(kubectl get pods -n "$namespace" -l "$label_selector" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + + if [ "$pod_count" -gt 0 ]; then + obs_log_success "$component is running ($pod_count pods)" + return 0 + else + # Check if pods exist but not running + local total_pods + total_pods=$(kubectl get pods -n "$namespace" -l "$label_selector" --no-headers 2>/dev/null | wc -l) + if [ "$total_pods" -gt 0 ]; then + obs_log_warning "$component pods exist but not running ($total_pods pods)" + return 1 + else + obs_log_info "$component not deployed" + return 2 + fi + fi +} + +# Check Prometheus deployment and health +check_prometheus_health() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Checking Prometheus health..." + + # Check deployment + if ! check_monitoring_deployment "Prometheus" "app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server"; then + return $? + fi + + # Check service exists + if ! kubectl get svc -n "$namespace" -l "app.kubernetes.io/name=prometheus" >/dev/null 2>&1; then + obs_log_warning "Prometheus service not found" + return 1 + fi + + obs_log_success "Prometheus is healthy" + return 0 +} + +# Check Grafana deployment and health +check_grafana_health() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Checking Grafana health..." + + # Check deployment + if ! check_monitoring_deployment "Grafana" "app.kubernetes.io/name=grafana"; then + return $? + fi + + # Check service exists + if ! 
kubectl get svc -n "$namespace" -l "app.kubernetes.io/name=grafana" >/dev/null 2>&1; then + obs_log_warning "Grafana service not found" + return 1 + fi + + # Check for admin secret + if ! kubectl get secret -n "$namespace" -o name | grep -q grafana; then + obs_log_warning "Grafana admin secret not found" + fi + + obs_log_success "Grafana is healthy" + return 0 +} + +# Check prometheus-adapter health +check_prometheus_adapter_health() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Checking prometheus-adapter health..." + + # Check deployment + if ! check_monitoring_deployment "prometheus-adapter" "app.kubernetes.io/name=prometheus-adapter"; then + return $? + fi + + # Check custom metrics API + if kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" >/dev/null 2>&1; then + obs_log_success "Custom metrics API is available" + else + obs_log_warning "Custom metrics API not accessible" + return 1 + fi + + obs_log_success "prometheus-adapter is healthy" + return 0 +} + +# Check HPA resources and status +check_hpa_status() { + local namespace + namespace=$(get_obs_namespace) + local service_name="$1" # optional: check specific service + + obs_log_info "Checking HPA status..." + + local hpa_selector="" + if [ -n "$service_name" ]; then + hpa_selector="-l app.kubernetes.io/component=$service_name-hpa" + fi + + if ! 
kubectl get hpa -n "$namespace" "$hpa_selector" >/dev/null 2>&1; then + obs_log_info "No HPA resources found" + return 2 + fi + + local hpa_count + hpa_count=$(kubectl get hpa -n "$namespace" "$hpa_selector" --no-headers 2>/dev/null | wc -l) + obs_log_info "Found $hpa_count HPA resource(s)" + + # Check HPA status details + local unhealthy_hpas="" + local unhealthy_count=0 + while IFS= read -r line; do + if [ -n "$line" ]; then + local hpa_name + local targets + hpa_name=$(echo "$line" | awk '{print $1}') + targets=$(echo "$line" | awk '{print $4}') + + if echo "$targets" | grep -q ""; then + unhealthy_hpas="$unhealthy_hpas $hpa_name" + unhealthy_count=$((unhealthy_count + 1)) + obs_log_warning "HPA $hpa_name has unknown metrics" + else + obs_log_success "HPA $hpa_name is reporting metrics: $targets" + fi + fi + done << EOF +$(kubectl get hpa -n "$namespace" "$hpa_selector" --no-headers 2>/dev/null) +EOF + + if [ $unhealthy_count -eq 0 ]; then + obs_log_success "All HPA resources are healthy" + return 0 + else + obs_log_warning "$unhealthy_count HPA resource(s) have issues:$unhealthy_hpas" + return 1 + fi +} + +# Get pod resource metrics +get_pod_metrics() { + local namespace + namespace=$(get_obs_namespace) + local service_name="$1" + + if [ -z "$service_name" ]; then + obs_log_error "get_pod_metrics requires service name" + return 1 + fi + + obs_log_info "Getting metrics for $service_name..." + + if ! obs_command_exists kubectl; then + obs_log_error "kubectl not found" + return 1 + fi + + if ! kubectl top pods -n "$namespace" -l "app=eoapi-$service_name" --no-headers 2>/dev/null; then + obs_log_warning "Cannot get pod metrics for $service_name (metrics-server may not be ready)" + return 1 + fi + + return 0 +} + +# Validate custom metrics API +validate_custom_metrics_api() { + obs_log_info "Validating custom metrics API..." + + if ! 
kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" >/dev/null 2>&1; then + obs_log_error "Custom metrics API not available" + return 1 + fi + + # Get available metrics + local metrics_json + metrics_json=$(kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" 2>/dev/null) + if [ -n "$metrics_json" ]; then + local metric_count + metric_count=$(echo "$metrics_json" | grep -o '"name"' | wc -l) + obs_log_success "Custom metrics API is available with $metric_count metric types" + else + obs_log_warning "Custom metrics API available but no metrics registered" + fi + + return 0 +} + +# Setup port forwarding for monitoring services +setup_monitoring_port_forward() { + local service="$1" + local local_port="$2" + local remote_port="$3" + local namespace + namespace=$(get_obs_namespace) + + if [ -z "$service" ] || [ -z "$local_port" ] || [ -z "$remote_port" ]; then + obs_log_error "setup_monitoring_port_forward requires service, local_port, and remote_port" + return 1 + fi + + obs_log_info "Setting up port forward for $service ($local_port:$remote_port)..." + + # Check if service exists + if ! kubectl get svc "$service" -n "$namespace" >/dev/null 2>&1; then + obs_log_error "Service $service not found in namespace $namespace" + return 1 + fi + + # Start port forwarding in background + kubectl port-forward "svc/$service" "$local_port:$remote_port" -n "$namespace" >/dev/null 2>&1 & + local pf_pid=$! + + # Give it time to establish + sleep 3 + + # Check if port forward is working + if kill -0 $pf_pid 2>/dev/null; then + obs_log_success "Port forward established (PID: $pf_pid)" + echo $pf_pid # Return PID for cleanup + return 0 + else + obs_log_error "Failed to establish port forward" + return 1 + fi +} + +# Wait for monitoring stack to be ready +wait_for_monitoring_stack() { + local timeout="${1:-300}" # 5 minutes default + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "Waiting for monitoring stack to be ready (timeout: ${timeout}s)..." 
+ + local start_time + start_time=$(date +%s) + local components="prometheus:app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server grafana:app.kubernetes.io/name=grafana prometheus-adapter:app.kubernetes.io/name=prometheus-adapter" + + while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do + local all_ready=true + + for component_spec in $components; do + local component_name=${component_spec%%:*} + local selector=${component_spec#*:} + + if ! kubectl wait --for=condition=Ready pod -l "$selector" -n "$namespace" --timeout=10s >/dev/null 2>&1; then + obs_log_info "Waiting for $component_name to be ready..." + all_ready=false + break + fi + done + + if [ "$all_ready" = true ]; then + obs_log_success "Monitoring stack is ready" + return 0 + fi + + sleep 10 + done + + obs_log_error "Timeout waiting for monitoring stack to be ready" + return 1 +} + +# Generate synthetic load for testing autoscaling +generate_synthetic_load() { + local base_url="$1" + local duration="${2:-60}" + local concurrent_requests="${3:-5}" + local delay="${4:-0.1}" + + if [ -z "$base_url" ]; then + obs_log_error "generate_synthetic_load requires base_url" + return 1 + fi + + obs_log_info "Generating synthetic load..." + obs_log_info "URL: $base_url, Duration: ${duration}s, Concurrent: $concurrent_requests, Delay: ${delay}s" + + if ! 
obs_command_exists curl; then + obs_log_error "curl not found" + return 1 + fi + + # Test endpoints for load generation + local endpoints="/stac/collections /stac/search?collections=noaa-emergency-response&limit=50 /raster/collections /vector/collections" + + local success_count=0 + local error_count=0 + local pids="" + + # Worker function for load generation + load_worker() { + local end_time=$(($(date +%s) + duration)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + for endpoint in $endpoints; do + local url="$base_url$endpoint" + if curl -s -f "$url" >/dev/null 2>&1; then + success_count=$((success_count + 1)) + else + error_count=$((error_count + 1)) + fi + sleep "$delay" + done + done + } + + # Start concurrent workers + local i=1 + while [ "$i" -le "$concurrent_requests" ]; do + load_worker $i & + pids="$pids $!" + i=$((i + 1)) + done + + # Wait for all workers to complete + for pid in $pids; do + wait "$pid" + done + + local total_requests=$((success_count + error_count)) + local success_rate=0 + if [ $total_requests -gt 0 ]; then + success_rate=$(( (success_count * 100) / total_requests )) + fi + + obs_log_info "Load test completed: $total_requests requests ($success_count success, $error_count errors, ${success_rate}% success rate)" + + return 0 +} + +# Get comprehensive monitoring stack status +get_monitoring_stack_status() { + local namespace + namespace=$(get_obs_namespace) + + obs_log_info "=== Monitoring Stack Status ===" + + # Check each component + local components="Prometheus:app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server Grafana:app.kubernetes.io/name=grafana prometheus-adapter:app.kubernetes.io/name=prometheus-adapter kube-state-metrics:app.kubernetes.io/name=kube-state-metrics node-exporter:app.kubernetes.io/name=prometheus-node-exporter" + + local healthy_count=0 + local total_count=5 + + for component_spec in $components; do + local component_name=${component_spec%%:*} + local selector=${component_spec#*:} + + if 
check_monitoring_deployment "$component_name" "$selector"; then + healthy_count=$((healthy_count + 1)) + fi + done + + obs_log_info "Healthy components: $healthy_count/$total_count" + + # Check HPA status + check_hpa_status "$@" + + # Check custom metrics API + validate_custom_metrics_api + + obs_log_info "=== End Monitoring Stack Status ===" + + return 0 +} + +# Cleanup monitoring port forwards +cleanup_monitoring_port_forwards() { + obs_log_info "Cleaning up monitoring port forwards..." + + # Kill any kubectl port-forward processes + pkill -f "kubectl port-forward.*prometheus" 2>/dev/null || true + pkill -f "kubectl port-forward.*grafana" 2>/dev/null || true + + obs_log_info "Port forward cleanup completed" +} + +# Test Prometheus connectivity (with port forward) +test_prometheus_connectivity() { + local namespace + namespace=$(get_obs_namespace) + local timeout="${1:-30}" + + obs_log_info "Testing Prometheus connectivity..." + + # Find Prometheus service + local prom_service + prom_service=$(kubectl get svc -n "$namespace" -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + + if [ -z "$prom_service" ]; then + obs_log_error "Prometheus service not found" + return 1 + fi + + # Setup port forward + local pf_pid + if ! 
pf_pid=$(setup_monitoring_port_forward "$prom_service" 9090 80); then + return 1 + fi + + # Test connectivity + local connected=false + local start_time + start_time=$(date +%s) + + while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do + if curl -s "http://localhost:9090/api/v1/query?query=up" >/dev/null 2>&1; then + connected=true + break + fi + sleep 2 + done + + # Cleanup port forward + kill "$pf_pid" 2>/dev/null || true + + if [ "$connected" = true ]; then + obs_log_success "Prometheus is accessible and responding" + return 0 + else + obs_log_error "Cannot connect to Prometheus API" + return 1 + fi +} + +# Validate observability prerequisites +validate_observability_prerequisites() { + obs_log_info "Validating observability prerequisites..." + + local missing_deps="" + local missing_count=0 + + # Check required tools + local required_tools="kubectl curl python3" + for tool in $required_tools; do + if ! obs_command_exists "$tool"; then + missing_deps="$missing_deps $tool" + missing_count=$((missing_count + 1)) + fi + done + + if [ $missing_count -gt 0 ]; then + obs_log_error "Missing required tools:$missing_deps" + return 1 + fi + + # Check cluster connectivity + if ! kubectl cluster-info >/dev/null 2>&1; then + obs_log_error "Cannot connect to Kubernetes cluster" + return 1 + fi + + # Check namespace exists + local namespace + namespace=$(get_obs_namespace) + if ! 
kubectl get namespace "$namespace" >/dev/null 2>&1; then + obs_log_error "Namespace $namespace does not exist" + return 1 + fi + + obs_log_success "Observability prerequisites validated" + return 0 +} + +# Functions are available when script is sourced +# Note: Function exports removed for compatibility with different shells diff --git a/scripts/test.sh b/scripts/test.sh index 805c4285..76785af1 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -22,7 +22,7 @@ fi # Show help message show_help() { cat << EOF -eoAPI Test Suite - Combined Helm and Integration Testing +eoAPI Test Suite - Combined Helm, Integration, and Observability Testing USAGE: $(basename "$0") [COMMAND] [OPTIONS] @@ -30,7 +30,8 @@ USAGE: COMMANDS: helm Run Helm tests only (lint, unit tests, template validation) integration Run integration tests only (requires deployed eoAPI) - all Run both Helm and integration tests [default] + observability Run observability and autoscaling tests only + all Run Helm, integration, and observability tests [default] check-deps Check and install dependencies only check-deployment Check eoAPI deployment status only @@ -50,12 +51,19 @@ DESCRIPTION: Integration Tests: - Deployment verification - Service readiness checks + + Observability Tests: + - Monitoring stack deployment verification (Prometheus, Grafana, etc.) 
+ - HPA (Horizontal Pod Autoscaler) configuration validation + - Metrics collection and custom metrics API testing + - Autoscaling behavior validation - API endpoint testing - Comprehensive failure debugging REQUIREMENTS: Helm Tests: helm, helm unittest plugin Integration Tests: kubectl, python/pytest, deployed eoAPI instance + Observability Tests: kubectl, python/pytest, deployed eoAPI with monitoring enabled ENVIRONMENT VARIABLES: RELEASE_NAME Override release name detection @@ -69,10 +77,12 @@ EXAMPLES: $(basename "$0") # Run all tests $(basename "$0") helm # Run only Helm tests $(basename "$0") integration # Run only integration tests + $(basename "$0") observability # Run only observability tests $(basename "$0") check-deps # Check dependencies only $(basename "$0") check-deployment # Check deployment status only $(basename "$0") all --debug # Run all tests with debug output $(basename "$0") integration --debug # Run integration tests with enhanced logging + $(basename "$0") observability --debug # Run observability tests with debug output $(basename "$0") --help # Show this help EOF @@ -82,7 +92,7 @@ EOF parse_args() { while [[ $# -gt 0 ]]; do case $1 in - helm|integration|all|check-deps|check-deployment) + helm|integration|observability|all|check-deps|check-deployment) COMMAND="$1"; shift ;; --debug) DEBUG_MODE=true; shift ;; @@ -118,6 +128,11 @@ check_integration_dependencies() { preflight_test "integration" || exit 1 } +# Check dependencies for observability tests +check_observability_dependencies() { + preflight_test "observability" || exit 1 +} + # Install Python test dependencies install_test_deps() { log_info "Installing Python test dependencies..." @@ -749,6 +764,176 @@ EOF fi } +# Run observability tests +run_observability_tests() { + log_info "=== Running Observability Tests ===" + + local python_cmd="python" + if command_exists python3; then + python_cmd="python3" + fi + + local test_dir=".github/workflows/tests" + if [ ! 
-d "$test_dir" ]; then + log_error "Test directory not found: $test_dir" + log_info "Expected observability test files:" + log_info " - $test_dir/test_observability.py" + log_info " - $test_dir/test_autoscaling.py" + return 1 + fi + + # Check if observability test files exist + local obs_tests=("test_observability.py" "test_autoscaling.py") + local available_tests=() + + for test_file in "${obs_tests[@]}"; do + if [ -f "$test_dir/$test_file" ]; then + available_tests+=("$test_dir/$test_file") + else + log_warning "Test file not found: $test_dir/$test_file" + fi + done + + if [ ${#available_tests[@]} -eq 0 ]; then + log_error "No observability test files found in $test_dir" + return 1 + fi + + # Install test dependencies + log_info "Installing Python test dependencies..." + $python_cmd -m pip install --upgrade pip >/dev/null 2>&1 || log_warning "Could not upgrade pip" + $python_cmd -m pip install pytest requests >/dev/null 2>&1 || { + log_error "Failed to install pytest and requests" + return 1 + } + + # Check monitoring stack health first + check_monitoring_stack_health + + # Set environment variables for tests + export NAMESPACE="${NAMESPACE:-eoapi}" + export RELEASE_NAME="${RELEASE_NAME:-eoapi}" + + log_info "Running observability tests..." + log_info "Namespace: $NAMESPACE" + log_info "Release: $RELEASE_NAME" + + local failed_tests=() + + # Run each test file + for test_file in "${available_tests[@]}"; do + local test_name + test_name=$(basename "$test_file" .py) + log_info "Running $test_name..." + + local test_result=0 + if [ "$DEBUG_MODE" = true ]; then + $python_cmd -m pytest "$test_file" -v --tb=short || test_result=$? + else + $python_cmd -m pytest "$test_file" -v --tb=line || test_result=$? 
+ fi + + if [ $test_result -ne 0 ]; then + failed_tests+=("$test_name") + log_error "❌ $test_name failed" + else + log_info "✅ $test_name passed" + fi + done + + # Final results + if [ ${#failed_tests[@]} -eq 0 ]; then + log_info "✅ All observability tests completed successfully!" + else + log_error "Some tests failed: ${failed_tests[*]}" + + if [ "$DEBUG_MODE" = true ]; then + show_observability_debug_info + fi + + return 1 + fi +} + +# Check monitoring stack health +check_monitoring_stack_health() { + log_info "Checking monitoring stack health..." + + local components="prometheus:app.kubernetes.io/name=prometheus grafana:app.kubernetes.io/name=grafana prometheus-adapter:app.kubernetes.io/name=prometheus-adapter kube-state-metrics:app.kubernetes.io/name=kube-state-metrics" + + local healthy_components="" + local healthy_count=0 + local unhealthy_components="" + + for component_spec in $components; do + local component_name=${component_spec%%:*} + local selector=${component_spec#*:} + + if kubectl get pods -n "$NAMESPACE" -l "$selector" >/dev/null 2>&1; then + local running_pods + running_pods=$(kubectl get pods -n "$NAMESPACE" -l "$selector" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + if [ "$running_pods" -gt 0 ]; then + healthy_components="$healthy_components $component_name" + healthy_count=$((healthy_count + 1)) + log_info "✅ $component_name is running ($running_pods pods)" + else + unhealthy_components="$unhealthy_components $component_name" + log_warning "⚠️ $component_name found but not running" + fi + else + log_info "ℹ️ $component_name not deployed (monitoring may be disabled)" + fi + done + + # Check HPA resources + if kubectl get hpa -n "$NAMESPACE" >/dev/null 2>&1; then + local hpa_count + hpa_count=$(kubectl get hpa -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l) + log_info "✅ Found $hpa_count HPA resources" + else + log_info "ℹ️ No HPA resources found (autoscaling may be disabled)" + fi + + # Check custom metrics API + 
if kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" >/dev/null 2>&1; then + log_info "✅ Custom metrics API is available" + else + log_warning "⚠️ Custom metrics API not available" + fi + + if [ $healthy_count -gt 0 ]; then + log_info "Monitoring stack health check completed" + return 0 + else + log_warning "No monitoring components found - some tests may be skipped" + return 0 # Don't fail, just warn + fi +} + +# Show observability debug information +show_observability_debug_info() { + log_info "=== Observability Debug Information ===" + + log_info "=== Monitoring Pods Status ===" + kubectl get pods -n "$NAMESPACE" -l 'app.kubernetes.io/name in (prometheus,grafana,prometheus-adapter,kube-state-metrics)' -o wide 2>/dev/null || true + + log_info "=== HPA Status ===" + kubectl get hpa -n "$NAMESPACE" -o wide 2>/dev/null || true + kubectl describe hpa -n "$NAMESPACE" 2>/dev/null || true + + log_info "=== Custom Metrics API ===" + kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" 2>/dev/null || true + + log_info "=== Pod Resource Metrics ===" + kubectl top pods -n "$NAMESPACE" 2>/dev/null || true + + log_info "=== Monitoring Services ===" + kubectl get services -n "$NAMESPACE" -l 'app.kubernetes.io/name in (prometheus,grafana)' 2>/dev/null || true + + log_info "=== Recent Events ===" + kubectl get events -n "$NAMESPACE" --sort-by='.lastTimestamp' 2>/dev/null | tail -20 || true +} + # Main function main() { parse_args "$@" @@ -799,8 +984,25 @@ main() { run_integration_tests ;; + observability) + log_info "Running observability and autoscaling tests" + + check_observability_dependencies + check_cluster + install_test_deps + detect_deployment + + # Show enhanced debugging in debug mode + if [ "$DEBUG_MODE" = true ]; then + show_debug_info + fi + + check_eoapi_deployment + + run_observability_tests + ;; all) - log_info "Running comprehensive test suite (Helm + Integration tests)" + log_info "Running comprehensive test suite (Helm + Integration + Observability 
tests)" # Run Helm tests first log_info "=== Phase 1: Helm Tests ===" @@ -825,6 +1027,12 @@ main() { setup_test_environment run_integration_tests + + # Run Observability tests third + log_info "=== Phase 3: Observability Tests ===" + check_observability_dependencies + + run_observability_tests ;; *) log_error "Unknown command: $COMMAND"