Skip to content

DevOps Pipeline

Joao Palma edited this page Oct 27, 2025 · 3 revisions

DevOps Pipeline Workflow

Complete DevOps workflow for managing microservices on Kubernetes with CI/CD pipeline, monitoring, and infrastructure automation.

Overview

This workflow is designed for DevOps teams managing:

  • Microservices Architecture: Multiple services deployed independently
  • Kubernetes Infrastructure: Container orchestration and management
  • CI/CD Pipelines: Automated testing, building, and deployment
  • Monitoring & Observability: Comprehensive system monitoring
  • Infrastructure as Code: Automated infrastructure management

Collection Setup

Repository: company/devops-pipeline

# Initialize DevOps collection: fresh local repo wired to the shared remote
mkdir devops-pipeline && cd devops-pipeline
git init
git remote add origin git@github.com:company/devops-pipeline.git

# Create collection structure: one bin/ (scripts) and docs/ subtree per
# category, config trees for k8s manifests / Helm values / Terraform,
# and support scripts grouped by pipeline stage
mkdir -p bin/{infrastructure,deployment,monitoring,security,backup}
mkdir -p docs/{infrastructure,deployment,monitoring,security,backup}
mkdir -p config/{k8s,helm,terraform}
mkdir -p scripts/{ci,cd,maintenance}

Collection Metadata (dotrun.collection.yml):

# Collection identity and ownership
name: "devops-pipeline"
description: "DevOps automation and infrastructure management tools"
author: "DevOps Team"
version: "2.0.0"
# CLIs that must be installed for the core scripts to run
# (each script re-validates its own subset via validatePkg)
dependencies:
  - kubectl
  - helm
  - docker
  - git
  - terraform
# Only needed for specific workflows (service mesh, GitOps, metrics)
optional_dependencies:
  - istioctl
  - argocd
  - flux
  - prometheus
# Mirrors the bin/<category> directory layout
categories:
  - infrastructure
  - deployment
  - monitoring
  - security
  - backup

Infrastructure Scripts

Cluster Setup

bin/infrastructure/setup-cluster.sh:

#!/usr/bin/env bash
### DOC
# Set up Kubernetes cluster with monitoring and ingress
# Configures complete production-ready cluster
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl helm

# Install the cluster's base add-ons (ingress, monitoring, TLS) via Helm.
# Arguments:
#   $1 - cluster name (default: dev-cluster), used for logging only
#   $2 - environment (default: development); "production" also installs Istio
main() {
  local cluster_name="${1:-dev-cluster}"
  local environment="${2:-development}"

  echo "🏗️  Setting up Kubernetes cluster: $cluster_name"

  # Register every chart repository first, then refresh indexes once.
  # Bug fix: previously prometheus-community and jetstack were added AFTER
  # the only `helm repo update`, so on a fresh machine their chart indexes
  # could be missing/stale and the installs below would fail.
  helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  helm repo add jetstack https://charts.jetstack.io
  helm repo update

  # Install ingress controller
  echo "🌐 Installing NGINX Ingress Controller..."
  helm install ingress-nginx ingress-nginx/ingress-nginx \
    --create-namespace \
    --namespace ingress-nginx \
    --values config/helm/ingress-nginx-values.yaml

  # Install Prometheus/Grafana monitoring stack
  echo "📊 Installing monitoring stack..."
  helm install monitoring prometheus-community/kube-prometheus-stack \
    --create-namespace \
    --namespace monitoring \
    --values config/helm/monitoring-values.yaml

  # Install cert-manager for automated TLS certificates
  echo "🔒 Installing cert-manager..."
  helm install cert-manager jetstack/cert-manager \
    --namespace cert-manager \
    --create-namespace \
    --set installCRDs=true \
    --values config/helm/cert-manager-values.yaml

  # Service mesh is only installed for production clusters
  if [[ "$environment" == "production" ]]; then
    echo "🕸️  Installing Istio service mesh..."
    dr infrastructure/setup-istio
  fi

  echo "✅ Cluster setup complete!"
  echo "🔍 Checking cluster status..."
  kubectl get nodes
  kubectl get pods -A
}

main "$@"

Terraform Infrastructure

bin/infrastructure/terraform-apply.sh:

#!/usr/bin/env bash
### DOC
# Apply Terraform infrastructure changes with safety checks
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg terraform

# Run terraform plan/apply/destroy for one environment's root module.
# Arguments:
#   $1 - environment (default: development); maps to terraform/<env>/
#   $2 - action: plan | apply | destroy (default: plan)
main() {
  local environment="${1:-development}"
  local action="${2:-plan}"

  echo "🏗️  Running Terraform $action for $environment..."

  # Each environment has its own root module under terraform/
  cd "terraform/$environment" || {
    echo "❌ Terraform directory not found: terraform/$environment"
    exit 1
  }

  echo "🔧 Initializing Terraform..."
  terraform init

  case "$action" in
    plan)
      echo "📋 Creating Terraform plan..."
      terraform plan -out="terraform.tfplan"
      ;;
    apply)
      echo "🚀 Applying Terraform changes..."
      # Only ever apply a previously reviewed plan file — never an
      # implicit plan generated at apply time.
      if [[ ! -f "terraform.tfplan" ]]; then
        echo "❌ No plan file found. Run 'plan' first."
        exit 1
      fi

      # Show plan before applying
      terraform show terraform.tfplan

      # Extra interactive confirmation gate for production
      if [[ "$environment" == "production" ]]; then
        read -p "Apply to PRODUCTION? [y/N]: " -n 1 -r
        echo
        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
          echo "❌ Production apply cancelled"
          exit 1
        fi
      fi

      terraform apply terraform.tfplan
      ;;
    destroy)
      echo "💥 Planning infrastructure destruction..."
      terraform plan -destroy -out="destroy.tfplan"

      read -p "DESTROY infrastructure in $environment? [y/N]: " -n 1 -r
      echo
      if [[ $REPLY =~ ^[Yy]$ ]]; then
        terraform apply destroy.tfplan
      else
        # Bug fix: exit non-zero on cancel. The old code fell through and
        # printed "✅ Terraform destroy completed" even though nothing was
        # destroyed, misleading both operators and CI.
        echo "❌ Destruction cancelled"
        exit 1
      fi
      ;;
    *)
      echo "Usage: dr infrastructure/terraform-apply <env> [plan|apply|destroy]"
      exit 1
      ;;
  esac

  echo "✅ Terraform $action completed for $environment!"
}

main "$@"

Deployment Scripts

Microservice Deployment

bin/deployment/deploy-microservice.sh:

#!/usr/bin/env bash
### DOC
# Deploy microservice to Kubernetes with zero-downtime
# Supports blue-green and rolling deployments
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl docker

# Build, push, and deploy one microservice.
# Arguments:
#   $1 - service name (required); must match services/<name>/ and manifests
#   $2 - environment (default: staging)
#   $3 - image tag (default: latest)
#   $4 - strategy: rolling | blue-green | canary (default: rolling)
main() {
  # Bug fix: default $1 to empty so the usage check below is reachable.
  # Under `set -u`, a bare "$1" with no arguments aborted with an
  # unbound-variable error before the friendly usage message could print.
  local service_name="${1:-}"
  local environment="${2:-staging}"
  local image_tag="${3:-latest}"
  local deployment_strategy="${4:-rolling}"

  if [[ -z "$service_name" ]]; then
    echo "Usage: dr deploy-microservice <service-name> [environment] [tag] [strategy]"
    echo "Strategies: rolling, blue-green, canary"
    exit 1
  fi

  echo "🚀 Deploying $service_name:$image_tag to $environment ($deployment_strategy)"

  # Validate deployment files exist before doing any expensive work
  local k8s_dir="config/k8s/$environment"
  if [[ ! -d "$k8s_dir" ]]; then
    echo "❌ Kubernetes manifests not found: $k8s_dir"
    exit 1
  fi

  # Build and push Docker image
  echo "🐳 Building Docker image..."
  docker build -t "registry.company.com/$service_name:$image_tag" \
    -f "services/$service_name/Dockerfile" \
    "services/$service_name/"

  docker push "registry.company.com/$service_name:$image_tag"

  # Pin the freshly pushed tag into the deployment manifest.
  # NOTE(review): `sed -i` is the GNU form; BSD/macOS sed needs `-i ''` —
  # assumes CI runners are Linux, confirm if run elsewhere.
  echo "📝 Updating deployment manifests..."
  sed -i "s|image: registry.company.com/$service_name:.*|image: registry.company.com/$service_name:$image_tag|g" \
    "$k8s_dir/$service_name/deployment.yaml"

  # Delegate the actual rollout to the strategy-specific script
  case "$deployment_strategy" in
    rolling)
      dr deployment/rolling-deploy "$service_name" "$environment"
      ;;
    blue-green)
      dr deployment/blue-green-deploy "$service_name" "$environment"
      ;;
    canary)
      dr deployment/canary-deploy "$service_name" "$environment"
      ;;
    *)
      echo "❌ Unknown deployment strategy: $deployment_strategy"
      exit 1
      ;;
  esac
}

main "$@"

Blue-Green Deployment

bin/deployment/blue-green-deploy.sh:

#!/usr/bin/env bash
### DOC
# Blue-green deployment with automatic rollback
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl

# Deploy the inactive color, health-check it, then flip the service's
# selector; roll back automatically if either health check fails.
# Arguments:
#   $1 - service name (required)
#   $2 - environment (required); also used as the namespace
main() {
  # Bug fix: default to empty so missing arguments produce a usage message
  # instead of a cryptic unbound-variable abort under `set -u`.
  local service_name="${1:-}"
  local environment="${2:-}"

  if [[ -z "$service_name" || -z "$environment" ]]; then
    echo "Usage: dr deployment/blue-green-deploy <service-name> <environment>"
    exit 1
  fi

  local namespace="$environment"

  echo "🔵🟢 Starting blue-green deployment for $service_name"

  # Determine current color from the live service selector;
  # default to "blue" when the service (or selector) does not exist yet.
  local current_color
  current_color=$(kubectl get service "$service_name" -n "$namespace" \
    -o jsonpath='{.spec.selector.color}' 2>/dev/null || echo "blue")

  local new_color
  if [[ "$current_color" == "blue" ]]; then
    new_color="green"
  else
    new_color="blue"
  fi

  echo "📍 Current: $current_color, Deploying: $new_color"

  # Deploy new version alongside the live one
  echo "🚀 Deploying $new_color version..."
  kubectl apply -f "config/k8s/$environment/$service_name/deployment-$new_color.yaml" -n "$namespace"

  # Wait for the new deployment to become ready
  echo "⏳ Waiting for $new_color deployment to be ready..."
  kubectl rollout status deployment/"$service_name-$new_color" -n "$namespace" --timeout=300s

  # Health check new version BEFORE any traffic reaches it
  echo "🏥 Running health checks on $new_color version..."
  if ! dr deployment/health-check "$service_name-$new_color" "$environment"; then
    echo "❌ Health check failed, rolling back..."
    kubectl delete -f "config/k8s/$environment/$service_name/deployment-$new_color.yaml" -n "$namespace"
    exit 1
  fi

  # Switch traffic by repointing the service selector (quoted expansion —
  # the original interpolated $new_color unquoted into the JSON payload)
  echo "🔄 Switching traffic to $new_color version..."
  kubectl patch service "$service_name" -n "$namespace" \
    -p '{"spec":{"selector":{"color":"'"$new_color"'"}}}'

  # Give endpoints time to propagate before the post-switch check
  sleep 10

  # Final health check against the service (now serving the new color)
  echo "🔍 Final health check after traffic switch..."
  if dr deployment/health-check "$service_name" "$environment"; then
    echo "✅ Blue-green deployment successful!"

    # Clean up old version; tolerate it already being gone
    echo "🧹 Cleaning up $current_color version..."
    kubectl delete deployment "$service_name-$current_color" -n "$namespace" || true

    echo "🎉 Deployment complete: $service_name is now running $new_color version"
  else
    # Revert the selector to the old color, then remove the bad deployment
    echo "❌ Final health check failed, rolling back..."
    kubectl patch service "$service_name" -n "$namespace" \
      -p '{"spec":{"selector":{"color":"'"$current_color"'"}}}'
    kubectl delete deployment "$service_name-$new_color" -n "$namespace"
    exit 1
  fi
}

main "$@"

Rollback Script

bin/deployment/rollback.sh:

#!/usr/bin/env bash
### DOC
# Rollback deployment to previous version
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl

# Roll a deployment back to its previous revision and verify its health.
# Arguments:
#   $1 - service name (required)
#   $2 - environment (default: staging); also used as the namespace
main() {
  # Bug fix: default $1 to empty so a missing argument yields a usage
  # message rather than an unbound-variable abort under `set -u`.
  local service_name="${1:-}"
  local environment="${2:-staging}"

  if [[ -z "$service_name" ]]; then
    echo "Usage: dr deployment/rollback <service-name> [environment]"
    exit 1
  fi

  local namespace="$environment"

  echo "↩️  Rolling back $service_name in $environment..."

  # Show rollout history so the operator can see what they're reverting to
  echo "📜 Deployment history:"
  kubectl rollout history deployment/"$service_name" -n "$namespace"

  # Rollback to previous revision
  echo "🔄 Rolling back to previous version..."
  kubectl rollout undo deployment/"$service_name" -n "$namespace"

  # Wait for rollback to complete
  echo "⏳ Waiting for rollback to complete..."
  kubectl rollout status deployment/"$service_name" -n "$namespace" --timeout=300s

  # Verify the rolled-back version is actually serving
  echo "🏥 Running health check after rollback..."
  if dr deployment/health-check "$service_name" "$environment"; then
    echo "✅ Rollback successful!"
  else
    echo "❌ Rollback health check failed"
    exit 1
  fi
}

main "$@"

Monitoring Scripts

Cluster Health Check

bin/monitoring/check-cluster-health.sh:

#!/usr/bin/env bash
### DOC
# Comprehensive cluster health check with alerting
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
# Bug fix: jq is required by the certificate-expiration check below but
# was never validated.
validatePkg kubectl jq

# Check nodes, pods, resources, PVs, and certificates; write a report to
# /tmp and alert Slack (if SLACK_WEBHOOK_URL is set). Returns non-zero
# when any issue is found.
main() {
  echo "🏥 Checking cluster health..."

  local issues=0
  local report_file="/tmp/cluster-health-$(date +%Y%m%d-%H%M%S).txt"

  echo "📊 Cluster Health Report - $(date)" >"$report_file"
  echo "=================================" >>"$report_file"

  # --- Node status ---------------------------------------------------------
  echo "🖥️  Checking node status..."
  echo -e "\n### Node Status ###" >>"$report_file"

  # Bug fix: `grep | wc -l` fails under pipefail when grep matches nothing
  # (grep exits 1), which killed the script on the HEALTHY path. `grep -c
  # || true` prints the count and swallows the no-match status.
  local not_ready_nodes
  not_ready_nodes=$(kubectl get nodes --no-headers | { grep -cv " Ready " || true; })
  if [[ $not_ready_nodes -gt 0 ]]; then
    echo "$not_ready_nodes nodes not ready!" | tee -a "$report_file"
    kubectl get nodes | { grep -v " Ready " || true; } >>"$report_file"
    # Bug fix: `((issues++))` returns status 1 when issues is 0 (the
    # pre-increment value), tripping `set -e` the first time an issue is
    # found. Plain arithmetic assignment has no such trap.
    issues=$((issues + 1))
  else
    echo "✅ All nodes ready" | tee -a "$report_file"
  fi

  # --- Pod status ----------------------------------------------------------
  echo "🐳 Checking pod status..."
  echo -e "\n### Pod Status ###" >>"$report_file"

  local failed_pods
  failed_pods=$(kubectl get pods -A --no-headers | { grep -cE "(Error|CrashLoopBackOff|Failed|Pending)" || true; })
  if [[ $failed_pods -gt 0 ]]; then
    echo "$failed_pods pods in failed state!" | tee -a "$report_file"
    kubectl get pods -A | { grep -E "(Error|CrashLoopBackOff|Failed|Pending)" || true; } >>"$report_file"
    issues=$((issues + 1))
  else
    echo "✅ All pods healthy" | tee -a "$report_file"
  fi

  # --- Resource usage (needs metrics-server) -------------------------------
  echo "📊 Checking resource usage..."
  echo -e "\n### Resource Usage ###" >>"$report_file"

  if kubectl top nodes &>/dev/null; then
    kubectl top nodes >>"$report_file"

    # Flag nodes whose CPU% column exceeds 80 (awk's numeric coercion
    # ignores the trailing "%")
    local high_cpu_nodes
    high_cpu_nodes=$(kubectl top nodes --no-headers | awk '$3 > 80 {print $1}' | wc -l)
    if [[ $high_cpu_nodes -gt 0 ]]; then
      echo "⚠️  $high_cpu_nodes nodes with high CPU usage" | tee -a "$report_file"
      issues=$((issues + 1))
    fi
  else
    echo "⚠️  Metrics server not available" | tee -a "$report_file"
  fi

  # --- Persistent volumes --------------------------------------------------
  echo "💾 Checking persistent volumes..."
  echo -e "\n### Persistent Volumes ###" >>"$report_file"

  local pv_issues
  pv_issues=$(kubectl get pv --no-headers | { grep -cv "Bound" || true; })
  if [[ $pv_issues -gt 0 ]]; then
    echo "$pv_issues persistent volumes not bound!" | tee -a "$report_file"
    kubectl get pv | { grep -v "Bound" || true; } >>"$report_file"
    issues=$((issues + 1))
  else
    echo "✅ All persistent volumes bound" | tee -a "$report_file"
  fi

  # --- Certificate expiration (cert-manager Certificates, < 30 days) ------
  echo "🔒 Checking certificate expiration..."
  echo -e "\n### Certificate Status ###" >>"$report_file"

  local expiring_certs
  expiring_certs=$(kubectl get certificates -A -o json | jq -r '.items[] | select(.status.notAfter != null) | select(((.status.notAfter | fromdate) - now) < (30 * 24 * 3600)) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
  if [[ $expiring_certs -gt 0 ]]; then
    echo "⚠️  $expiring_certs certificates expiring within 30 days" | tee -a "$report_file"
    kubectl get certificates -A -o json | jq -r '.items[] | select(.status.notAfter != null) | select(((.status.notAfter | fromdate) - now) < (30 * 24 * 3600)) | "\(.metadata.namespace)/\(.metadata.name): expires \(.status.notAfter)"' >>"$report_file"
  else
    echo "✅ No certificates expiring soon" | tee -a "$report_file"
  fi

  # --- Summary -------------------------------------------------------------
  echo -e "\n### Summary ###" >>"$report_file"
  if [[ $issues -eq 0 ]]; then
    echo "🎉 Cluster is healthy!" | tee -a "$report_file"
    return 0
  else
    echo "⚠️  Found $issues issues in cluster" | tee -a "$report_file"

    # Alerting is opt-in via environment
    if [[ -n "${SLACK_WEBHOOK_URL:-}" ]]; then
      dr monitoring/send-alert "Cluster Health Issues" "$report_file"
    fi

    return 1
  fi
}

main "$@"

Prometheus Metrics

bin/monitoring/prometheus-query.sh:

#!/usr/bin/env bash
### DOC
# Query Prometheus metrics with common queries
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg curl jq

# Run a named shortcut query — or a raw PromQL string — against Prometheus.
# Arguments:
#   $1 - cpu-usage | memory-usage | pod-restarts | disk-usage, or raw PromQL
# Environment:
#   PROMETHEUS_URL - server base URL (default: http://localhost:9090)
main() {
  # Bug fix: default $1 to empty so a missing argument yields a usage
  # message instead of an unbound-variable abort under `set -u`.
  local query="${1:-}"
  local prometheus_url="${PROMETHEUS_URL:-http://localhost:9090}"

  if [[ -z "$query" ]]; then
    echo "Usage: dr monitoring/prometheus-query <cpu-usage|memory-usage|pod-restarts|disk-usage|PromQL>"
    exit 1
  fi

  echo "📊 Querying Prometheus: $query"

  # Map shortcut names to PromQL; anything unrecognized passes through
  # verbatim as a raw query.
  local prom_query
  case "$query" in
    cpu-usage)
      prom_query="100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)"
      ;;
    memory-usage)
      prom_query="(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100"
      ;;
    pod-restarts)
      prom_query="increase(kube_pod_container_status_restarts_total[1h])"
      ;;
    disk-usage)
      prom_query="100 - ((node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100)"
      ;;
    *)
      prom_query="$query"
      ;;
  esac

  # Execute the instant query; --data-urlencode handles PromQL metacharacters
  local result
  result=$(curl -s "$prometheus_url/api/v1/query" \
    --data-urlencode "query=$prom_query" | jq -r '.data.result')

  if [[ "$result" == "[]" ]]; then
    echo "❌ No data returned for query"
    exit 1
  fi

  # Label each sample by instance (node metrics) or pod (workload metrics)
  echo "📈 Results:"
  echo "$result" | jq -r '.[] | "\(.metric.instance // .metric.pod): \(.value[1])"'
}

main "$@"

Security Scripts

Security Scan

bin/security/scan-cluster.sh:

#!/usr/bin/env bash
### DOC
# Security scan of cluster and workloads
### DOC
set -euo pipefail

source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl

main() {
  echo "🔒 Running security scan..."

  # Check for privileged containers
  echo "🚨 Checking for privileged containers..."
  local privileged_pods
  privileged_pods=$(kubectl get pods -A -o json | jq -r '.items[] | select(.spec.containers[]?.securityContext.privileged == true) | "\(.metadata.namespace)/\(.metadata.name)"')

  if [[ -n "$privileged_pods" ]]; then
    echo "⚠️  Found privileged containers:"
    echo "$privileged_pods"
  else
    echo "✅ No privileged containers found"
  fi

  # Check for containers running as root
  echo "👤 Checking for containers running as root..."
  local root_containers
  root_containers=$(kubectl get pods -A -o json | jq -r '.items[] | select(.spec.containers[]?.securityContext.runAsUser == 0 or (.spec.containers[]?.securityContext.runAsUser == null and .spec.securityContext.runAsUser == null)) | "\(.metadata.namespace)/\(.metadata.name)"')

  if [[ -n "$root_containers" ]]; then
    echo "⚠️  Found containers running as root:"
    echo "$root_containers"
  else
    echo "✅ No containers running as root"
  fi

  # Check network policies
  echo "🌐 Checking network policies..."
  local namespaces_without_netpol
  namespaces_without_netpol=$(kubectl get namespaces -o json | jq -r '.items[] | select(.metadata.name != "kube-system" and .metadata.name != "kube-public" and .metadata.name != "default") | .metadata.name' | while read -r ns; do
    if ! kubectl get networkpolicies -n "$ns" &>/dev/null || [[ $(kubectl get networkpolicies -n "$ns" --no-headers | wc -l) -eq 0 ]]; then
      echo "$ns"
    fi
  done)

  if [[ -n "$namespaces_without_netpol" ]]; then
    echo "⚠️  Namespaces without network policies:"
    echo "$namespaces_without_netpol"
  else
    echo "✅ All namespaces have network policies"
  fi

  echo "✅ Security scan completed"
}

main "$@"

Team Usage

Daily Operations

# DevOps team setup: import the collection under the "ops" namespace
dr import git@github.com:company/devops-pipeline.git ops

# Daily monitoring: cluster health, key metrics, and a security pass
dr ops/monitoring/check-cluster-health
dr ops/monitoring/prometheus-query cpu-usage
dr ops/security/scan-cluster

# Deployment operations: ship a tagged service build and verify it
dr ops/deployment/deploy-microservice auth-service staging v1.2.3
dr ops/deployment/health-check auth-service staging

# Infrastructure management: review a plan, provision a cluster
dr ops/infrastructure/terraform-apply staging plan
dr ops/infrastructure/setup-cluster prod-cluster production

Incident Response

# Quick cluster assessment: surface failing nodes/pods/PVs in one report
dr ops/monitoring/check-cluster-health

# Check specific service
dr ops/monitoring/service-status auth-service production

# Rollback if needed (reverts to the previous deployment revision)
dr ops/deployment/rollback auth-service production

# Scale resources (here: 10 replicas)
dr ops/deployment/scale-service auth-service production 10

Maintenance Workflows

# Backup operations: cluster state (etcd) and workload data (PVs)
dr ops/backup/backup-etcd
dr ops/backup/backup-persistent-volumes

# Updates and patches
dr ops/infrastructure/update-cluster
dr ops/deployment/update-all-services staging

# Security maintenance: rescan workloads and rotate expiring certificates
dr ops/security/scan-cluster
dr ops/security/update-certificates

This DevOps pipeline workflow provides comprehensive automation for Kubernetes-based infrastructure while maintaining security, reliability, and observability at scale.

Clone this wiki locally