Skip to content

DevOps Team Workflow

Joao Palma edited this page Oct 27, 2025 · 2 revisions

DevOps Team Workflow

This workflow demonstrates how DevOps teams can leverage dotrun to automate infrastructure management, deployment processes, and maintain consistent environments across development, staging, and production.

Overview

A DevOps workflow with dotrun enables:

  • Automated infrastructure provisioning and management
  • Consistent deployment processes across environments
  • Centralized script management for the entire team
  • Streamlined incident response and troubleshooting
  • Integration with CI/CD pipelines

Infrastructure Management Scripts

1. Environment Provisioning

AWS Environment Setup

infrastructure/aws/provision-environment.sh

#!/bin/bash
# Provision complete AWS environment
#
# Usage: provision-environment.sh [environment] [region] [project_name]
# Generates a Terraform workspace for the environment, plans, and (after
# interactive confirmation) applies it, then configures kubectl for the
# resulting EKS cluster.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../utilities/colors.sh"
source "$SCRIPT_DIR/../../utilities/logging.sh"
source "$SCRIPT_DIR/../../utilities/aws-helpers.sh"

ENVIRONMENT=${1:-"development"}
REGION=${2:-"us-west-2"}
PROJECT_NAME=${3:-"myapp"}

log_info "Provisioning AWS environment: $ENVIRONMENT in $REGION"

# Check AWS CLI and credentials (helper from aws-helpers.sh)
check_aws_credentials

# Create Terraform workspace
WORKSPACE_DIR="terraform/environments/$ENVIRONMENT"
mkdir -p "$WORKSPACE_DIR"

# Generate Terraform configuration. The unquoted EOF delimiter is
# intentional: ${PROJECT_NAME}, ${ENVIRONMENT} and ${REGION} are expanded
# by the shell; everything else is literal Terraform.
cat >"$WORKSPACE_DIR/main.tf" <<EOF
terraform {
  required_version = ">= 1.0"
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }
  
  backend "s3" {
    bucket = "${PROJECT_NAME}-terraform-state"
    key    = "${ENVIRONMENT}/terraform.tfstate"
    region = "${REGION}"
  }
}

provider "aws" {
  region = var.aws_region
}

# Variables
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "${REGION}"
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "${ENVIRONMENT}"
}

variable "project_name" {
  description = "Project name"
  type        = string
  default     = "${PROJECT_NAME}"
}

# VPC and Networking
module "vpc" {
  source = "../../modules/vpc"
  
  environment  = var.environment
  project_name = var.project_name
  aws_region   = var.aws_region
}

# EKS Cluster
module "eks" {
  source = "../../modules/eks"
  
  environment    = var.environment
  project_name   = var.project_name
  vpc_id         = module.vpc.vpc_id
  subnet_ids     = module.vpc.private_subnet_ids
  
  depends_on = [module.vpc]
}

# RDS Database
module "database" {
  source = "../../modules/rds"
  
  environment     = var.environment
  project_name    = var.project_name
  vpc_id          = module.vpc.vpc_id
  subnet_ids      = module.vpc.database_subnet_ids
  security_groups = [module.eks.database_security_group_id]
  
  depends_on = [module.vpc]
}

# S3 Buckets
module "storage" {
  source = "../../modules/s3"
  
  environment  = var.environment
  project_name = var.project_name
}

# Outputs
output "eks_cluster_name" {
  value = module.eks.cluster_name
}

output "database_endpoint" {
  value = module.database.endpoint
}

output "s3_bucket_names" {
  value = module.storage.bucket_names
}
EOF

# Initialize and apply Terraform
cd "$WORKSPACE_DIR"
log_info "Initializing Terraform..."
terraform init

log_info "Planning Terraform deployment..."
terraform plan -out=tfplan

echo -e "${YELLOW}Review the Terraform plan above.${NC}"
read -r -p "Proceed with deployment? (y/N): " confirm

if [[ $confirm == [yY] ]]; then
  log_info "Applying Terraform configuration..."
  terraform apply tfplan
  # The plan file is consumed by apply; remove it so a stale plan is
  # never re-applied later.
  rm -f tfplan

  # Save outputs
  terraform output -json >outputs.json

  log_success "Environment $ENVIRONMENT provisioned successfully!"

  # Configure kubectl only when the EKS module produced a cluster name.
  # Capture the output once (the original invoked terraform twice) and
  # quote the expansions so values are passed to the AWS CLI intact.
  CLUSTER_NAME=$(terraform output -raw eks_cluster_name 2>/dev/null || true)
  if [[ -n "$CLUSTER_NAME" ]]; then
    log_info "Configuring kubectl..."
    aws eks update-kubeconfig --region "$REGION" --name "$CLUSTER_NAME"
  fi
else
  log_info "Deployment cancelled"
  rm -f tfplan
fi

Kubernetes Cluster Management

infrastructure/k8s/cluster-manager.sh

#!/bin/bash
# Kubernetes cluster management

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../utilities/colors.sh"
source "$SCRIPT_DIR/../../utilities/logging.sh"

show_menu() {
  # Render the interactive menu; color variables come from colors.sh.
  echo -e "${BLUE}=== Kubernetes Cluster Manager ===${NC}"
  local options=(
    "1) List clusters"
    "2) Switch context"
    "3) Deploy application"
    "4) Scale deployment"
    "5) View logs"
    "6) Port forward"
    "7) Execute command in pod"
    "8) Cluster health check"
    "9) Resource usage"
    "10) Backup ETCD"
    "0) Exit"
  )
  printf '%s\n' "${options[@]}"
}

list_clusters() {
  # Show every kubectl context configured on this machine.
  log_info "Available clusters:"
  kubectl config get-contexts
}

switch_context() {
  # Interactively switch the active kubectl context.
  kubectl config get-contexts
  read -r -p "Enter context name: " context
  kubectl config use-context "$context"
  log_success "Switched to context: $context"
}

deploy_application() {
  # Apply a manifest file or a directory of manifests to a namespace.
  echo -e "${BLUE}Application Deployment${NC}"
  read -r -p "Namespace: " namespace
  read -r -p "Deployment file/directory: " deploy_path

  # kubectl apply accepts both files and directories, so a single
  # existence check replaces the two duplicated -d/-f branches.
  if [[ -e "$deploy_path" ]]; then
    kubectl apply -f "$deploy_path" -n "$namespace"
  else
    log_error "Deploy path not found: $deploy_path"
    return 1
  fi

  log_success "Application deployed to namespace: $namespace"
}

scale_deployment() {
  # Change the replica count of an existing deployment.
  kubectl get deployments --all-namespaces
  read -r -p "Namespace: " namespace
  read -r -p "Deployment name: " deployment
  read -r -p "Replica count: " replicas

  kubectl scale deployment "$deployment" --replicas="$replicas" -n "$namespace"
  log_success "Deployment $deployment scaled to $replicas replicas"
}

view_logs() {
  # Follow the logs of the first pod whose name matches a (partial) name.
  kubectl get pods --all-namespaces
  read -r -p "Namespace: " namespace
  read -r -p "Pod name (or partial name): " pod_name

  # --no-headers keeps the header row from matching; grep -F treats the
  # user's input as a literal string rather than a regex.
  full_pod_name=$(kubectl get pods -n "$namespace" --no-headers | grep -F -- "$pod_name" | head -1 | awk '{print $1}')

  if [[ -n "$full_pod_name" ]]; then
    echo -e "${BLUE}Logs for $full_pod_name:${NC}"
    kubectl logs -f "$full_pod_name" -n "$namespace"
  else
    log_error "Pod not found: $pod_name"
  fi
}

port_forward() {
  # Forward a local port to a service port (blocks until interrupted).
  kubectl get services --all-namespaces
  read -r -p "Namespace: " namespace
  read -r -p "Service name: " service
  read -r -p "Local port: " local_port
  read -r -p "Service port: " service_port

  log_info "Port forwarding $local_port:$service_port for service $service"
  kubectl port-forward service/"$service" "$local_port:$service_port" -n "$namespace"
}

exec_pod_command() {
  # Open an interactive command (default shell) inside a pod.
  kubectl get pods --all-namespaces
  read -r -p "Namespace: " namespace
  read -r -p "Pod name: " pod_name
  read -r -p "Command (default: /bin/bash): " command
  command=${command:-"/bin/bash"}

  # $command is intentionally unquoted so multi-word commands word-split
  # into separate arguments for kubectl exec.
  kubectl exec -it "$pod_name" -n "$namespace" -- $command
}

cluster_health_check() {
  # Quick cluster overview: nodes, system pods, cluster info, metrics.
  log_info "Cluster Health Check"

  printf '\n%sNode Status:%s\n' "$BLUE" "$NC"
  kubectl get nodes

  printf '\n%sSystem Pods:%s\n' "$BLUE" "$NC"
  kubectl get pods -n kube-system

  printf '\n%sCluster Info:%s\n' "$BLUE" "$NC"
  kubectl cluster-info

  printf '\n%sResource Usage:%s\n' "$BLUE" "$NC"
  kubectl top nodes 2>/dev/null || echo "Metrics server not available"
}

resource_usage() {
  # Report node/pod resource consumption plus storage objects.
  log_info "Resource Usage Report"

  printf '\n%sNode Resource Usage:%s\n' "$BLUE" "$NC"
  kubectl top nodes 2>/dev/null || echo "Metrics server not available"

  printf '\n%sPod Resource Usage (Top 10):%s\n' "$BLUE" "$NC"
  kubectl top pods --all-namespaces --sort-by=cpu 2>/dev/null | head -11 || echo "Metrics server not available"

  printf '\n%sPersistent Volume Claims:%s\n' "$BLUE" "$NC"
  kubectl get pvc --all-namespaces

  printf '\n%sStorage Classes:%s\n' "$BLUE" "$NC"
  kubectl get storageclass
}

backup_etcd() {
  # Snapshot the cluster's etcd database to a timestamped file in /tmp.
  # NOTE(review): assumes etcd is reachable on 127.0.0.1:2379 with
  # kubeadm's default certificate paths, and that etcdctl is installed —
  # managed clusters (EKS/GKE/AKS) expose no etcd endpoint; confirm for
  # your setup.
  log_info "ETCD Backup"
  timestamp=$(date +%Y%m%d_%H%M%S)
  backup_path="/tmp/etcd-backup-$timestamp"

  # This assumes ETCD is accessible - adjust for your setup
  ETCDCTL_API=3 etcdctl snapshot save "$backup_path.db" \
    --endpoints=https://127.0.0.1:2379 \
    --cacert=/etc/kubernetes/pki/etcd/ca.crt \
    --cert=/etc/kubernetes/pki/etcd/server.crt \
    --key=/etc/kubernetes/pki/etcd/server.key

  log_success "ETCD backup saved to: $backup_path.db"
}

# Main menu loop — dispatch user choices until Exit (0) is selected.
while true; do
  show_menu
  read -r -p "Choose option: " choice

  case $choice in
    1) list_clusters ;;
    2) switch_context ;;
    3) deploy_application ;;
    4) scale_deployment ;;
    5) view_logs ;;
    6) port_forward ;;
    7) exec_pod_command ;;
    8) cluster_health_check ;;
    9) resource_usage ;;
    10) backup_etcd ;;
    0) exit 0 ;;
    *) echo -e "${RED}Invalid option${NC}" ;;
  esac

  echo ""
  read -r -p "Press Enter to continue..."
  clear
done

2. Deployment Automation

Multi-Environment Deployment

deployment/deploy-app.sh

#!/bin/bash
# Multi-environment application deployment

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"

# Configuration
# Positional args: app name, target environment, image tag.
APP_NAME=${1:-"myapp"}
ENVIRONMENT=${2:-"development"}
VERSION=${3:-"latest"}
# One namespace per app+environment keeps deployments isolated.
NAMESPACE="$APP_NAME-$ENVIRONMENT"

# Environment-specific configurations
# Replicas and resource requests/limits scale up from dev to production;
# any other environment name is rejected.
case $ENVIRONMENT in
  "development")
    REPLICAS=1
    RESOURCES_REQUESTS_CPU="100m"
    RESOURCES_REQUESTS_MEMORY="128Mi"
    RESOURCES_LIMITS_CPU="500m"
    RESOURCES_LIMITS_MEMORY="512Mi"
    ;;
  "staging")
    REPLICAS=2
    RESOURCES_REQUESTS_CPU="200m"
    RESOURCES_REQUESTS_MEMORY="256Mi"
    RESOURCES_LIMITS_CPU="1000m"
    RESOURCES_LIMITS_MEMORY="1Gi"
    ;;
  "production")
    REPLICAS=3
    RESOURCES_REQUESTS_CPU="500m"
    RESOURCES_REQUESTS_MEMORY="512Mi"
    RESOURCES_LIMITS_CPU="2000m"
    RESOURCES_LIMITS_MEMORY="2Gi"
    ;;
  *)
    log_error "Unknown environment: $ENVIRONMENT"
    exit 1
    ;;
esac

log_info "Deploying $APP_NAME version $VERSION to $ENVIRONMENT"

# Pre-deployment checks
check_prerequisites() {
  # Verify cluster access, ensure the target namespace exists, and note
  # which image tag is about to be rolled out.
  log_info "Running pre-deployment checks..."

  # Check kubectl access
  kubectl cluster-info &>/dev/null || {
    log_error "Cannot connect to Kubernetes cluster"
    exit 1
  }

  # Check if namespace exists
  kubectl get namespace "$NAMESPACE" &>/dev/null || {
    log_info "Creating namespace: $NAMESPACE"
    kubectl create namespace "$NAMESPACE"
  }

  # Check if image exists
  if [[ "$VERSION" != "latest" ]]; then
    # This would typically check your container registry
    log_info "Verifying image: $APP_NAME:$VERSION"
  fi

  log_success "Pre-deployment checks passed"
}

# Generate Kubernetes manifests
generate_manifests() {
  # Render Deployment/Service (and, for production, Ingress) manifests
  # into a temp directory using the environment settings chosen above.
  # The unquoted EOF delimiters are intentional: shell variables are
  # expanded into the generated YAML.
  log_info "Generating Kubernetes manifests..."

  MANIFEST_DIR="/tmp/$APP_NAME-$ENVIRONMENT-deploy"
  mkdir -p "$MANIFEST_DIR"

  # Deployment manifest
  cat >"$MANIFEST_DIR/deployment.yaml" <<EOF
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $APP_NAME
  namespace: $NAMESPACE
  labels:
    app: $APP_NAME
    environment: $ENVIRONMENT
spec:
  replicas: $REPLICAS
  selector:
    matchLabels:
      app: $APP_NAME
  template:
    metadata:
      labels:
        app: $APP_NAME
        environment: $ENVIRONMENT
    spec:
      containers:
      - name: $APP_NAME
        image: $APP_NAME:$VERSION
        ports:
        - containerPort: 3000
        env:
        - name: NODE_ENV
          value: $ENVIRONMENT
        - name: PORT
          value: "3000"
        resources:
          requests:
            cpu: $RESOURCES_REQUESTS_CPU
            memory: $RESOURCES_REQUESTS_MEMORY
          limits:
            cpu: $RESOURCES_LIMITS_CPU
            memory: $RESOURCES_LIMITS_MEMORY
        livenessProbe:
          httpGet:
            path: /health
            port: 3000
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /ready
            port: 3000
          initialDelaySeconds: 5
          periodSeconds: 5
EOF

  # Service manifest
  cat >"$MANIFEST_DIR/service.yaml" <<EOF
apiVersion: v1
kind: Service
metadata:
  name: $APP_NAME-service
  namespace: $NAMESPACE
  labels:
    app: $APP_NAME
spec:
  selector:
    app: $APP_NAME
  ports:
  - protocol: TCP
    port: 80
    targetPort: 3000
  type: ClusterIP
EOF

  # Ingress manifest (for production)
  # NOTE(review): the hostname is hard-coded to $APP_NAME.company.com —
  # confirm it matches your DNS / ingress controller setup.
  if [[ "$ENVIRONMENT" == "production" ]]; then
    cat >"$MANIFEST_DIR/ingress.yaml" <<EOF
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: $APP_NAME-ingress
  namespace: $NAMESPACE
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  rules:
  - host: $APP_NAME.company.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: $APP_NAME-service
            port:
              number: 80
EOF
  fi

  log_success "Manifests generated in $MANIFEST_DIR"
}

# Deploy application
deploy_application() {
  # Apply the generated manifests and block until the rollout completes
  # (fails after 300s if the deployment never becomes ready).
  log_info "Deploying application to Kubernetes..."

  # Apply manifests
  kubectl apply -f "$MANIFEST_DIR/"

  # Wait for deployment to be ready
  log_info "Waiting for deployment to be ready..."
  kubectl rollout status deployment/$APP_NAME -n "$NAMESPACE" --timeout=300s

  # Verify deployment
  log_info "Verifying deployment..."
  kubectl get pods -n "$NAMESPACE" -l app="$APP_NAME"

  log_success "Deployment completed successfully!"
}

# Post-deployment tests
run_post_deployment_tests() {
  # Smoke-test the freshly deployed service via its /health endpoint.
  # For non-production environments a temporary port-forward is used.
  log_info "Running post-deployment tests..."

  # Get service endpoint
  if [[ "$ENVIRONMENT" == "production" ]]; then
    ENDPOINT="http://$APP_NAME.company.com"
  else
    # Port forward for testing (also reaped by cleanup_and_exit)
    kubectl port-forward service/"$APP_NAME-service" 8080:80 -n "$NAMESPACE" &
    PORT_FORWARD_PID=$!
    sleep 5
    ENDPOINT="http://localhost:8080"
  fi

  # Health check — bounded so a hung endpoint cannot stall the pipeline
  if curl -f --max-time 10 "$ENDPOINT/health" &>/dev/null; then
    log_success "Health check passed"
  else
    log_error "Health check failed"
    cleanup_and_exit 1
  fi

  # Cleanup port forward; clear the PID so later cleanup does not try to
  # kill an already-reaped process.
  if [[ -n "$PORT_FORWARD_PID" ]]; then
    kill "$PORT_FORWARD_PID" 2>/dev/null || true
    PORT_FORWARD_PID=""
  fi

  log_success "Post-deployment tests passed"
}

# Cleanup function
# Runs on every exit path via the EXIT trap. The default exit code is
# the status of the command that triggered the exit ($?), not 0 — the
# original default of 0 made the trap mask real failures under 'set -e'.
cleanup_and_exit() {
  local exit_code=${1:-$?}

  # Kill port forward if running
  if [[ -n "$PORT_FORWARD_PID" ]]; then
    kill "$PORT_FORWARD_PID" 2>/dev/null || true
  fi

  # Clean up temporary files
  rm -rf "$MANIFEST_DIR" 2>/dev/null || true

  exit "$exit_code"
}

# Main deployment flow
main() {
  check_prerequisites
  generate_manifests
  deploy_application
  run_post_deployment_tests

  log_success "🚀 Deployment of $APP_NAME:$VERSION to $ENVIRONMENT completed successfully!"

  # Show access information
  echo -e "\n${BLUE}Access Information:${NC}"
  if [[ "$ENVIRONMENT" == "production" ]]; then
    echo "  URL: http://$APP_NAME.company.com"
  else
    echo "  Port forward: kubectl port-forward service/$APP_NAME-service 8080:80 -n $NAMESPACE"
    echo "  Then access: http://localhost:8080"
  fi
}

# Trap for cleanup — forward the real exit status so a failure under
# 'set -e' is not converted into exit 0 by the trap handler.
trap 'cleanup_and_exit $?' EXIT

# Run main function
main

Blue-Green Deployment

deployment/blue-green-deploy.sh

#!/bin/bash
# Blue-green deployment strategy

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"

APP_NAME=${1:-"myapp"}
ENVIRONMENT=${2:-"production"}
NEW_VERSION=${3:-"latest"}
NAMESPACE="$APP_NAME-$ENVIRONMENT"

# Determine current and new colors
# The live color is read from the service's selector; if the service
# does not exist yet we default to "blue" so the first deploy targets
# "green".
CURRENT_COLOR=$(kubectl get service "$APP_NAME-service" -n "$NAMESPACE" -o jsonpath='{.spec.selector.color}' 2>/dev/null || echo "blue")
if [[ "$CURRENT_COLOR" == "blue" ]]; then
  NEW_COLOR="green"
else
  NEW_COLOR="blue"
fi

log_info "Blue-Green Deployment: $APP_NAME:$NEW_VERSION"
log_info "Current active: $CURRENT_COLOR, Deploying to: $NEW_COLOR"

# Deploy new version to inactive color
deploy_new_version() {
  # Create (or update) the color-suffixed deployment for the candidate
  # version; traffic stays on the current color until switch_traffic.
  # The unquoted EOF is intentional: shell variables expand into the YAML.
  log_info "Deploying new version to $NEW_COLOR environment..."

  # Create deployment with new color
  cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $APP_NAME-$NEW_COLOR
  namespace: $NAMESPACE
  labels:
    app: $APP_NAME
    color: $NEW_COLOR
spec:
  replicas: 3
  selector:
    matchLabels:
      app: $APP_NAME
      color: $NEW_COLOR
  template:
    metadata:
      labels:
        app: $APP_NAME
        color: $NEW_COLOR
    spec:
      containers:
      - name: $APP_NAME
        image: $APP_NAME:$NEW_VERSION
        ports:
        - containerPort: 3000
        env:
        - name: NODE_ENV
          value: $ENVIRONMENT
        livenessProbe:
          httpGet:
            path: /health
            port: 3000
          initialDelaySeconds: 30
        readinessProbe:
          httpGet:
            path: /ready
            port: 3000
          initialDelaySeconds: 10
EOF

  # Wait for new deployment (fails after 300s if it never becomes ready)
  kubectl rollout status deployment/$APP_NAME-$NEW_COLOR -n "$NAMESPACE" --timeout=300s
  log_success "New version deployed to $NEW_COLOR"
}

# Test new version
test_new_version() {
  # Smoke-test the candidate deployment through a temporary port-forward.
  # Returns non-zero when the health gate fails, which makes main()
  # abort the cutover and delete the candidate.
  log_info "Testing new version on $NEW_COLOR..."

  # Port forward to test the new version
  kubectl port-forward deployment/"$APP_NAME-$NEW_COLOR" 9090:3000 -n "$NAMESPACE" &
  PORT_FORWARD_PID=$!
  sleep 10

  # Health gate — '--max-time' bounds the probe; the guarded kill keeps
  # 'set -e' from aborting the script when the port-forward already died.
  if curl -f --max-time 10 http://localhost:9090/health &>/dev/null; then
    log_success "Health check passed for $NEW_COLOR"
  else
    log_error "Health check failed for $NEW_COLOR"
    kill "$PORT_FORWARD_PID" 2>/dev/null || true
    return 1
  fi

  # Additional smoke tests (non-fatal)
  if curl -f --max-time 10 http://localhost:9090/api/status &>/dev/null; then
    log_success "API status check passed for $NEW_COLOR"
  else
    log_warning "API status check failed for $NEW_COLOR"
  fi

  kill "$PORT_FORWARD_PID" 2>/dev/null || true
  log_success "New version testing completed"
}

# Switch traffic to new version
switch_traffic() {
  # Repoint the service selector at the new color, then re-read it to
  # confirm the patch took effect.
  log_info "Switching traffic from $CURRENT_COLOR to $NEW_COLOR..."

  # Update service selector
  local patch
  printf -v patch '{"spec":{"selector":{"color":"%s"}}}' "$NEW_COLOR"
  kubectl patch service "$APP_NAME-service" -n "$NAMESPACE" -p "$patch"

  # Verify traffic switch
  sleep 5
  local active
  active=$(kubectl get service "$APP_NAME-service" -n "$NAMESPACE" -o jsonpath='{.spec.selector.color}')

  if [[ "$active" == "$NEW_COLOR" ]]; then
    log_success "Traffic successfully switched to $NEW_COLOR"
  else
    log_error "Traffic switch failed"
    return 1
  fi
}

# Cleanup old version
cleanup_old_version() {
  # After a successful cutover, optionally delete the now-idle color.
  # Keeping it allows an instant rollback by re-patching the selector.
  echo -e "${YELLOW}Do you want to remove the old version ($CURRENT_COLOR)? This cannot be undone.${NC}"
  read -r -p "Remove old version? (y/N): " confirm

  if [[ $confirm == [yY] ]]; then
    log_info "Removing old version ($CURRENT_COLOR)..."
    kubectl delete deployment "$APP_NAME-$CURRENT_COLOR" -n "$NAMESPACE"
    log_success "Old version removed"
  else
    log_info "Old version kept for rollback capability"
    echo "To remove later: kubectl delete deployment $APP_NAME-$CURRENT_COLOR -n $NAMESPACE"
  fi
}

# Rollback function
rollback() {
  # Point the service selector back at the previously active color.
  log_warning "Rolling back to $CURRENT_COLOR..."
  kubectl patch service "$APP_NAME-service" -n "$NAMESPACE" -p '{"spec":{"selector":{"color":"'$CURRENT_COLOR'"}}}'
  log_success "Rollback completed"
}

# Main deployment flow
main() {
  # Check if current deployment exists
  # Blue-green requires an existing baseline to fall back to.
  if ! kubectl get deployment "$APP_NAME-$CURRENT_COLOR" -n "$NAMESPACE" &>/dev/null; then
    log_error "No current deployment found. Use regular deployment first."
    exit 1
  fi

  deploy_new_version

  if test_new_version; then
    switch_traffic

    # Final verification
    # NOTE(review): hard-codes the production hostname — confirm it
    # matches the ingress host configured for this app.
    log_info "Performing final verification..."
    sleep 10

    if curl -f "http://$APP_NAME.company.com/health" &>/dev/null; then
      log_success "🎉 Blue-green deployment completed successfully!"
      cleanup_old_version
    else
      # Traffic was already switched; repoint it at the old color.
      log_error "Final verification failed, rolling back..."
      rollback
      exit 1
    fi
  else
    # Candidate never received traffic; just delete it.
    log_error "New version testing failed, keeping current version active"
    kubectl delete deployment "$APP_NAME-$NEW_COLOR" -n "$NAMESPACE"
    exit 1
  fi
}

# Trap for emergency rollback
# Ctrl-C or SIGTERM mid-deploy repoints traffic at the old color.
trap 'echo -e "\n${RED}Deployment interrupted! Rolling back...${NC}"; rollback' INT TERM

main

3. Monitoring and Alerting

System Monitoring Script

monitoring/system-monitor.sh

#!/bin/bash
# Comprehensive system monitoring

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"

# Configuration
# Thresholds are percentages; Slack notification is optional and enabled
# by exporting SLACK_WEBHOOK_URL.
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEMORY=85
ALERT_THRESHOLD_DISK=90
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"

send_alert() {
  # Log a warning and, when configured, forward it to Slack.
  # $1 = message, $2 = severity label (default WARNING).
  local message="$1"
  local severity="${2:-WARNING}"

  log_warning "$message"

  if [[ -n "$SLACK_WEBHOOK_URL" ]]; then
    # -sS: quiet output but still report errors; --max-time bounds the
    # request. The '|| log_warning' keeps an unreachable webhook from
    # killing the whole monitor (this script runs under 'set -e').
    curl -sS --max-time 10 -X POST -H 'Content-type: application/json' \
      --data "{\"text\":\"🚨 [$severity] $message\"}" \
      "$SLACK_WEBHOOK_URL" || log_warning "Failed to deliver Slack alert"
  fi
}

check_system_resources() {
  # Sample CPU, memory and disk usage and alert when any exceeds its
  # threshold. Parsing is Linux-specific (GNU top/free/df output).
  log_info "Checking system resources..."

  # CPU usage
  # NOTE(review): assumes procps 'top' where field 2 of the "Cpu(s)"
  # line is the user-time percentage — verify on the target distro;
  # busybox and macOS top format this line differently.
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
  cpu_usage=${cpu_usage%.*} # Remove decimal

  if [[ $cpu_usage -gt $ALERT_THRESHOLD_CPU ]]; then
    send_alert "High CPU usage: ${cpu_usage}%" "CRITICAL"
  fi

  # Memory usage
  # Percentage computed from the "Mem" row of free(1): used / total.
  memory_info=$(free | grep Mem)
  total_memory=$(echo $memory_info | awk '{print $2}')
  used_memory=$(echo $memory_info | awk '{print $3}')
  memory_usage=$((used_memory * 100 / total_memory))

  if [[ $memory_usage -gt $ALERT_THRESHOLD_MEMORY ]]; then
    send_alert "High memory usage: ${memory_usage}%" "CRITICAL"
  fi

  # Disk usage
  # Only real block devices (/dev/*) are checked; tmpfs and network
  # mounts are intentionally ignored.
  while IFS= read -r line; do
    usage=$(echo "$line" | awk '{print $5}' | sed 's/%//')
    mount=$(echo "$line" | awk '{print $6}')

    if [[ $usage -gt $ALERT_THRESHOLD_DISK ]]; then
      send_alert "High disk usage: ${usage}% on ${mount}" "CRITICAL"
    fi
  done < <(df -h | grep -E '^/dev/')

  echo -e "${BLUE}System Resources:${NC}"
  echo "  CPU Usage: ${cpu_usage}%"
  echo "  Memory Usage: ${memory_usage}%"
  echo "  Disk Usage:"
  df -h | grep -E '^/dev/' | while read line; do
    echo "    $line"
  done
}

check_services() {
  # Verify each critical systemd unit is active; alert on any that isn't.
  log_info "Checking critical services..."

  # Define critical services
  CRITICAL_SERVICES=("docker" "nginx" "postgresql" "redis")

  local svc
  for svc in "${CRITICAL_SERVICES[@]}"; do
    if systemctl is-active --quiet "$svc"; then
      echo -e "  ${GREEN}${NC} $svc is running"
    else
      send_alert "Service $svc is not running" "CRITICAL"
      echo -e "  ${RED}${NC} $svc is not running"
    fi
  done
}

check_kubernetes_health() {
  # Cluster-level checks; skipped entirely when kubectl is not installed.
  if command -v kubectl &>/dev/null; then
    log_info "Checking Kubernetes cluster health..."

    # Count nodes whose STATUS column does not start with "Ready".
    # The original 'grep -v Ready' also filtered out "NotReady" lines
    # (substring match), so failing nodes were never counted. A prefix
    # match still treats "Ready,SchedulingDisabled" (cordoned) as ready.
    not_ready_nodes=$(kubectl get nodes --no-headers | awk '$2 !~ /^Ready/' | wc -l)
    if [[ $not_ready_nodes -gt 0 ]]; then
      send_alert "$not_ready_nodes Kubernetes nodes are not ready" "CRITICAL"
    fi

    # Check failed pods
    failed_pods=$(kubectl get pods --all-namespaces --field-selector=status.phase=Failed --no-headers | wc -l)
    if [[ $failed_pods -gt 0 ]]; then
      send_alert "$failed_pods pods are in Failed state" "WARNING"
    fi

    # Check pending pods (a small backlog is tolerated)
    pending_pods=$(kubectl get pods --all-namespaces --field-selector=status.phase=Pending --no-headers | wc -l)
    if [[ $pending_pods -gt 5 ]]; then
      send_alert "$pending_pods pods are in Pending state" "WARNING"
    fi

    echo -e "${BLUE}Kubernetes Status:${NC}"
    echo "  Not Ready Nodes: $not_ready_nodes"
    echo "  Failed Pods: $failed_pods"
    echo "  Pending Pods: $pending_pods"
  fi
}

check_application_health() {
  # Probe each application health URL; alert on any failure.
  log_info "Checking application health endpoints..."

  # Define application health endpoints
  HEALTH_ENDPOINTS=(
    "http://localhost:3000/health"
    "http://localhost:8080/actuator/health"
    "http://api.company.com/health"
  )

  local url
  for url in "${HEALTH_ENDPOINTS[@]}"; do
    if curl -f --max-time 10 "$url" &>/dev/null; then
      echo -e "  ${GREEN}${NC} $url is healthy"
    else
      send_alert "Health check failed for $url" "CRITICAL"
      echo -e "  ${RED}${NC} $url is unhealthy"
    fi
  done
}

check_log_errors() {
  # Count recent error-level entries in the journal and, when present,
  # in the application log; alert above fixed thresholds.
  log_info "Checking for recent errors in logs..."

  # Check system logs for errors in the last hour
  # NOTE(review): wc -l also counts journalctl's "-- Logs begin" header
  # line when present, so the count can be off by one — confirm.
  error_count=$(journalctl --since "1 hour ago" --priority=err --no-pager | wc -l)

  if [[ $error_count -gt 10 ]]; then
    send_alert "$error_count error messages in system logs in the last hour" "WARNING"
  fi

  # Check application logs if they exist
  # The date filter matches "YYYY-mm-dd HH" timestamps, scoping the
  # count to the current clock hour.
  if [[ -f "/var/log/application.log" ]]; then
    app_errors=$(grep -i error /var/log/application.log | grep "$(date '+%Y-%m-%d %H')" | wc -l)
    if [[ $app_errors -gt 5 ]]; then
      send_alert "$app_errors application errors in the last hour" "WARNING"
    fi
  fi

  echo -e "${BLUE}Log Analysis:${NC}"
  echo "  System errors (last hour): $error_count"
  if [[ -f "/var/log/application.log" ]]; then
    echo "  Application errors (last hour): $app_errors"
  fi
}

generate_report() {
  # Run every check, mirroring the output to a timestamped report file.
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  local report_file
  report_file="/tmp/system-monitor-report-$(date +%Y%m%d-%H%M).txt"

  {
    echo "System Monitor Report - $stamp"
    echo "======================================="
    echo ""
    check_system_resources
    echo ""
    check_services
    echo ""
    check_kubernetes_health
    echo ""
    check_application_health
    echo ""
    check_log_errors
  } | tee "$report_file"

  log_success "Report saved to: $report_file"
}

# Main monitoring function
main() {
  # NOTE(review): BBLUE (bold blue) is expected to come from colors.sh —
  # confirm it is defined there; if unset it expands empty and the
  # header simply prints uncolored.
  echo -e "${BBLUE}=== System Monitor ===${NC}\n"

  generate_report

  log_success "System monitoring completed"
}

main

4. Incident Response

Emergency Response Toolkit

incident-response/emergency-toolkit.sh

#!/bin/bash
# Emergency incident response toolkit

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"

show_emergency_menu() {
  # Print the emergency action menu (color variables come from colors.sh).
  echo -e "${BRED}=== EMERGENCY RESPONSE TOOLKIT ===${NC}"
  echo -e "${YELLOW}Use with caution - these actions affect production!${NC}\n"
  local actions=(
    "1) Scale down problematic service"
    "2) Rollback deployment"
    "3) Restart failed pods"
    "4) Enable maintenance mode"
    "5) Emergency log collection"
    "6) Database emergency backup"
    "7) Traffic rerouting"
    "8) Resource emergency scaling"
    "9) Complete system status"
    "0) Exit"
  )
  printf '%s\n' "${actions[@]}"
}

scale_down_service() {
  # Emergency-scale a deployment (0 stops it). Requires the operator to
  # type 'EMERGENCY' before anything is changed.
  echo -e "${BRED}EMERGENCY: Scale down service${NC}"
  kubectl get deployments --all-namespaces
  read -r -p "Namespace: " namespace
  read -r -p "Deployment name: " deployment
  read -r -p "Scale to replicas (0 to stop): " replicas

  echo -e "${YELLOW}This will immediately scale $deployment to $replicas replicas!${NC}"
  read -r -p "Confirm (type 'EMERGENCY'): " confirm

  if [[ "$confirm" == "EMERGENCY" ]]; then
    kubectl scale deployment "$deployment" --replicas="$replicas" -n "$namespace"
    log_success "Emergency scaling completed"

    # Audit trail for post-incident review
    echo "$(date): EMERGENCY SCALE - $deployment scaled to $replicas replicas by $USER" >>/tmp/emergency-actions.log
  else
    log_info "Emergency scaling cancelled"
  fi
}

emergency_rollback() {
  # Roll a deployment back to a chosen revision (blank = previous).
  # Requires the operator to type 'EMERGENCY' before anything is changed.
  echo -e "${BRED}EMERGENCY: Rollback deployment${NC}"
  kubectl get deployments --all-namespaces
  read -r -p "Namespace: " namespace
  read -r -p "Deployment name: " deployment

  # Show rollout history
  echo -e "\n${BLUE}Rollout history:${NC}"
  kubectl rollout history deployment/"$deployment" -n "$namespace"

  read -r -p "Revision to rollback to (blank for previous): " revision

  echo -e "${YELLOW}This will immediately rollback $deployment!${NC}"
  read -r -p "Confirm (type 'EMERGENCY'): " confirm

  if [[ "$confirm" == "EMERGENCY" ]]; then
    if [[ -n "$revision" ]]; then
      kubectl rollout undo deployment/"$deployment" --to-revision="$revision" -n "$namespace"
    else
      kubectl rollout undo deployment/"$deployment" -n "$namespace"
    fi

    kubectl rollout status deployment/"$deployment" -n "$namespace"
    log_success "Emergency rollback completed"

    # Audit trail for post-incident review
    echo "$(date): EMERGENCY ROLLBACK - $deployment rolled back by $USER" >>/tmp/emergency-actions.log
  else
    log_info "Emergency rollback cancelled"
  fi
}

restart_failed_pods() {
  # Delete every pod in phase Failed so its controller recreates it.
  echo -e "${BRED}EMERGENCY: Restart failed pods${NC}"

  # Find failed pods
  failed_pods=$(kubectl get pods --all-namespaces --field-selector=status.phase=Failed --no-headers)

  if [[ -z "$failed_pods" ]]; then
    log_info "No failed pods found"
    return
  fi

  echo -e "${BLUE}Failed pods:${NC}"
  echo "$failed_pods"

  read -r -p "Restart all failed pods? (y/N): " confirm

  if [[ $confirm == [yY] ]]; then
    # Columns are NAMESPACE NAME READY STATUS RESTARTS AGE; only the
    # first two are needed, '_' swallows the remainder.
    echo "$failed_pods" | while read -r namespace name _; do
      kubectl delete pod "$name" -n "$namespace"
      log_info "Deleted failed pod: $name in $namespace"
    done

    log_success "Failed pods restarted"
    echo "$(date): EMERGENCY RESTART - Failed pods restarted by $USER" >>/tmp/emergency-actions.log
  fi
}

enable_maintenance_mode() {
  # Repoint the production ingress at the maintenance page.
  # NOTE(review): assumes an ingress named "app-ingress" and a
  # "maintenance-service" already exist in the "production" namespace —
  # confirm before relying on this in an incident. The duration is only
  # recorded; nothing re-enables the app automatically.
  echo -e "${BRED}EMERGENCY: Enable maintenance mode${NC}"

  # This assumes you have a maintenance page setup
  read -p "Duration in minutes: " duration

  echo -e "${YELLOW}This will put the system in maintenance mode for $duration minutes!${NC}"
  read -p "Confirm (type 'EMERGENCY'): " confirm

  if [[ "$confirm" == "EMERGENCY" ]]; then
    # Update ingress to point to maintenance page
    kubectl patch ingress app-ingress -n production -p '{
            "spec": {
                "rules": [{
                    "host": "app.company.com",
                    "http": {
                        "paths": [{
                            "path": "/",
                            "pathType": "Prefix",
                            "backend": {
                                "service": {
                                    "name": "maintenance-service",
                                    "port": {"number": 80}
                                }
                            }
                        }]
                    }
                }]
            }
        }'

    log_success "Maintenance mode enabled for $duration minutes"

    # Set reminder to disable maintenance mode
    # Writes a one-shot helper script that patches the backend service
    # name back to "app-service".
    echo "#!/bin/bash" >/tmp/disable-maintenance.sh
    echo "kubectl patch ingress app-ingress -n production --type=json -p='[{\"op\": \"replace\", \"path\": \"/spec/rules/0/http/paths/0/backend/service/name\", \"value\": \"app-service\"}]'" >>/tmp/disable-maintenance.sh
    chmod +x /tmp/disable-maintenance.sh

    echo "$(date): EMERGENCY MAINTENANCE - Maintenance mode enabled for $duration minutes by $USER" >>/tmp/emergency-actions.log
    echo -e "${YELLOW}Remember to disable maintenance mode: /tmp/disable-maintenance.sh${NC}"
  fi
}

emergency_log_collection() {
  # Gather system, Kubernetes, and application logs into a single
  # timestamped tarball under /tmp for incident analysis.
  # NOTE(review): the failed-pod section requires jq; journalctl
  # requires systemd — confirm both are present, otherwise these steps
  # abort the function under 'set -e'.
  echo -e "${BRED}EMERGENCY: Log collection${NC}"

  timestamp=$(date +%Y%m%d_%H%M%S)
  log_dir="/tmp/emergency-logs-$timestamp"
  mkdir -p "$log_dir"

  log_info "Collecting emergency logs to $log_dir"

  # System logs
  journalctl --since "2 hours ago" >"$log_dir/system.log"

  # Kubernetes logs
  if command -v kubectl &>/dev/null; then
    kubectl get events --all-namespaces --sort-by='.lastTimestamp' >"$log_dir/k8s-events.log"
    kubectl get pods --all-namespaces -o wide >"$log_dir/k8s-pods.log"

    # Pod logs for failed/pending pods
    # (per-pod failures are tolerated via '|| true')
    kubectl get pods --all-namespaces --field-selector=status.phase=Failed -o json \
      | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' \
      | while read namespace pod; do
        kubectl logs "$pod" -n "$namespace" >"$log_dir/pod-$namespace-$pod.log" 2>/dev/null || true
      done
  fi

  # Application logs
  if [[ -d "/var/log" ]]; then
    cp /var/log/*.log "$log_dir/" 2>/dev/null || true
  fi

  # Create archive
  tar -czf "$log_dir.tar.gz" -C "/tmp" "emergency-logs-$timestamp"

  log_success "Emergency logs collected: $log_dir.tar.gz"
  echo "$(date): EMERGENCY LOGS - Logs collected by $USER" >>/tmp/emergency-actions.log
}

# Main emergency menu — dispatch actions until Exit (0) is selected.
while true; do
  show_emergency_menu
  read -r -p "Choose emergency action: " choice

  case $choice in
    1) scale_down_service ;;
    2) emergency_rollback ;;
    3) restart_failed_pods ;;
    4) enable_maintenance_mode ;;
    5) emergency_log_collection ;;
    6) echo "Database backup feature - implement based on your DB" ;;
    7) echo "Traffic rerouting feature - implement based on your setup" ;;
    8) echo "Resource scaling feature - implement based on your infrastructure" ;;
    9) dotrun run monitoring/system-monitor ;;
    0) exit 0 ;;
    *) echo -e "${RED}Invalid option${NC}" ;;
  esac

  echo ""
  read -r -p "Press Enter to continue..."
  clear
done

CI/CD Integration

1. Jenkins Pipeline Integration

ci-cd/jenkins/Jenkinsfile

// Declarative pipeline: build, test, scan, then deploy via the shared
// dotrun script library installed on the agent.
pipeline {
    agent any

    environment {
        DOCKER_REGISTRY = 'your-registry.com'
        APP_NAME = 'myapp'
        // Location of the shared dotrun scripts on the build agent
        DOTRUN_SCRIPTS = '/opt/dotrun-scripts'
    }

    stages {
        stage('Checkout') {
            steps {
                checkout scm
            }
        }

        stage('Build') {
            steps {
                script {
                    sh "${DOTRUN_SCRIPTS}/ci-cd/build-application.sh ${APP_NAME} ${BUILD_NUMBER}"
                }
            }
        }

        stage('Test') {
            steps {
                script {
                    sh "${DOTRUN_SCRIPTS}/ci-cd/run-tests.sh ${APP_NAME}"
                }
            }
            post {
                always {
                    // 'junit' is the standard Pipeline step for recording
                    // test results; 'publishTestResults' is not a built-in
                    // step and fails at runtime.
                    junit 'test-results.xml'
                }
            }
        }

        stage('Security Scan') {
            steps {
                script {
                    sh "${DOTRUN_SCRIPTS}/ci-cd/security-scan.sh ${APP_NAME}"
                }
            }
        }

        stage('Deploy to Staging') {
            when {
                branch 'develop'
            }
            steps {
                script {
                    sh "${DOTRUN_SCRIPTS}/deployment/deploy-app.sh ${APP_NAME} staging ${BUILD_NUMBER}"
                }
            }
        }

        stage('Deploy to Production') {
            when {
                branch 'master'
            }
            steps {
                script {
                    // Production uses the blue-green strategy for zero-downtime cutover
                    sh "${DOTRUN_SCRIPTS}/deployment/blue-green-deploy.sh ${APP_NAME} production ${BUILD_NUMBER}"
                }
            }
        }
    }

    post {
        failure {
            sh "${DOTRUN_SCRIPTS}/utilities/send-alert.sh 'Pipeline failed for ${APP_NAME}:${BUILD_NUMBER}'"
        }
        success {
            sh "${DOTRUN_SCRIPTS}/utilities/send-alert.sh 'Pipeline succeeded for ${APP_NAME}:${BUILD_NUMBER}'"
        }
    }
}

This comprehensive DevOps workflow provides robust infrastructure management, automated deployments, monitoring capabilities, and emergency response tools that can be integrated into any modern DevOps pipeline.

Clone this wiki locally