-
-
Notifications
You must be signed in to change notification settings - Fork 0
DevOps Pipeline
Joao Palma edited this page Oct 27, 2025
·
3 revisions
Complete DevOps workflow for managing microservices on Kubernetes with CI/CD pipeline, monitoring, and infrastructure automation.
This workflow is designed for DevOps teams managing:
- Microservices Architecture: Multiple services deployed independently
- Kubernetes Infrastructure: Container orchestration and management
- CI/CD Pipelines: Automated testing, building, and deployment
- Monitoring & Observability: Comprehensive system monitoring
- Infrastructure as Code: Automated infrastructure management
Repository: company/devops-pipeline
# Initialize DevOps collection
mkdir devops-pipeline && cd devops-pipeline
git init
git remote add origin git@github.com:company/devops-pipeline.git
# Create collection structure
mkdir -p bin/{infrastructure,deployment,monitoring,security,backup}
mkdir -p docs/{infrastructure,deployment,monitoring,security,backup}
mkdir -p config/{k8s,helm,terraform}
mkdir -p scripts/{ci,cd,maintenance}

Collection Metadata (dotrun.collection.yml):
name: "devops-pipeline"
description: "DevOps automation and infrastructure management tools"
author: "DevOps Team"
version: "2.0.0"
dependencies:
- kubectl
- helm
- docker
- git
- terraform
optional_dependencies:
- istioctl
- argocd
- flux
- prometheus
categories:
- infrastructure
- deployment
- monitoring
- security
- backup

bin/infrastructure/setup-cluster.sh:
#!/usr/bin/env bash
### DOC
# Set up Kubernetes cluster with monitoring and ingress
# Configures complete production-ready cluster
#
# Usage: dr infrastructure/setup-cluster [cluster-name] [environment]
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl helm

main() {
  local cluster_name="${1:-dev-cluster}"
  local environment="${2:-development}"

  echo "🏗️ Setting up Kubernetes cluster: $cluster_name"

  # Register all chart repositories first, then refresh the index once.
  # (Previously the update ran before prometheus-community/jetstack were
  # added, so those charts could fail to resolve on a fresh machine.)
  helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  helm repo add jetstack https://charts.jetstack.io
  helm repo update

  # Install ingress controller.
  # 'upgrade --install' makes re-runs idempotent; plain 'install' fails
  # if the release already exists.
  echo "🌐 Installing NGINX Ingress Controller..."
  helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \
    --create-namespace \
    --namespace ingress-nginx \
    --values config/helm/ingress-nginx-values.yaml

  # Install monitoring stack
  echo "📊 Installing monitoring stack..."
  helm upgrade --install monitoring prometheus-community/kube-prometheus-stack \
    --create-namespace \
    --namespace monitoring \
    --values config/helm/monitoring-values.yaml

  # Install cert-manager for TLS certificate automation
  echo "🔒 Installing cert-manager..."
  helm upgrade --install cert-manager jetstack/cert-manager \
    --namespace cert-manager \
    --create-namespace \
    --set installCRDs=true \
    --values config/helm/cert-manager-values.yaml

  # Service mesh is only provisioned for production clusters
  if [[ "$environment" == "production" ]]; then
    echo "🕸️ Installing Istio service mesh..."
    dr infrastructure/setup-istio
  fi

  echo "✅ Cluster setup complete!"
  echo "🔍 Checking cluster status..."
  kubectl get nodes
  kubectl get pods -A
}
main "$@"

bin/infrastructure/terraform-apply.sh:
#!/usr/bin/env bash
### DOC
# Apply Terraform infrastructure changes with safety checks
#
# Usage: dr infrastructure/terraform-apply <env> [plan|apply|destroy]
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg terraform

main() {
  local environment="${1:-development}"
  local action="${2:-plan}"

  echo "🏗️ Running Terraform $action for $environment..."

  # Fail early with a clear message instead of a raw 'cd' error
  if [[ ! -d "terraform/$environment" ]]; then
    echo "❌ Terraform directory not found: terraform/$environment"
    exit 1
  fi
  cd "terraform/$environment"

  # Initialize Terraform (providers, backend)
  echo "🔧 Initializing Terraform..."
  terraform init

  case "$action" in
    plan)
      echo "📋 Creating Terraform plan..."
      terraform plan -out="terraform.tfplan"
      ;;
    apply)
      echo "🚀 Applying Terraform changes..."
      if [[ ! -f "terraform.tfplan" ]]; then
        echo "❌ No plan file found. Run 'plan' first."
        exit 1
      fi
      # Show the plan so the operator sees exactly what will change
      terraform show terraform.tfplan
      # Require explicit confirmation before touching production
      if [[ "$environment" == "production" ]]; then
        read -p "Apply to PRODUCTION? [y/N]: " -n 1 -r
        echo
        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
          echo "❌ Production apply cancelled"
          exit 1
        fi
      fi
      terraform apply terraform.tfplan
      # Remove the consumed plan so a stale plan can't be re-applied later
      rm -f terraform.tfplan
      ;;
    destroy)
      echo "💥 Planning infrastructure destruction..."
      terraform plan -destroy -out="destroy.tfplan"
      read -p "DESTROY infrastructure in $environment? [y/N]: " -n 1 -r
      echo
      if [[ $REPLY =~ ^[Yy]$ ]]; then
        terraform apply destroy.tfplan
      else
        echo "❌ Destruction cancelled"
      fi
      # Destroy plans are single-use either way
      rm -f destroy.tfplan
      ;;
    *)
      echo "Usage: dr infrastructure/terraform-apply <env> [plan|apply|destroy]"
      exit 1
      ;;
  esac

  echo "✅ Terraform $action completed for $environment!"
}
main "$@"

bin/deployment/deploy-microservice.sh:
#!/usr/bin/env bash
### DOC
# Deploy microservice to Kubernetes with zero-downtime
# Supports blue-green and rolling deployments
#
# Usage: dr deploy-microservice <service-name> [environment] [tag] [strategy]
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl docker

main() {
  # Default $1 to empty: under 'set -u' a bare "$1" aborts with an
  # "unbound variable" error before the usage message below can print.
  local service_name="${1:-}"
  local environment="${2:-staging}"
  local image_tag="${3:-latest}"
  local deployment_strategy="${4:-rolling}"

  if [[ -z "$service_name" ]]; then
    echo "Usage: dr deploy-microservice <service-name> [environment] [tag] [strategy]"
    echo "Strategies: rolling, blue-green, canary"
    exit 1
  fi

  echo "🚀 Deploying $service_name:$image_tag to $environment ($deployment_strategy)"

  # Validate deployment files exist before doing any expensive work
  local k8s_dir="config/k8s/$environment"
  if [[ ! -d "$k8s_dir" ]]; then
    echo "❌ Kubernetes manifests not found: $k8s_dir"
    exit 1
  fi

  # Build and push the Docker image for this service
  echo "🐳 Building Docker image..."
  docker build -t "registry.company.com/$service_name:$image_tag" \
    -f "services/$service_name/Dockerfile" \
    "services/$service_name/"
  docker push "registry.company.com/$service_name:$image_tag"

  # Rewrite the image tag in the manifest in place.
  # NOTE(review): GNU 'sed -i'; on BSD/macOS this needs 'sed -i ""'.
  echo "📝 Updating deployment manifests..."
  sed -i "s|image: registry.company.com/$service_name:.*|image: registry.company.com/$service_name:$image_tag|g" \
    "$k8s_dir/$service_name/deployment.yaml"

  # Dispatch to the strategy-specific deployment script
  case "$deployment_strategy" in
    rolling)
      dr deployment/rolling-deploy "$service_name" "$environment"
      ;;
    blue-green)
      dr deployment/blue-green-deploy "$service_name" "$environment"
      ;;
    canary)
      dr deployment/canary-deploy "$service_name" "$environment"
      ;;
    *)
      echo "❌ Unknown deployment strategy: $deployment_strategy"
      exit 1
      ;;
  esac
}
main "$@"

bin/deployment/blue-green-deploy.sh:
#!/usr/bin/env bash
### DOC
# Blue-green deployment with automatic rollback
#
# Usage: dr deployment/blue-green-deploy <service-name> <environment>
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl

main() {
  # Default to empty so a missing argument prints usage instead of an
  # "unbound variable" abort under 'set -u'.
  local service_name="${1:-}"
  local environment="${2:-}"

  if [[ -z "$service_name" || -z "$environment" ]]; then
    echo "Usage: dr deployment/blue-green-deploy <service-name> <environment>"
    exit 1
  fi

  local namespace="$environment"

  echo "🔵🟢 Starting blue-green deployment for $service_name"

  # Determine the live color from the service selector;
  # default to "blue" for a first-time deployment.
  local current_color
  current_color=$(kubectl get service "$service_name" -n "$namespace" \
    -o jsonpath='{.spec.selector.color}' 2>/dev/null || echo "blue")

  local new_color
  if [[ "$current_color" == "blue" ]]; then
    new_color="green"
  else
    new_color="blue"
  fi

  echo "📍 Current: $current_color, Deploying: $new_color"

  # Deploy the new version alongside the current one
  echo "🚀 Deploying $new_color version..."
  kubectl apply -f "config/k8s/$environment/$service_name/deployment-$new_color.yaml" -n "$namespace"

  echo "⏳ Waiting for $new_color deployment to be ready..."
  kubectl rollout status deployment/"$service_name-$new_color" -n "$namespace" --timeout=300s

  # Verify the new version before it receives any traffic
  echo "🏥 Running health checks on $new_color version..."
  if ! dr deployment/health-check "$service_name-$new_color" "$environment"; then
    echo "❌ Health check failed, rolling back..."
    kubectl delete -f "config/k8s/$environment/$service_name/deployment-$new_color.yaml" -n "$namespace"
    exit 1
  fi

  # Switch traffic by repointing the service selector
  # (patch JSON built with a quoted expansion; the original left
  # $new_color unquoted outside the single-quoted string)
  echo "🔄 Switching traffic to $new_color version..."
  kubectl patch service "$service_name" -n "$namespace" \
    -p "{\"spec\":{\"selector\":{\"color\":\"$new_color\"}}}"

  # Give in-flight requests time to drain onto the new endpoints
  sleep 10

  echo "🔍 Final health check after traffic switch..."
  if dr deployment/health-check "$service_name" "$environment"; then
    echo "✅ Blue-green deployment successful!"
    echo "🧹 Cleaning up $current_color version..."
    # '|| true': the old deployment may not exist on a first deploy
    kubectl delete deployment "$service_name-$current_color" -n "$namespace" || true
    echo "🎉 Deployment complete: $service_name is now running $new_color version"
  else
    echo "❌ Final health check failed, rolling back..."
    kubectl patch service "$service_name" -n "$namespace" \
      -p "{\"spec\":{\"selector\":{\"color\":\"$current_color\"}}}"
    kubectl delete deployment "$service_name-$new_color" -n "$namespace"
    exit 1
  fi
}
main "$@"

bin/deployment/rollback.sh:
#!/usr/bin/env bash
### DOC
# Rollback deployment to previous version
#
# Usage: dr deployment/rollback <service-name> [environment]
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg kubectl

main() {
  # Default to empty: a bare "$1" under 'set -u' aborts with an
  # "unbound variable" error before a usage message can print.
  local service_name="${1:-}"
  local environment="${2:-staging}"
  local namespace="$environment"

  if [[ -z "$service_name" ]]; then
    echo "Usage: dr deployment/rollback <service-name> [environment]"
    exit 1
  fi

  echo "↩️ Rolling back $service_name in $environment..."

  # Show rollout history so the operator knows what they're reverting to
  echo "📜 Deployment history:"
  kubectl rollout history deployment/"$service_name" -n "$namespace"

  echo "🔄 Rolling back to previous version..."
  kubectl rollout undo deployment/"$service_name" -n "$namespace"

  echo "⏳ Waiting for rollback to complete..."
  kubectl rollout status deployment/"$service_name" -n "$namespace" --timeout=300s

  # Verify the rolled-back version actually serves traffic
  echo "🏥 Running health check after rollback..."
  if dr deployment/health-check "$service_name" "$environment"; then
    echo "✅ Rollback successful!"
  else
    echo "❌ Rollback health check failed"
    exit 1
  fi
}
main "$@"

bin/monitoring/check-cluster-health.sh:
#!/usr/bin/env bash
### DOC
# Comprehensive cluster health check with alerting
# Writes a timestamped report to /tmp and optionally alerts via Slack
# (set SLACK_WEBHOOK_URL). Returns 0 when healthy, 1 when issues found.
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
# jq added: the certificate check below parses kubectl JSON with it
validatePkg kubectl jq

main() {
  echo "🏥 Checking cluster health..."

  local issues=0
  local report_file
  report_file="/tmp/cluster-health-$(date +%Y%m%d-%H%M%S).txt"

  echo "📊 Cluster Health Report - $(date)" >"$report_file"
  echo "=================================" >>"$report_file"

  # --- Node status ---
  echo "🖥️ Checking node status..."
  echo -e "\n### Node Status ###" >>"$report_file"
  local not_ready_nodes
  # '|| true': under 'set -o pipefail', grep exits 1 when nothing matches,
  # which would abort the whole script on the *healthy* path.
  not_ready_nodes=$(kubectl get nodes --no-headers | grep -cv " Ready " || true)
  if [[ $not_ready_nodes -gt 0 ]]; then
    echo "❌ $not_ready_nodes nodes not ready!" | tee -a "$report_file"
    kubectl get nodes | grep -v " Ready " >>"$report_file"
    # NB: '((issues++))' would return status 1 when issues is 0 and kill
    # the script under 'set -e'; plain assignment is always status 0.
    issues=$((issues + 1))
  else
    echo "✅ All nodes ready" | tee -a "$report_file"
  fi

  # --- Pod status ---
  echo "🐳 Checking pod status..."
  echo -e "\n### Pod Status ###" >>"$report_file"
  local failed_pods
  failed_pods=$(kubectl get pods -A --no-headers | grep -cE "(Error|CrashLoopBackOff|Failed|Pending)" || true)
  if [[ $failed_pods -gt 0 ]]; then
    echo "❌ $failed_pods pods in failed state!" | tee -a "$report_file"
    kubectl get pods -A | grep -E "(Error|CrashLoopBackOff|Failed|Pending)" >>"$report_file"
    issues=$((issues + 1))
  else
    echo "✅ All pods healthy" | tee -a "$report_file"
  fi

  # --- Resource usage (requires metrics-server) ---
  echo "📊 Checking resource usage..."
  echo -e "\n### Resource Usage ###" >>"$report_file"
  if kubectl top nodes &>/dev/null; then
    kubectl top nodes >>"$report_file"
    local high_cpu_nodes
    # int($3) strips the trailing '%' from the CPU column; a bare
    # '$3 > 80' would string-compare "45%" against 80 and misfire.
    high_cpu_nodes=$(kubectl top nodes --no-headers | awk 'int($3) > 80 {print $1}' | wc -l)
    if [[ $high_cpu_nodes -gt 0 ]]; then
      echo "⚠️ $high_cpu_nodes nodes with high CPU usage" | tee -a "$report_file"
      issues=$((issues + 1))
    fi
  else
    echo "⚠️ Metrics server not available" | tee -a "$report_file"
  fi

  # --- Persistent volumes ---
  echo "💾 Checking persistent volumes..."
  echo -e "\n### Persistent Volumes ###" >>"$report_file"
  local pv_issues
  pv_issues=$(kubectl get pv --no-headers 2>/dev/null | grep -cv "Bound" || true)
  if [[ $pv_issues -gt 0 ]]; then
    echo "❌ $pv_issues persistent volumes not bound!" | tee -a "$report_file"
    kubectl get pv | grep -v "Bound" >>"$report_file"
    issues=$((issues + 1))
  else
    echo "✅ All persistent volumes bound" | tee -a "$report_file"
  fi

  # --- Certificate expiration (cert-manager) ---
  echo "🔒 Checking certificate expiration..."
  echo -e "\n### Certificate Status ###" >>"$report_file"
  local cert_json expiring_certs
  # Fetch once; fall back to an empty list if the Certificate CRD is not
  # installed (kubectl would otherwise fail and abort under 'set -e').
  cert_json=$(kubectl get certificates -A -o json 2>/dev/null || echo '{"items":[]}')
  local cert_filter='.items[] | select(.status.notAfter != null) | select(((.status.notAfter | fromdate) - now) < (30 * 24 * 3600))'
  expiring_certs=$(jq -r "$cert_filter | \"\(.metadata.namespace)/\(.metadata.name)\"" <<<"$cert_json" | wc -l)
  if [[ $expiring_certs -gt 0 ]]; then
    echo "⚠️ $expiring_certs certificates expiring within 30 days" | tee -a "$report_file"
    jq -r "$cert_filter | \"\(.metadata.namespace)/\(.metadata.name): expires \(.status.notAfter)\"" <<<"$cert_json" >>"$report_file"
  else
    echo "✅ No certificates expiring soon" | tee -a "$report_file"
  fi

  # --- Summary ---
  echo -e "\n### Summary ###" >>"$report_file"
  if [[ $issues -eq 0 ]]; then
    echo "🎉 Cluster is healthy!" | tee -a "$report_file"
    return 0
  else
    echo "⚠️ Found $issues issues in cluster" | tee -a "$report_file"
    # Send alert if configured
    if [[ -n "${SLACK_WEBHOOK_URL:-}" ]]; then
      dr monitoring/send-alert "Cluster Health Issues" "$report_file"
    fi
    return 1
  fi
}
main "$@"

bin/monitoring/prometheus-query.sh:
#!/usr/bin/env bash
### DOC
# Query Prometheus metrics with common queries
#
# Usage: dr monitoring/prometheus-query <named-query|raw-promql>
# Named queries: cpu-usage, memory-usage, pod-restarts, disk-usage
# Env: PROMETHEUS_URL (default http://localhost:9090)
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
validatePkg curl jq

main() {
  # Default to empty: a bare "$1" under 'set -u' aborts with an
  # unhelpful "unbound variable" error instead of usage text.
  local query="${1:-}"
  local prometheus_url="${PROMETHEUS_URL:-http://localhost:9090}"

  if [[ -z "$query" ]]; then
    echo "Usage: dr monitoring/prometheus-query <cpu-usage|memory-usage|pod-restarts|disk-usage|raw-promql>"
    exit 1
  fi

  echo "📊 Querying Prometheus: $query"

  # Map friendly names to PromQL; anything else is passed through as raw PromQL
  local prom_query
  case "$query" in
    cpu-usage)
      prom_query="100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)"
      ;;
    memory-usage)
      prom_query="(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100"
      ;;
    pod-restarts)
      prom_query="increase(kube_pod_container_status_restarts_total[1h])"
      ;;
    disk-usage)
      prom_query="100 - ((node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100)"
      ;;
    *)
      prom_query="$query"
      ;;
  esac

  # Execute the query. --data-urlencode makes this a POST, which the
  # Prometheus query API accepts; '-f' makes curl fail on HTTP errors
  # instead of feeding an error page to jq.
  local result
  if ! result=$(curl -sf "$prometheus_url/api/v1/query" \
    --data-urlencode "query=$prom_query" | jq -r '.data.result'); then
    echo "❌ Failed to query Prometheus at $prometheus_url"
    exit 1
  fi

  if [[ "$result" == "[]" ]]; then
    echo "❌ No data returned for query"
    exit 1
  fi

  echo "📈 Results:"
  echo "$result" | jq -r '.[] | "\(.metric.instance // .metric.pod): \(.value[1])"'
}
main "$@"

bin/security/scan-cluster.sh:
#!/usr/bin/env bash
### DOC
# Security scan of cluster and workloads
# Reports privileged containers, root containers, and namespaces
# without network policies.
### DOC
set -euo pipefail
source "$DR_CONFIG/helpers/pkg.sh"
# jq added: every check below parses kubectl JSON output with it
validatePkg kubectl jq

main() {
  echo "🔒 Running security scan..."

  # Fetch the pod inventory once instead of once per check
  local pods_json
  pods_json=$(kubectl get pods -A -o json)

  # Check for privileged containers.
  # any() reports each pod at most once; the original '.containers[]?'
  # form duplicated a pod name once per matching container.
  echo "🚨 Checking for privileged containers..."
  local privileged_pods
  privileged_pods=$(jq -r '.items[]
    | select(any(.spec.containers[]?; .securityContext.privileged == true))
    | "\(.metadata.namespace)/\(.metadata.name)"' <<<"$pods_json")
  if [[ -n "$privileged_pods" ]]; then
    echo "⚠️ Found privileged containers:"
    echo "$privileged_pods"
  else
    echo "✅ No privileged containers found"
  fi

  # Check for containers running as root: either an explicit runAsUser 0,
  # or no runAsUser at container *and* pod level (image default applies).
  echo "👤 Checking for containers running as root..."
  local root_containers
  root_containers=$(jq -r '.items[]
    | .spec.securityContext.runAsUser as $pod_user
    | select(any(.spec.containers[]?;
        .securityContext.runAsUser == 0
        or (.securityContext.runAsUser == null and $pod_user == null)))
    | "\(.metadata.namespace)/\(.metadata.name)"' <<<"$pods_json")
  if [[ -n "$root_containers" ]]; then
    echo "⚠️ Found containers running as root:"
    echo "$root_containers"
  else
    echo "✅ No containers running as root"
  fi

  # Check that every application namespace has at least one network policy
  echo "🌐 Checking network policies..."
  local namespaces_without_netpol
  namespaces_without_netpol=$(kubectl get namespaces -o json | jq -r '.items[]
    | select(.metadata.name != "kube-system" and .metadata.name != "kube-public" and .metadata.name != "default")
    | .metadata.name' | while read -r ns; do
    if ! kubectl get networkpolicies -n "$ns" &>/dev/null || [[ $(kubectl get networkpolicies -n "$ns" --no-headers | wc -l) -eq 0 ]]; then
      echo "$ns"
    fi
  done)
  if [[ -n "$namespaces_without_netpol" ]]; then
    echo "⚠️ Namespaces without network policies:"
    echo "$namespaces_without_netpol"
  else
    echo "✅ All namespaces have network policies"
  fi

  echo "✅ Security scan completed"
}
main "$@"

# DevOps team setup
dr import git@github.com:company/devops-pipeline.git ops
# Daily monitoring
dr ops/monitoring/check-cluster-health
dr ops/monitoring/prometheus-query cpu-usage
dr ops/security/scan-cluster
# Deployment operations
dr ops/deployment/deploy-microservice auth-service staging v1.2.3
dr ops/deployment/health-check auth-service staging
# Infrastructure management
dr ops/infrastructure/terraform-apply staging plan
dr ops/infrastructure/setup-cluster prod-cluster production

# Quick cluster assessment
dr ops/monitoring/check-cluster-health
# Check specific service
dr ops/monitoring/service-status auth-service production
# Rollback if needed
dr ops/deployment/rollback auth-service production
# Scale resources
dr ops/deployment/scale-service auth-service production 10

# Backup operations
dr ops/backup/backup-etcd
dr ops/backup/backup-persistent-volumes
# Updates and patches
dr ops/infrastructure/update-cluster
dr ops/deployment/update-all-services staging
# Security maintenance
dr ops/security/scan-cluster
dr ops/security/update-certificates

This DevOps pipeline workflow provides comprehensive automation for Kubernetes-based infrastructure while maintaining security, reliability, and observability at scale.