-
-
Notifications
You must be signed in to change notification settings - Fork 0
DevOps Team Workflow
Joao Palma edited this page Oct 27, 2025
·
2 revisions
This workflow demonstrates how DevOps teams can leverage dotrun to automate infrastructure management, deployment processes, and maintain consistent environments across development, staging, and production.
A DevOps workflow with dotrun enables:
- Automated infrastructure provisioning and management
- Consistent deployment processes across environments
- Centralized script management for the entire team
- Streamlined incident response and troubleshooting
- Integration with CI/CD pipelines
infrastructure/aws/provision-environment.sh
#!/bin/bash
# Provision complete AWS environment
# Usage: provision-environment.sh [environment] [region] [project-name]
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Shared helpers: color variables, log_* functions, AWS credential checks.
source "$SCRIPT_DIR/../../utilities/colors.sh"
source "$SCRIPT_DIR/../../utilities/logging.sh"
source "$SCRIPT_DIR/../../utilities/aws-helpers.sh"
# Positional arguments with defaults.
ENVIRONMENT=${1:-"development"}
REGION=${2:-"us-west-2"}
PROJECT_NAME=${3:-"myapp"}
log_info "Provisioning AWS environment: $ENVIRONMENT in $REGION"
# Check AWS CLI and credentials
check_aws_credentials
# Create Terraform workspace
# NOTE(review): path is relative to the caller's CWD, not SCRIPT_DIR — confirm intended.
WORKSPACE_DIR="terraform/environments/$ENVIRONMENT"
mkdir -p "$WORKSPACE_DIR"
# Generate Terraform configuration
# The unquoted EOF delimiter lets the shell expand ${PROJECT_NAME},
# ${ENVIRONMENT} and ${REGION} while writing the file; everything else is
# emitted literally as Terraform/HCL for the selected environment.
cat >"$WORKSPACE_DIR/main.tf" <<EOF
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
backend "s3" {
bucket = "${PROJECT_NAME}-terraform-state"
key = "${ENVIRONMENT}/terraform.tfstate"
region = "${REGION}"
}
}
provider "aws" {
region = var.aws_region
}
# Variables
variable "aws_region" {
description = "AWS region"
type = string
default = "${REGION}"
}
variable "environment" {
description = "Environment name"
type = string
default = "${ENVIRONMENT}"
}
variable "project_name" {
description = "Project name"
type = string
default = "${PROJECT_NAME}"
}
# VPC and Networking
module "vpc" {
source = "../../modules/vpc"
environment = var.environment
project_name = var.project_name
aws_region = var.aws_region
}
# EKS Cluster
module "eks" {
source = "../../modules/eks"
environment = var.environment
project_name = var.project_name
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
depends_on = [module.vpc]
}
# RDS Database
module "database" {
source = "../../modules/rds"
environment = var.environment
project_name = var.project_name
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.database_subnet_ids
security_groups = [module.eks.database_security_group_id]
depends_on = [module.vpc]
}
# S3 Buckets
module "storage" {
source = "../../modules/s3"
environment = var.environment
project_name = var.project_name
}
# Outputs
output "eks_cluster_name" {
value = module.eks.cluster_name
}
output "database_endpoint" {
value = module.database.endpoint
}
output "s3_bucket_names" {
value = module.storage.bucket_names
}
EOF
# Initialize and apply Terraform
# Plans first, then applies only after explicit operator confirmation.
cd "$WORKSPACE_DIR"
log_info "Initializing Terraform..."
terraform init
log_info "Planning Terraform deployment..."
terraform plan -out=tfplan
echo -e "${YELLOW}Review the Terraform plan above.${NC}"
# -r keeps read from treating backslashes in the reply specially
read -r -p "Proceed with deployment? (y/N): " confirm
if [[ $confirm == [yY] ]]; then
log_info "Applying Terraform configuration..."
terraform apply tfplan
# Save outputs for downstream tooling
terraform output -json >outputs.json
log_success "Environment $ENVIRONMENT provisioned successfully!"
# Configure kubectl only when an EKS cluster name was actually output;
# quoting "$REGION" and the cluster name keeps unusual values intact.
cluster_name=$(terraform output -raw eks_cluster_name 2>/dev/null || true)
if [[ -n "$cluster_name" ]]; then
log_info "Configuring kubectl..."
aws eks update-kubeconfig --region "$REGION" --name "$cluster_name"
fi
else
log_info "Deployment cancelled"
rm -f tfplan
fi

infrastructure/k8s/cluster-manager.sh
#!/bin/bash
# Kubernetes cluster management
# Interactive, menu-driven helper for day-to-day kubectl operations.
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Shared color variables (BLUE, RED, NC, ...) and log_* helpers.
source "$SCRIPT_DIR/../../utilities/colors.sh"
source "$SCRIPT_DIR/../../utilities/logging.sh"
# Print the interactive menu: a colored header followed by the numbered
# options, one per line (0 exits).
show_menu() {
    printf '%b\n' "${BLUE}=== Kubernetes Cluster Manager ===${NC}"
    local entries=(
        "1) List clusters"
        "2) Switch context"
        "3) Deploy application"
        "4) Scale deployment"
        "5) View logs"
        "6) Port forward"
        "7) Execute command in pod"
        "8) Cluster health check"
        "9) Resource usage"
        "10) Backup ETCD"
        "0) Exit"
    )
    printf '%s\n' "${entries[@]}"
}
# Print all kubectl contexts configured on this machine.
list_clusters() {
log_info "Available clusters:"
kubectl config get-contexts
}
# Interactively switch the active kubectl context.
switch_context() {
    kubectl config get-contexts
    # -r: take the typed context name literally (no backslash processing)
    read -r -p "Enter context name: " context
    kubectl config use-context "$context"
    log_success "Switched to context: $context"
}
# Apply Kubernetes manifests from a file or directory into a namespace.
deploy_application() {
    echo -e "${BLUE}Application Deployment${NC}"
    read -r -p "Namespace: " namespace
    read -r -p "Deployment file/directory: " deploy_path
    # kubectl apply accepts both files and directories, so one existence
    # check replaces the previously duplicated -d/-f branches.
    if [[ -e "$deploy_path" ]]; then
        kubectl apply -f "$deploy_path" -n "$namespace"
    else
        log_error "Deploy path not found: $deploy_path"
        return 1
    fi
    log_success "Application deployed to namespace: $namespace"
}
# Scale a deployment to a user-supplied replica count.
scale_deployment() {
    kubectl get deployments --all-namespaces
    read -r -p "Namespace: " namespace
    read -r -p "Deployment name: " deployment
    read -r -p "Replica count: " replicas
    # Validate before touching the cluster: kubectl would reject bad input,
    # but failing fast gives a clearer message.
    if ! [[ "$replicas" =~ ^[0-9]+$ ]]; then
        log_error "Replica count must be a non-negative integer: $replicas"
        return 1
    fi
    kubectl scale deployment "$deployment" --replicas="$replicas" -n "$namespace"
    log_success "Deployment $deployment scaled to $replicas replicas"
}
# Tail (-f) logs of the first pod matching a full or partial name.
view_logs() {
    kubectl get pods --all-namespaces
    read -r -p "Namespace: " namespace
    read -r -p "Pod name (or partial name): " pod_name
    # --no-headers keeps the header row from ever matching the search;
    # grep -F -- treats the input as a literal string, not a regex/flag.
    full_pod_name=$(kubectl get pods -n "$namespace" --no-headers | grep -F -- "$pod_name" | head -1 | awk '{print $1}')
    if [[ -n "$full_pod_name" ]]; then
        echo -e "${BLUE}Logs for $full_pod_name:${NC}"
        kubectl logs -f "$full_pod_name" -n "$namespace"
    else
        log_error "Pod not found: $pod_name"
    fi
}
# Forward a local port to a service port; blocks until interrupted.
port_forward() {
    kubectl get services --all-namespaces
    read -r -p "Namespace: " namespace
    read -r -p "Service name: " service
    read -r -p "Local port: " local_port
    read -r -p "Service port: " service_port
    log_info "Port forwarding $local_port:$service_port for service $service"
    kubectl port-forward service/"$service" "$local_port:$service_port" -n "$namespace"
}
# Run a command (default: interactive /bin/bash) inside a pod.
exec_pod_command() {
    kubectl get pods --all-namespaces
    read -r -p "Namespace: " namespace
    read -r -p "Pod name: " pod_name
    read -r -p "Command (default: /bin/bash): " command
    command=${command:-"/bin/bash"}
    # $command is intentionally unquoted so "ls -la /tmp" splits into words.
    kubectl exec -it "$pod_name" -n "$namespace" -- $command
}
# Print an at-a-glance cluster summary: nodes, system pods, cluster info.
cluster_health_check() {
log_info "Cluster Health Check"
echo -e "\n${BLUE}Node Status:${NC}"
kubectl get nodes
echo -e "\n${BLUE}System Pods:${NC}"
kubectl get pods -n kube-system
echo -e "\n${BLUE}Cluster Info:${NC}"
kubectl cluster-info
echo -e "\n${BLUE}Resource Usage:${NC}"
# `kubectl top` requires metrics-server; degrade gracefully without it.
kubectl top nodes 2>/dev/null || echo "Metrics server not available"
}
# Show node/pod resource consumption plus PVCs and storage classes.
resource_usage() {
log_info "Resource Usage Report"
echo -e "\n${BLUE}Node Resource Usage:${NC}"
kubectl top nodes 2>/dev/null || echo "Metrics server not available"
echo -e "\n${BLUE}Pod Resource Usage (Top 10):${NC}"
# head -11 = header row + top 10 pods by CPU usage.
kubectl top pods --all-namespaces --sort-by=cpu 2>/dev/null | head -11 || echo "Metrics server not available"
echo -e "\n${BLUE}Persistent Volume Claims:${NC}"
kubectl get pvc --all-namespaces
echo -e "\n${BLUE}Storage Classes:${NC}"
kubectl get storageclass
}
# Snapshot ETCD to a timestamped file in /tmp using etcdctl (v3 API).
backup_etcd() {
log_info "ETCD Backup"
timestamp=$(date +%Y%m%d_%H%M%S)
backup_path="/tmp/etcd-backup-$timestamp"
# This assumes ETCD is accessible - adjust for your setup
# NOTE(review): endpoint and cert paths match a kubeadm control-plane
# node — confirm before relying on this on other cluster layouts.
ETCDCTL_API=3 etcdctl snapshot save "$backup_path.db" \
--endpoints=https://127.0.0.1:2379 \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key
log_success "ETCD backup saved to: $backup_path.db"
}
# Main menu loop
# Dispatch on the chosen option until the user picks 0 (exit). Both reads
# use -r so typed input is taken literally (no backslash processing).
while true; do
    show_menu
    read -r -p "Choose option: " choice
    case $choice in
    1) list_clusters ;;
    2) switch_context ;;
    3) deploy_application ;;
    4) scale_deployment ;;
    5) view_logs ;;
    6) port_forward ;;
    7) exec_pod_command ;;
    8) cluster_health_check ;;
    9) resource_usage ;;
    10) backup_etcd ;;
    0) exit 0 ;;
    *) echo -e "${RED}Invalid option${NC}" ;;
    esac
    echo ""
    read -r -p "Press Enter to continue..."
    clear
done

deployment/deploy-app.sh
#!/bin/bash
# Multi-environment application deployment
# Usage: deploy-app.sh [app-name] [environment] [version]
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"
# Configuration
APP_NAME=${1:-"myapp"}
ENVIRONMENT=${2:-"development"}
VERSION=${3:-"latest"}
NAMESPACE="$APP_NAME-$ENVIRONMENT"
# Environment-specific configurations
# Replica count and resource requests/limits scale up from dev to prod;
# any other environment name is rejected.
case $ENVIRONMENT in
"development")
REPLICAS=1
RESOURCES_REQUESTS_CPU="100m"
RESOURCES_REQUESTS_MEMORY="128Mi"
RESOURCES_LIMITS_CPU="500m"
RESOURCES_LIMITS_MEMORY="512Mi"
;;
"staging")
REPLICAS=2
RESOURCES_REQUESTS_CPU="200m"
RESOURCES_REQUESTS_MEMORY="256Mi"
RESOURCES_LIMITS_CPU="1000m"
RESOURCES_LIMITS_MEMORY="1Gi"
;;
"production")
REPLICAS=3
RESOURCES_REQUESTS_CPU="500m"
RESOURCES_REQUESTS_MEMORY="512Mi"
RESOURCES_LIMITS_CPU="2000m"
RESOURCES_LIMITS_MEMORY="2Gi"
;;
*)
log_error "Unknown environment: $ENVIRONMENT"
exit 1
;;
esac
log_info "Deploying $APP_NAME version $VERSION to $ENVIRONMENT"
# Pre-deployment checks
# Verify cluster access and ensure the target namespace exists; pinned
# (non-latest) versions are logged for image verification.
check_prerequisites() {
log_info "Running pre-deployment checks..."
# Check kubectl access
if ! kubectl cluster-info &>/dev/null; then
log_error "Cannot connect to Kubernetes cluster"
exit 1
fi
# Check if namespace exists
if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
log_info "Creating namespace: $NAMESPACE"
kubectl create namespace "$NAMESPACE"
fi
# Check if image exists
if [[ "$VERSION" != "latest" ]]; then
# This would typically check your container registry
log_info "Verifying image: $APP_NAME:$VERSION"
fi
log_success "Pre-deployment checks passed"
}
# Generate Kubernetes manifests
# Renders Deployment/Service (plus Ingress for production) YAML into a
# temp directory using the environment-specific settings computed above.
generate_manifests() {
log_info "Generating Kubernetes manifests..."
MANIFEST_DIR="/tmp/$APP_NAME-$ENVIRONMENT-deploy"
mkdir -p "$MANIFEST_DIR"
# Deployment manifest
# Unquoted EOF: shell variables expand inside the YAML below.
cat >"$MANIFEST_DIR/deployment.yaml" <<EOF
apiVersion: apps/v1
kind: Deployment
metadata:
name: $APP_NAME
namespace: $NAMESPACE
labels:
app: $APP_NAME
environment: $ENVIRONMENT
spec:
replicas: $REPLICAS
selector:
matchLabels:
app: $APP_NAME
template:
metadata:
labels:
app: $APP_NAME
environment: $ENVIRONMENT
spec:
containers:
- name: $APP_NAME
image: $APP_NAME:$VERSION
ports:
- containerPort: 3000
env:
- name: NODE_ENV
value: $ENVIRONMENT
- name: PORT
value: "3000"
resources:
requests:
cpu: $RESOURCES_REQUESTS_CPU
memory: $RESOURCES_REQUESTS_MEMORY
limits:
cpu: $RESOURCES_LIMITS_CPU
memory: $RESOURCES_LIMITS_MEMORY
livenessProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /ready
port: 3000
initialDelaySeconds: 5
periodSeconds: 5
EOF
# Service manifest
cat >"$MANIFEST_DIR/service.yaml" <<EOF
apiVersion: v1
kind: Service
metadata:
name: $APP_NAME-service
namespace: $NAMESPACE
labels:
app: $APP_NAME
spec:
selector:
app: $APP_NAME
ports:
- protocol: TCP
port: 80
targetPort: 3000
type: ClusterIP
EOF
# Ingress manifest (for production)
if [[ "$ENVIRONMENT" == "production" ]]; then
cat >"$MANIFEST_DIR/ingress.yaml" <<EOF
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: $APP_NAME-ingress
namespace: $NAMESPACE
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
spec:
rules:
- host: $APP_NAME.company.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: $APP_NAME-service
port:
number: 80
EOF
fi
log_success "Manifests generated in $MANIFEST_DIR"
}
# Deploy application
# Apply the generated manifests and block until the rollout completes.
deploy_application() {
    log_info "Deploying application to Kubernetes..."
    # Apply manifests
    kubectl apply -f "$MANIFEST_DIR/"
    # Wait for deployment to be ready (fail after 5 minutes)
    log_info "Waiting for deployment to be ready..."
    # Quote the deployment name so unusual app names survive intact.
    kubectl rollout status deployment/"$APP_NAME" -n "$NAMESPACE" --timeout=300s
    # Verify deployment
    log_info "Verifying deployment..."
    kubectl get pods -n "$NAMESPACE" -l app="$APP_NAME"
    log_success "Deployment completed successfully!"
}
# Post-deployment tests
# Hit the app's /health endpoint. In non-production a temporary
# port-forward on :8080 stands in for the public endpoint.
run_post_deployment_tests() {
log_info "Running post-deployment tests..."
# Get service endpoint
if [[ "$ENVIRONMENT" == "production" ]]; then
ENDPOINT="http://$APP_NAME.company.com"
else
# Port forward for testing
kubectl port-forward service/$APP_NAME-service 8080:80 -n "$NAMESPACE" &
PORT_FORWARD_PID=$!
sleep 5
ENDPOINT="http://localhost:8080"
fi
# Health check
if curl -f "$ENDPOINT/health" &>/dev/null; then
log_success "Health check passed"
else
log_error "Health check failed"
cleanup_and_exit 1
fi
# Cleanup port forward
# NOTE(review): in the production branch PORT_FORWARD_PID is never set,
# so this kill is a no-op there (script uses set -e but not set -u).
if [[ -n "$PORT_FORWARD_PID" ]]; then
kill $PORT_FORWARD_PID 2>/dev/null || true
fi
log_success "Post-deployment tests passed"
}
# Cleanup function
# Kill any background port-forward, remove temp manifests, then exit.
# When invoked by the EXIT trap with no argument, default to the in-flight
# exit status ($?) instead of 0 — previously every exit path (including
# set -e failures) was masked to status 0.
cleanup_and_exit() {
    local exit_code=${1:-$?}
    # Kill port forward if running
    if [[ -n "${PORT_FORWARD_PID:-}" ]]; then
        kill "$PORT_FORWARD_PID" 2>/dev/null || true
    fi
    # Clean up temporary files (MANIFEST_DIR may not be set yet)
    rm -rf "${MANIFEST_DIR:-}" 2>/dev/null || true
    exit "$exit_code"
}
# Main deployment flow
# Orchestrates the full deploy: checks -> manifests -> apply -> smoke tests.
main() {
check_prerequisites
generate_manifests
deploy_application
run_post_deployment_tests
log_success "🚀 Deployment of $APP_NAME:$VERSION to $ENVIRONMENT completed successfully!"
# Show access information
echo -e "\n${BLUE}Access Information:${NC}"
if [[ "$ENVIRONMENT" == "production" ]]; then
echo " URL: http://$APP_NAME.company.com"
else
echo " Port forward: kubectl port-forward service/$APP_NAME-service 8080:80 -n $NAMESPACE"
echo " Then access: http://localhost:8080"
fi
}
# Trap for cleanup
# cleanup_and_exit runs on every exit path, success or failure.
trap cleanup_and_exit EXIT
# Run main function
main

deployment/blue-green-deploy.sh
#!/bin/bash
# Blue-green deployment strategy
# Usage: blue-green-deploy.sh [app-name] [environment] [new-version]
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"
APP_NAME=${1:-"myapp"}
ENVIRONMENT=${2:-"production"}
NEW_VERSION=${3:-"latest"}
NAMESPACE="$APP_NAME-$ENVIRONMENT"
# Determine current and new colors
# The live color comes from the service selector; default to "blue" when
# the service does not exist yet (first deployment).
CURRENT_COLOR=$(kubectl get service "$APP_NAME-service" -n "$NAMESPACE" -o jsonpath='{.spec.selector.color}' 2>/dev/null || echo "blue")
if [[ "$CURRENT_COLOR" == "blue" ]]; then
NEW_COLOR="green"
else
NEW_COLOR="blue"
fi
log_info "Blue-Green Deployment: $APP_NAME:$NEW_VERSION"
log_info "Current active: $CURRENT_COLOR, Deploying to: $NEW_COLOR"
# Deploy new version to inactive color
# Renders a color-suffixed Deployment manifest and applies it via stdin.
deploy_new_version() {
log_info "Deploying new version to $NEW_COLOR environment..."
# Create deployment with new color
# Unquoted EOF: shell variables expand into the manifest below.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: $APP_NAME-$NEW_COLOR
namespace: $NAMESPACE
labels:
app: $APP_NAME
color: $NEW_COLOR
spec:
replicas: 3
selector:
matchLabels:
app: $APP_NAME
color: $NEW_COLOR
template:
metadata:
labels:
app: $APP_NAME
color: $NEW_COLOR
spec:
containers:
- name: $APP_NAME
image: $APP_NAME:$NEW_VERSION
ports:
- containerPort: 3000
env:
- name: NODE_ENV
value: $ENVIRONMENT
livenessProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 30
readinessProbe:
httpGet:
path: /ready
port: 3000
initialDelaySeconds: 10
EOF
# Wait for new deployment
kubectl rollout status deployment/$APP_NAME-$NEW_COLOR -n "$NAMESPACE" --timeout=300s
log_success "New version deployed to $NEW_COLOR"
}
# Test new version
# Smoke-test the freshly deployed color via a temporary port-forward on
# :9090. Returns non-zero if the health check fails.
test_new_version() {
    log_info "Testing new version on $NEW_COLOR..."
    # Port forward to test the new version
    kubectl port-forward deployment/$APP_NAME-$NEW_COLOR 9090:3000 -n "$NAMESPACE" &
    PORT_FORWARD_PID=$!
    sleep 10
    # Run tests
    if curl -f http://localhost:9090/health &>/dev/null; then
        log_success "Health check passed for $NEW_COLOR"
    else
        log_error "Health check failed for $NEW_COLOR"
        # Guard the kill: the port-forward may already have exited, and an
        # unguarded failure here would abort the script under set -e.
        kill "$PORT_FORWARD_PID" 2>/dev/null || true
        return 1
    fi
    # Additional smoke tests (non-fatal)
    if curl -f http://localhost:9090/api/status &>/dev/null; then
        log_success "API status check passed for $NEW_COLOR"
    else
        log_warning "API status check failed for $NEW_COLOR"
    fi
    kill "$PORT_FORWARD_PID" 2>/dev/null || true
    log_success "New version testing completed"
}
# Switch traffic to new version
# Repoint the stable service's selector at the new color, then re-read
# the selector to verify the switch actually took effect.
switch_traffic() {
log_info "Switching traffic from $CURRENT_COLOR to $NEW_COLOR..."
# Update service selector
kubectl patch service "$APP_NAME-service" -n "$NAMESPACE" -p '{"spec":{"selector":{"color":"'$NEW_COLOR'"}}}'
# Verify traffic switch
sleep 5
ACTIVE_COLOR=$(kubectl get service "$APP_NAME-service" -n "$NAMESPACE" -o jsonpath='{.spec.selector.color}')
if [[ "$ACTIVE_COLOR" == "$NEW_COLOR" ]]; then
log_success "Traffic successfully switched to $NEW_COLOR"
else
log_error "Traffic switch failed"
return 1
fi
}
# Cleanup old version
# Optionally delete the previous color's deployment after a confirmed
# switch; keeping it preserves instant-rollback capability.
cleanup_old_version() {
    echo -e "${YELLOW}Do you want to remove the old version ($CURRENT_COLOR)? This cannot be undone.${NC}"
    read -r -p "Remove old version? (y/N): " confirm
    if [[ $confirm == [yY] ]]; then
        log_info "Removing old version ($CURRENT_COLOR)..."
        kubectl delete deployment "$APP_NAME-$CURRENT_COLOR" -n "$NAMESPACE"
        log_success "Old version removed"
    else
        log_info "Old version kept for rollback capability"
        echo "To remove later: kubectl delete deployment $APP_NAME-$CURRENT_COLOR -n $NAMESPACE"
    fi
}
# Rollback function
# Point the service selector back at the previously active color.
rollback() {
log_warning "Rolling back to $CURRENT_COLOR..."
kubectl patch service "$APP_NAME-service" -n "$NAMESPACE" -p '{"spec":{"selector":{"color":"'$CURRENT_COLOR'"}}}'
log_success "Rollback completed"
}
# Main deployment flow
# deploy to idle color -> smoke test -> switch traffic -> verify -> cleanup,
# rolling back automatically when the final verification fails.
main() {
# Check if current deployment exists
if ! kubectl get deployment "$APP_NAME-$CURRENT_COLOR" -n "$NAMESPACE" &>/dev/null; then
log_error "No current deployment found. Use regular deployment first."
exit 1
fi
deploy_new_version
if test_new_version; then
switch_traffic
# Final verification
log_info "Performing final verification..."
sleep 10
# NOTE(review): hard-codes the production hostname — confirm this is the
# right check for non-production environments.
if curl -f "http://$APP_NAME.company.com/health" &>/dev/null; then
log_success "🎉 Blue-green deployment completed successfully!"
cleanup_old_version
else
log_error "Final verification failed, rolling back..."
rollback
exit 1
fi
else
log_error "New version testing failed, keeping current version active"
kubectl delete deployment "$APP_NAME-$NEW_COLOR" -n "$NAMESPACE"
exit 1
fi
}
# Trap for emergency rollback
# On Ctrl-C/TERM mid-deploy, flip the service selector back to the old color.
trap 'echo -e "\n${RED}Deployment interrupted! Rolling back...${NC}"; rollback' INT TERM
main

monitoring/system-monitor.sh
#!/bin/bash
# Comprehensive system monitoring
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"
# Configuration
# Alert thresholds are percentages; Slack delivery is optional and
# enabled by exporting SLACK_WEBHOOK_URL.
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEMORY=85
ALERT_THRESHOLD_DISK=90
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"
# Log a warning and optionally forward the alert to Slack.
# $1 - message text; $2 - severity label (default: WARNING)
send_alert() {
    local message="$1"
    local severity="${2:-WARNING}"
    log_warning "$message"
    if [[ -n "$SLACK_WEBHOOK_URL" ]]; then
        # -sS: silence progress noise but keep errors. The || guard keeps a
        # Slack/network outage from aborting the whole monitor under set -e.
        curl -sS -X POST -H 'Content-type: application/json' \
            --data "{\"text\":\"🚨 [$severity] $message\"}" \
            "$SLACK_WEBHOOK_URL" || log_warning "Failed to deliver Slack alert"
    fi
}
# Check CPU, memory, and disk usage against the alert thresholds, then
# print a human-readable summary.
check_system_resources() {
log_info "Checking system resources..."
# CPU usage
# NOTE(review): parsing `top` output is format-dependent; this assumes a
# GNU procps "Cpu(s): x.y%us, ..." layout — confirm on target hosts.
cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
cpu_usage=${cpu_usage%.*} # Remove decimal
if [[ $cpu_usage -gt $ALERT_THRESHOLD_CPU ]]; then
send_alert "High CPU usage: ${cpu_usage}%" "CRITICAL"
fi
# Memory usage
memory_info=$(free | grep Mem)
total_memory=$(echo $memory_info | awk '{print $2}')
used_memory=$(echo $memory_info | awk '{print $3}')
memory_usage=$((used_memory * 100 / total_memory))
if [[ $memory_usage -gt $ALERT_THRESHOLD_MEMORY ]]; then
send_alert "High memory usage: ${memory_usage}%" "CRITICAL"
fi
# Disk usage
# Alert per real (/dev/...) filesystem that exceeds the threshold.
while IFS= read -r line; do
usage=$(echo "$line" | awk '{print $5}' | sed 's/%//')
mount=$(echo "$line" | awk '{print $6}')
if [[ $usage -gt $ALERT_THRESHOLD_DISK ]]; then
send_alert "High disk usage: ${usage}% on ${mount}" "CRITICAL"
fi
done < <(df -h | grep -E '^/dev/')
echo -e "${BLUE}System Resources:${NC}"
echo " CPU Usage: ${cpu_usage}%"
echo " Memory Usage: ${memory_usage}%"
echo " Disk Usage:"
df -h | grep -E '^/dev/' | while read line; do
echo " $line"
done
}
# Verify each critical systemd service is active; alert on any that is not.
check_services() {
log_info "Checking critical services..."
# Define critical services
CRITICAL_SERVICES=("docker" "nginx" "postgresql" "redis")
for service in "${CRITICAL_SERVICES[@]}"; do
if systemctl is-active --quiet "$service"; then
echo -e " ${GREEN}✓${NC} $service is running"
else
send_alert "Service $service is not running" "CRITICAL"
echo -e " ${RED}✗${NC} $service is not running"
fi
done
}
# Check Kubernetes cluster health (skipped entirely when kubectl is absent):
# counts not-ready nodes, failed pods, and pending pods, alerting on each.
check_kubernetes_health() {
    if command -v kubectl &>/dev/null; then
        log_info "Checking Kubernetes cluster health..."
        # Check node status by comparing the STATUS column for exact
        # equality. The previous `grep -v Ready` also discarded "NotReady"
        # lines (substring match), so unhealthy nodes were never counted.
        not_ready_nodes=$(kubectl get nodes --no-headers | awk '$2 != "Ready"' | wc -l)
        if [[ $not_ready_nodes -gt 0 ]]; then
            send_alert "$not_ready_nodes Kubernetes nodes are not ready" "CRITICAL"
        fi
        # Check failed pods
        failed_pods=$(kubectl get pods --all-namespaces --field-selector=status.phase=Failed --no-headers | wc -l)
        if [[ $failed_pods -gt 0 ]]; then
            send_alert "$failed_pods pods are in Failed state" "WARNING"
        fi
        # Check pending pods (a handful pending is normal during rollouts)
        pending_pods=$(kubectl get pods --all-namespaces --field-selector=status.phase=Pending --no-headers | wc -l)
        if [[ $pending_pods -gt 5 ]]; then
            send_alert "$pending_pods pods are in Pending state" "WARNING"
        fi
        echo -e "${BLUE}Kubernetes Status:${NC}"
        echo " Not Ready Nodes: $not_ready_nodes"
        echo " Failed Pods: $failed_pods"
        echo " Pending Pods: $pending_pods"
    fi
}
# Probe each known HTTP health endpoint; alert on any failure.
check_application_health() {
log_info "Checking application health endpoints..."
# Define application health endpoints
# NOTE(review): this list is deployment-specific — keep it in sync with
# the services actually running on this host.
HEALTH_ENDPOINTS=(
"http://localhost:3000/health"
"http://localhost:8080/actuator/health"
"http://api.company.com/health"
)
for endpoint in "${HEALTH_ENDPOINTS[@]}"; do
if curl -f --max-time 10 "$endpoint" &>/dev/null; then
echo -e " ${GREEN}✓${NC} $endpoint is healthy"
else
send_alert "Health check failed for $endpoint" "CRITICAL"
echo -e " ${RED}✗${NC} $endpoint is unhealthy"
fi
done
}
# Scan system (and optional application) logs for recent error volume.
check_log_errors() {
log_info "Checking for recent errors in logs..."
# Check system logs for errors in the last hour
# NOTE(review): wc -l counts raw journalctl output lines, which may
# include a "-- No entries --" placeholder — confirm threshold accuracy.
error_count=$(journalctl --since "1 hour ago" --priority=err --no-pager | wc -l)
if [[ $error_count -gt 10 ]]; then
send_alert "$error_count error messages in system logs in the last hour" "WARNING"
fi
# Check application logs if they exist
if [[ -f "/var/log/application.log" ]]; then
app_errors=$(grep -i error /var/log/application.log | grep "$(date '+%Y-%m-%d %H')" | wc -l)
if [[ $app_errors -gt 5 ]]; then
send_alert "$app_errors application errors in the last hour" "WARNING"
fi
fi
echo -e "${BLUE}Log Analysis:${NC}"
echo " System errors (last hour): $error_count"
if [[ -f "/var/log/application.log" ]]; then
echo " Application errors (last hour): $app_errors"
fi
}
# Run every check, mirroring output to the terminal and a timestamped
# report file under /tmp via tee.
generate_report() {
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
local report_file="/tmp/system-monitor-report-$(date +%Y%m%d-%H%M).txt"
{
echo "System Monitor Report - $timestamp"
echo "======================================="
echo ""
check_system_resources
echo ""
check_services
echo ""
check_kubernetes_health
echo ""
check_application_health
echo ""
check_log_errors
} | tee "$report_file"
log_success "Report saved to: $report_file"
}
# Main monitoring function
main() {
# NOTE(review): BBLUE is presumably a bold-blue variant from colors.sh —
# confirm it is defined there (the rest of this file uses BLUE).
echo -e "${BBLUE}=== System Monitor ===${NC}\n"
generate_report
log_success "System monitoring completed"
}
main

incident-response/emergency-toolkit.sh
#!/bin/bash
# Emergency incident response toolkit
# Interactive menu of guarded, production-affecting recovery actions.
# Destructive actions require typing the literal word EMERGENCY and are
# appended to /tmp/emergency-actions.log for auditing.
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../utilities/colors.sh"
source "$SCRIPT_DIR/../utilities/logging.sh"
# Print the emergency menu: a colored banner, a caution line followed by a
# blank line, then the numbered actions (0 exits).
show_emergency_menu() {
    printf '%b\n' "${BRED}=== EMERGENCY RESPONSE TOOLKIT ===${NC}"
    printf '%b\n' "${YELLOW}Use with caution - these actions affect production!${NC}\n"
    local actions=(
        "1) Scale down problematic service"
        "2) Rollback deployment"
        "3) Restart failed pods"
        "4) Enable maintenance mode"
        "5) Emergency log collection"
        "6) Database emergency backup"
        "7) Traffic rerouting"
        "8) Resource emergency scaling"
        "9) Complete system status"
        "0) Exit"
    )
    printf '%s\n' "${actions[@]}"
}
# Emergency: scale a deployment up or down (0 stops it entirely).
# Requires typing the literal word EMERGENCY; the action is audit-logged.
scale_down_service() {
    echo -e "${BRED}EMERGENCY: Scale down service${NC}"
    kubectl get deployments --all-namespaces
    read -r -p "Namespace: " namespace
    read -r -p "Deployment name: " deployment
    read -r -p "Scale to replicas (0 to stop): " replicas
    # Refuse non-numeric replica counts before prompting for confirmation.
    if ! [[ "$replicas" =~ ^[0-9]+$ ]]; then
        log_error "Replica count must be a non-negative integer: $replicas"
        return 1
    fi
    echo -e "${YELLOW}This will immediately scale $deployment to $replicas replicas!${NC}"
    read -r -p "Confirm (type 'EMERGENCY'): " confirm
    if [[ "$confirm" == "EMERGENCY" ]]; then
        kubectl scale deployment "$deployment" --replicas="$replicas" -n "$namespace"
        log_success "Emergency scaling completed"
        # Log the incident
        echo "$(date): EMERGENCY SCALE - $deployment scaled to $replicas replicas by $USER" >>/tmp/emergency-actions.log
    else
        log_info "Emergency scaling cancelled"
    fi
}
# Emergency: roll a deployment back to a previous revision.
# Blank revision input lets kubectl pick the immediately previous one.
emergency_rollback() {
    echo -e "${BRED}EMERGENCY: Rollback deployment${NC}"
    kubectl get deployments --all-namespaces
    read -r -p "Namespace: " namespace
    read -r -p "Deployment name: " deployment
    # Show rollout history
    echo -e "\n${BLUE}Rollout history:${NC}"
    kubectl rollout history deployment/"$deployment" -n "$namespace"
    read -r -p "Revision to rollback to (blank for previous): " revision
    echo -e "${YELLOW}This will immediately rollback $deployment!${NC}"
    read -r -p "Confirm (type 'EMERGENCY'): " confirm
    if [[ "$confirm" == "EMERGENCY" ]]; then
        if [[ -n "$revision" ]]; then
            kubectl rollout undo deployment/"$deployment" --to-revision="$revision" -n "$namespace"
        else
            kubectl rollout undo deployment/"$deployment" -n "$namespace"
        fi
        kubectl rollout status deployment/"$deployment" -n "$namespace"
        log_success "Emergency rollback completed"
        # Log the incident
        echo "$(date): EMERGENCY ROLLBACK - $deployment rolled back by $USER" >>/tmp/emergency-actions.log
    else
        log_info "Emergency rollback cancelled"
    fi
}
# Emergency: delete every pod stuck in the Failed phase so controllers
# recreate them. No-op (with a message) when none are found.
restart_failed_pods() {
    echo -e "${BRED}EMERGENCY: Restart failed pods${NC}"
    # Find failed pods
    failed_pods=$(kubectl get pods --all-namespaces --field-selector=status.phase=Failed --no-headers)
    if [[ -z "$failed_pods" ]]; then
        log_info "No failed pods found"
        return
    fi
    echo -e "${BLUE}Failed pods:${NC}"
    echo "$failed_pods"
    read -r -p "Restart all failed pods? (y/N): " confirm
    if [[ $confirm == [yY] ]]; then
        # Only the first two columns (NAMESPACE, NAME) matter; `_` soaks up
        # the remaining READY/STATUS/RESTARTS/AGE columns.
        echo "$failed_pods" | while read -r namespace name _; do
            kubectl delete pod "$name" -n "$namespace"
            log_info "Deleted failed pod: $name in $namespace"
        done
        log_success "Failed pods restarted"
        echo "$(date): EMERGENCY RESTART - Failed pods restarted by $USER" >>/tmp/emergency-actions.log
    fi
}
# Put production behind a maintenance page by repointing the main ingress
# at the maintenance service. A one-shot restore script is written to
# /tmp/disable-maintenance.sh.
enable_maintenance_mode() {
echo -e "${BRED}EMERGENCY: Enable maintenance mode${NC}"
# This assumes you have a maintenance page setup
# NOTE(review): the duration is informational only — nothing here
# automatically disables maintenance mode after it elapses.
read -p "Duration in minutes: " duration
echo -e "${YELLOW}This will put the system in maintenance mode for $duration minutes!${NC}"
read -p "Confirm (type 'EMERGENCY'): " confirm
if [[ "$confirm" == "EMERGENCY" ]]; then
# Update ingress to point to maintenance page
kubectl patch ingress app-ingress -n production -p '{
"spec": {
"rules": [{
"host": "app.company.com",
"http": {
"paths": [{
"path": "/",
"pathType": "Prefix",
"backend": {
"service": {
"name": "maintenance-service",
"port": {"number": 80}
}
}
}]
}
}]
}
}'
log_success "Maintenance mode enabled for $duration minutes"
# Set reminder to disable maintenance mode
# Writes a one-shot script that restores the app-service backend.
echo "#!/bin/bash" >/tmp/disable-maintenance.sh
echo "kubectl patch ingress app-ingress -n production --type=json -p='[{\"op\": \"replace\", \"path\": \"/spec/rules/0/http/paths/0/backend/service/name\", \"value\": \"app-service\"}]'" >>/tmp/disable-maintenance.sh
chmod +x /tmp/disable-maintenance.sh
echo "$(date): EMERGENCY MAINTENANCE - Maintenance mode enabled for $duration minutes by $USER" >>/tmp/emergency-actions.log
echo -e "${YELLOW}Remember to disable maintenance mode: /tmp/disable-maintenance.sh${NC}"
fi
}
# Collect system, Kubernetes, and application logs into a timestamped
# directory under /tmp and archive it as a tarball.
emergency_log_collection() {
echo -e "${BRED}EMERGENCY: Log collection${NC}"
timestamp=$(date +%Y%m%d_%H%M%S)
log_dir="/tmp/emergency-logs-$timestamp"
mkdir -p "$log_dir"
log_info "Collecting emergency logs to $log_dir"
# System logs
journalctl --since "2 hours ago" >"$log_dir/system.log"
# Kubernetes logs
if command -v kubectl &>/dev/null; then
kubectl get events --all-namespaces --sort-by='.lastTimestamp' >"$log_dir/k8s-events.log"
kubectl get pods --all-namespaces -o wide >"$log_dir/k8s-pods.log"
# Pod logs for failed/pending pods
kubectl get pods --all-namespaces --field-selector=status.phase=Failed -o json \
| jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name)"' \
| while read namespace pod; do
kubectl logs "$pod" -n "$namespace" >"$log_dir/pod-$namespace-$pod.log" 2>/dev/null || true
done
fi
# Application logs
if [[ -d "/var/log" ]]; then
cp /var/log/*.log "$log_dir/" 2>/dev/null || true
fi
# Create archive
tar -czf "$log_dir.tar.gz" -C "/tmp" "emergency-logs-$timestamp"
log_success "Emergency logs collected: $log_dir.tar.gz"
echo "$(date): EMERGENCY LOGS - Logs collected by $USER" >>/tmp/emergency-actions.log
}
# Main emergency menu
# Dispatch on the chosen action until the operator picks 0 (exit). Both
# reads use -r so typed input is taken literally.
while true; do
    show_emergency_menu
    read -r -p "Choose emergency action: " choice
    case $choice in
    1) scale_down_service ;;
    2) emergency_rollback ;;
    3) restart_failed_pods ;;
    4) enable_maintenance_mode ;;
    5) emergency_log_collection ;;
    6) echo "Database backup feature - implement based on your DB" ;;
    7) echo "Traffic rerouting feature - implement based on your setup" ;;
    8) echo "Resource scaling feature - implement based on your infrastructure" ;;
    9) dotrun run monitoring/system-monitor ;;
    0) exit 0 ;;
    *) echo -e "${RED}Invalid option${NC}" ;;
    esac
    echo ""
    read -r -p "Press Enter to continue..."
    clear
done

ci-cd/jenkins/Jenkinsfile
// Declarative Jenkins pipeline that delegates build/test/deploy work to
// the shared dotrun script library checked out at /opt/dotrun-scripts.
pipeline {
agent any
environment {
DOCKER_REGISTRY = 'your-registry.com'
APP_NAME = 'myapp'
DOTRUN_SCRIPTS = '/opt/dotrun-scripts'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
// Builds are tagged with BUILD_NUMBER so every artifact is traceable.
stage('Build') {
steps {
script {
sh "${DOTRUN_SCRIPTS}/ci-cd/build-application.sh ${APP_NAME} ${BUILD_NUMBER}"
}
}
}
stage('Test') {
steps {
script {
sh "${DOTRUN_SCRIPTS}/ci-cd/run-tests.sh ${APP_NAME}"
}
}
post {
always {
// NOTE(review): 'publishTestResults' is not a built-in pipeline step;
// the standard JUnit step is `junit 'test-results.xml'` — confirm a
// plugin provides this step in your Jenkins install.
publishTestResults testResultsPattern: 'test-results.xml'
}
}
}
stage('Security Scan') {
steps {
script {
sh "${DOTRUN_SCRIPTS}/ci-cd/security-scan.sh ${APP_NAME}"
}
}
}
// develop branch -> staging via the rolling deploy script
stage('Deploy to Staging') {
when {
branch 'develop'
}
steps {
script {
sh "${DOTRUN_SCRIPTS}/deployment/deploy-app.sh ${APP_NAME} staging ${BUILD_NUMBER}"
}
}
}
// master branch -> production via zero-downtime blue-green deploy
stage('Deploy to Production') {
when {
branch 'master'
}
steps {
script {
sh "${DOTRUN_SCRIPTS}/deployment/blue-green-deploy.sh ${APP_NAME} production ${BUILD_NUMBER}"
}
}
}
}
// Always notify the team of the final pipeline outcome.
post {
failure {
sh "${DOTRUN_SCRIPTS}/utilities/send-alert.sh 'Pipeline failed for ${APP_NAME}:${BUILD_NUMBER}'"
}
success {
sh "${DOTRUN_SCRIPTS}/utilities/send-alert.sh 'Pipeline succeeded for ${APP_NAME}:${BUILD_NUMBER}'"
}
}
}

This comprehensive DevOps workflow provides robust infrastructure management, automated deployments, monitoring capabilities, and emergency response tools that can be integrated into any modern DevOps pipeline.