From 001909104fc201f2b7bdadba9f5d12b46de69224 Mon Sep 17 00:00:00 2001 From: Lucas Baker Date: Mon, 19 May 2025 10:56:25 -0400 Subject: [PATCH 01/10] Added to the readme --- README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/README.md b/README.md index 047b8e4..1f12db9 100644 --- a/README.md +++ b/README.md @@ -162,12 +162,43 @@ Running **Resource Exhaustion** experiment with custom parameters: ## Viewing the Dashboard +### *Make sure your kubernetes cluster is up and running* + +1. Open a new tab on your browser and type in "localhost:32000" +2. Username: admin, Password: admin +3. Click on "Dashboards" on the left hand side +4. Click on "Capstone" +5. Click on "Database Recovery System" + +Done! ## Querying MongoDB Directly +### *Make sure your kubernetes cluster is up and running* +In your terminal... +1. Type in this command: `kubectl get pods` +2. Locate the mongodb pod and copy the name +3. Type in this command: `kubectl exec -it -- sh` +4. Log into mongosh using this command: `mongosh -u root -p` +5. When prompted, type in the password: `root` +6. From there, you can see databases with: `show dbs` +7. Type: `use metrics_db` +8. From there, you can see collections with: `show collections` +9. To view a collection, type: `db..find().pretty()` +10. At this point you can enter your query ## Querying MySQL Directly +### *Make sure your kubernetes cluster is up and running* +In your terminal... +3. Type in this command: `kubectl exec -it mysql-summary-records-0 -- sh` +4. Log into mongosh using this command: `mysql -u root -p` +5. When prompted, type in the password: `root` +6. From there, you can see databases with: `show databases;` +7. Type: `use summary_db;` +8. From there, you can see tables with: `show tables;` +9. To view a table, type: `select * from ;` +10. 
At this point you can enter your query # Credits Designed and built by Lucas Baker, Rachel Cox, Henry Hewitt, and Lukas McCain for the the February 2025 cohort of the PNC/TEKsystems Early Career SRE bootcamp. From 9bbd3548e56a23875b4699e1e277f37c04daed5c Mon Sep 17 00:00:00 2001 From: Lukas Date: Mon, 19 May 2025 10:41:33 -0400 Subject: [PATCH 02/10] fixed the issue with the replica not replicating the primary --- docker/app/mysql-proxy-server.py | 1 + docker/mysql-primary/Dockerfile | 4 ++++ docker/mysql-primary/init-replica-user.sql | 3 +++ docker/mysql-replica/Dockerfile | 5 +++++ docker/mysql-replica/replication-init.sh | 22 ++++++++++++++++++++ docker/mysql-replica/test-log.sh | 2 ++ k8s/deployments/app-deployment.yaml | 2 +- k8s/statefulsets/mysql-proxy-deployment.yaml | 18 ++++++++++------ 8 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 docker/mysql-primary/Dockerfile create mode 100644 docker/mysql-primary/init-replica-user.sql create mode 100644 docker/mysql-replica/Dockerfile create mode 100644 docker/mysql-replica/replication-init.sh create mode 100644 docker/mysql-replica/test-log.sh diff --git a/docker/app/mysql-proxy-server.py b/docker/app/mysql-proxy-server.py index d344c2d..6ea793a 100644 --- a/docker/app/mysql-proxy-server.py +++ b/docker/app/mysql-proxy-server.py @@ -276,6 +276,7 @@ def configure_as_replica(replica_host, master_host): conn = mysql.connector.connect(host=replica_host, user="root", password="admin") cursor = conn.cursor() cursor.execute("STOP SLAVE;") + cursor.execute("RESET SLAVE ALL;") cursor.execute(f""" CHANGE MASTER TO MASTER_HOST='{master_host}', diff --git a/docker/mysql-primary/Dockerfile b/docker/mysql-primary/Dockerfile new file mode 100644 index 0000000..d10ef0b --- /dev/null +++ b/docker/mysql-primary/Dockerfile @@ -0,0 +1,4 @@ +FROM mysql:8.0 + +# Add an init SQL script to create the replica user +COPY init-replica-user.sql /docker-entrypoint-initdb.d/init-replica-user.sql diff --git 
a/docker/mysql-primary/init-replica-user.sql b/docker/mysql-primary/init-replica-user.sql new file mode 100644 index 0000000..21af716 --- /dev/null +++ b/docker/mysql-primary/init-replica-user.sql @@ -0,0 +1,3 @@ +CREATE USER IF NOT EXISTS 'replica_user'@'%' IDENTIFIED BY 'replica_password'; +GRANT REPLICATION SLAVE ON *.* TO 'replica_user'@'%'; +FLUSH PRIVILEGES; diff --git a/docker/mysql-replica/Dockerfile b/docker/mysql-replica/Dockerfile new file mode 100644 index 0000000..e97ad67 --- /dev/null +++ b/docker/mysql-replica/Dockerfile @@ -0,0 +1,5 @@ +FROM mysql:8.0 + +COPY replication-init.sh /docker-entrypoint-initdb.d/replication-init.sh +COPY test-log.sh /docker-entrypoint-initdb.d/test-log.sh +RUN chmod +x /docker-entrypoint-initdb.d/*.sh diff --git a/docker/mysql-replica/replication-init.sh b/docker/mysql-replica/replication-init.sh new file mode 100644 index 0000000..080ab64 --- /dev/null +++ b/docker/mysql-replica/replication-init.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +echo "[replication-init] SCRIPT STARTED" + +until mysql -h mysql-primary -uroot -p"$MYSQL_ROOT_PASSWORD" -e "SELECT 1" &>/dev/null; do + echo "[replication-init] Waiting for primary..." + sleep 3 +done + +echo "[replication-init] Connecting and starting replication..." 
+ +mysql -uroot -p"$MYSQL_ROOT_PASSWORD" < /var/log/test-script.log diff --git a/k8s/deployments/app-deployment.yaml b/k8s/deployments/app-deployment.yaml index 2d10f13..54bb6f7 100644 --- a/k8s/deployments/app-deployment.yaml +++ b/k8s/deployments/app-deployment.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: python-proxy-container - image: henrose/python-proxy-app:latest + image: lukasmccain0/python-proxy-app:latest imagePullPolicy: Always securityContext: privileged: true diff --git a/k8s/statefulsets/mysql-proxy-deployment.yaml b/k8s/statefulsets/mysql-proxy-deployment.yaml index 29eeccb..da8ff45 100644 --- a/k8s/statefulsets/mysql-proxy-deployment.yaml +++ b/k8s/statefulsets/mysql-proxy-deployment.yaml @@ -23,7 +23,7 @@ spec: spec: containers: - name: mysql - image: lukasmccain0/mysql:8.0 + image: lukasmccain0/mysql-primary:8.0 ports: - containerPort: 3306 env: @@ -33,8 +33,11 @@ spec: value: "capstone_db" - name: MYSQL_ROOT_HOST value: "%" - - name: MYSQL_EXTRA_FLAGS - value: "--gtid-mode=ON --enforce-gtid-consistency=ON --log-bin=mysql-bin" + args: + - "--gtid-mode=ON" + - "--enforce-gtid-consistency=ON" + - "--log-bin=mysql-bin" + - "--server-id=1" volumeMounts: - name: mysql-data mountPath: /var/lib/mysql @@ -73,7 +76,7 @@ spec: spec: containers: - name: mysql - image: lukasmccain0/mysql:8.0 + image: lukasmccain0/mysql-replica:8.0 ports: - containerPort: 3306 env: @@ -83,8 +86,11 @@ spec: value: "capstone_db" - name: MYSQL_ROOT_HOST value: "%" - - name: MYSQL_EXTRA_FLAGS - value: "--gtid-mode=ON --enforce-gtid-consistency=ON --log-bin=mysql-bin" + args: + - "--gtid-mode=ON" + - "--enforce-gtid-consistency=ON" + - "--log-bin=mysql-bin" + - "--server-id=2" volumeMounts: - name: mysql-data mountPath: /var/lib/mysql From 5e806d5e0e2038e3a1a409efe944bdfb59346f50 Mon Sep 17 00:00:00 2001 From: hen-ro Date: Mon, 19 May 2025 11:08:34 -0400 Subject: [PATCH 03/10] Minor update to README --- README.md | 24 ++++++++++++++++-------- 1 file changed, 16 
insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1f12db9..d0e224c 100644 --- a/README.md +++ b/README.md @@ -187,18 +187,26 @@ In your terminal... 9. To view a collection, type: `db..find().pretty()` 10. At this point you can enter your query +### Examples +```sh +``` + ## Querying MySQL Directly ### *Make sure your kubernetes cluster is up and running* In your terminal... -3. Type in this command: `kubectl exec -it mysql-summary-records-0 -- sh` -4. Log into mongosh using this command: `mysql -u root -p` -5. When prompted, type in the password: `root` -6. From there, you can see databases with: `show databases;` -7. Type: `use summary_db;` -8. From there, you can see tables with: `show tables;` -9. To view a table, type: `select * from
;` -10. At this point you can enter your query +1. Type in this command: `kubectl exec -it mysql-summary-records-0 -- sh` +2. Log into MySQL using this command: `mysql -u root -p` +3. When prompted, type in the password: `root` +4. From there, you can see databases with: `show databases;` +5. Type: `use summary_db;` +6. From there, you can see tables with: `show tables;` +7. To view a table, type: `select * from 
;` +8. At this point you can enter your query + +### Examples +```sh +``` # Credits Designed and built by Lucas Baker, Rachel Cox, Henry Hewitt, and Lukas McCain for the the February 2025 cohort of the PNC/TEKsystems Early Career SRE bootcamp. From b10fab72d4ec98279a31d9e43dff8e3dab56f9cd Mon Sep 17 00:00:00 2001 From: Lukas McCain Date: Mon, 19 May 2025 17:40:09 -0400 Subject: [PATCH 04/10] Made it so the primary adds the replica user automatically when the application runs --- docker/app/mysql-proxy-server.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docker/app/mysql-proxy-server.py b/docker/app/mysql-proxy-server.py index 6ea793a..207baa6 100644 --- a/docker/app/mysql-proxy-server.py +++ b/docker/app/mysql-proxy-server.py @@ -291,7 +291,24 @@ def configure_as_replica(replica_host, master_host): print(f"Failed to configure {replica_host} as replica: {e}") #***************************************************************************************************************************************************** +def ensure_replica_user(): + try: + conn = mysql.connector.connect( + host=PRIMARY_HOST, + user="root", + password="admin" + ) + cursor = conn.cursor() + cursor.execute("CREATE USER IF NOT EXISTS 'replica_user'@'%' IDENTIFIED BY 'replica_password';") + cursor.execute("GRANT REPLICATION SLAVE ON *.* TO 'replica_user'@'%';") + cursor.execute("FLUSH PRIVILEGES;") + conn.close() + print("Ensured replica_user setup on primary.") + except Error as e: + print(f"Failed to ensure replica_user: {e}") + if __name__ == "__main__": + ensure_replica_user() threading.Thread(target=monitor_and_failover, daemon=True).start() start_proxy() From e0028509c9e0a336301812a91879dd6003196d3d Mon Sep 17 00:00:00 2001 From: Lukas McCain Date: Mon, 19 May 2025 19:55:06 -0400 Subject: [PATCH 05/10] made a script to automatically populate the primary database (which should automatically populate the replica) --- docker/app/mysql-proxy-server.py | 45 
++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/docker/app/mysql-proxy-server.py b/docker/app/mysql-proxy-server.py index 207baa6..51f8f54 100644 --- a/docker/app/mysql-proxy-server.py +++ b/docker/app/mysql-proxy-server.py @@ -307,8 +307,53 @@ def ensure_replica_user(): except Error as e: print(f"Failed to ensure replica_user: {e}") +#***************************************************************************************************************************************************** +def seed_bank_transactions(): + try: + conn = mysql.connector.connect(**DB_CONFIG) + cursor = conn.cursor() + + # Create table if it doesn't exist + cursor.execute(""" + CREATE TABLE IF NOT EXISTS bank_transactions ( + id INT AUTO_INCREMENT PRIMARY KEY, + account_number VARCHAR(20), + transaction_type ENUM('DEPOSIT', 'WITHDRAWAL', 'TRANSFER'), + amount DECIMAL(10,2), + transaction_date DATETIME, + description VARCHAR(255) + ); + """) + + # Check if records already exist + cursor.execute("SELECT COUNT(*) FROM bank_transactions;") + count = cursor.fetchone()[0] + + if count == 0: + # Insert sample data + cursor.executemany(""" + INSERT INTO bank_transactions (account_number, transaction_type, amount, transaction_date, description) + VALUES (%s, %s, %s, %s, %s) + """, [ + ('1234567890', 'DEPOSIT', 1000.00, '2025-05-19 09:15:00', 'Initial deposit'), + ('1234567890', 'WITHDRAWAL', 200.00, '2025-05-20 14:30:00', 'ATM withdrawal'), + ('9876543210', 'TRANSFER', 500.00, '2025-05-21 11:00:00', 'Transfer to savings'), + ('5555555555', 'DEPOSIT', 750.50, '2025-05-22 16:45:00', 'Paycheck'), + ('1234567890', 'WITHDRAWAL', 120.75, '2025-05-23 08:05:00', 'Online purchase') + ]) + conn.commit() + print("Sample bank transactions inserted.") + else: + print("Bank transactions already seeded. 
Skipping.") + + conn.close() + except Error as e: + print(f"Error seeding bank transactions: {e}") + +#***************************************************************************************************************************************************** if __name__ == "__main__": ensure_replica_user() + seed_bank_transactions() threading.Thread(target=monitor_and_failover, daemon=True).start() start_proxy() From 366a0e96a722a18dba5dedf1da94cfd904d92e57 Mon Sep 17 00:00:00 2001 From: Lukas McCain Date: Mon, 19 May 2025 21:43:32 -0400 Subject: [PATCH 06/10] fixed a bug where when the primary goes --- docker/app/mysql-proxy-server.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker/app/mysql-proxy-server.py b/docker/app/mysql-proxy-server.py index 51f8f54..fda570c 100644 --- a/docker/app/mysql-proxy-server.py +++ b/docker/app/mysql-proxy-server.py @@ -50,6 +50,13 @@ def init_kafka_producer(retries=5, delay=5): #***************************************************************************************************************************************************** def connect_to_database(host): + try: + # Resolve hostname first to catch DNS issues + socket.gethostbyname(host) + except socket.gaierror as e: + print(f"DNS resolution failed for {host}: {e}") + return None + try: conn = mysql.connector.connect(host=host, **DB_CONFIG) if conn.is_connected(): From 0dd50d8f1879c2c186bebac98d66992b34a64dba Mon Sep 17 00:00:00 2001 From: Lucas Baker Date: Tue, 20 May 2025 12:50:06 -0400 Subject: [PATCH 07/10] added 2, removed 1 panel from grafana --- custom-exporter/exporter.py | 15 + grafana/dashboards/capstone-dashboard.json | 447 ++++++++++-------- k8s/configmaps/capstone-dashboard-json.yaml | 194 ++++---- .../custom-exporter-deployment.yaml | 2 +- 4 files changed, 371 insertions(+), 287 deletions(-) diff --git a/custom-exporter/exporter.py b/custom-exporter/exporter.py index bd696d2..96880f3 100644 --- a/custom-exporter/exporter.py +++ b/custom-exporter/exporter.py 
@@ -158,6 +158,21 @@ def metrics(): count = proxy_collection.count_documents({"event": event}) lines.append(f'proxy_log_errors_total{{event="{event}"}} {count}') + # FAILOVER EVENTS + failover_count = proxy_collection.count_documents({"event": "failover"}) + lines.append(f'proxy_failover_events_total {failover_count}') + + # WHICH DATABASE IS BEING USED + latest_up_event = proxy_collection.find_one( + {"event": "up"}, + sort=[("timestamp", -1)] + ) + + if latest_up_event and latest_up_event.get("db_target") == "mysql-replica": + lines.append("mysql_replica_in_use 1") + else: + lines.append("mysql_replica_in_use 0") + except Exception as e: lines.append(f'# MongoDB error: {str(e)}') diff --git a/grafana/dashboards/capstone-dashboard.json b/grafana/dashboards/capstone-dashboard.json index b8cc9f5..92c49ca 100644 --- a/grafana/dashboards/capstone-dashboard.json +++ b/grafana/dashboards/capstone-dashboard.json @@ -326,11 +326,25 @@ "fieldConfig": { "defaults": { "color": { - "fixedColor": "green", - "mode": "fixed" + "mode": "thresholds" }, - "mappings": [], - "noValue": "No Chaos Event Yet", + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 1, + "text": "MySql Primary" + }, + "1": { + "color": "yellow", + "index": 0, + "text": "MySql Replica" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ @@ -339,11 +353,7 @@ }, { "color": "red", - "value": "" - }, - { - "color": "green", - "value": "" + "value": 80 } ] } @@ -356,7 +366,7 @@ "x": 12, "y": 8 }, - "id": 8, + "id": 14, "options": { "colorMode": "value", "graphMode": "area", @@ -378,14 +388,14 @@ "targets": [ { "editorMode": "code", - "expr": "seconds_since_last_chaos_event\r\n", + "expr": "mysql_replica_in_use", "interval": "5", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Seconds Since Last Chaos Event", + "title": "Which Database Is Currently In Use?", "type": "stat" }, { @@ -396,18 +406,22 @@ "fieldConfig": { "defaults": { "color": 
{ - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } + "mode": "thresholds" }, "mappings": [], - "noValue": "No Chaos Event Yet", - "unit": "short" + "noValue": "No Failovers Yet", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } }, "overrides": [] }, @@ -417,14 +431,13 @@ "x": 0, "y": 16 }, - "id": 9, + "id": 13, "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "pieType": "pie", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" @@ -432,25 +445,23 @@ "fields": "", "values": false }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, "pluginVersion": "12.0.0", "targets": [ { "editorMode": "code", - "expr": "chaos_events_total_by_type", + "expr": "proxy_failover_events_total", "interval": "5", - "legendFormat": "{{chaos_type}}", + "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Chaos Events By Type", - "type": "piechart" + "title": "Total Number of Failovers", + "type": "stat" }, { "datasource": { @@ -462,8 +473,23 @@ "color": { "mode": "thresholds" }, - "mappings": [], - "noValue": "No Chaos Event Yet", + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "Down" + }, + "1": { + "color": "green", + "index": 0, + "text": "Up" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ @@ -477,44 +503,153 @@ ] } }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "end" - }, - "properties": [ + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "editorMode": "code", + "expr": "mysql_primary_up", + "interval": "5", + "legendFormat": "{{}}", + "range": true, + "refId": "A" + } + ], + "title": "MySql Primary Pod Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "mappings": [], + "noValue": "No Chaos Event Yet", + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "color": "green" + }, + { + "color": "red", + "value": "" + }, + { + "color": "green", + "value": "" } ] } - ] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.0", + "targets": [ + { + "editorMode": "code", + "expr": "seconds_since_last_chaos_event\r\n", + "interval": "5", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Seconds Since Last Chaos Event", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "noValue": "No Chaos Event Yet", 
+ "unit": "short" + }, + "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 24 }, - "id": 7, + "id": 9, "options": { - "displayMode": "gradient", "legend": { - "calcs": [], "displayMode": "list", "placement": "bottom", - "showLegend": false + "showLegend": true }, - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "auto", + "pieType": "pie", "reduceOptions": { "calcs": [ "lastNotNull" @@ -522,23 +657,25 @@ "fields": "", "values": false }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color" + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } }, "pluginVersion": "12.0.0", "targets": [ { "editorMode": "code", - "expr": "chaos_event_count", + "expr": "chaos_events_total_by_type", "interval": "5", - "legendFormat": "{{event_type}}", + "legendFormat": "{{chaos_type}}", "range": true, "refId": "A" } ], - "title": "Chaos Events By Status", - "type": "bargauge" + "title": "Chaos Events By Type", + "type": "piechart" }, { "datasource": { @@ -574,7 +711,7 @@ "h": 8, "w": 12, "x": 0, - "y": 24 + "y": 32 }, "id": 5, "options": { @@ -618,23 +755,8 @@ "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { - "color": "red", - "index": 1, - "text": "Down" - }, - "1": { - "color": "green", - "index": 0, - "text": "Up" - } - }, - "type": "value" - } - ], + "mappings": [], + "noValue": "No Chaos Event Yet", "thresholds": { "mode": "absolute", "steps": [ @@ -648,21 +770,44 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "end" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 32 }, - "id": 12, + "id": 7, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", 
+ "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", "orientation": "auto", - "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" @@ -670,23 +815,23 @@ "fields": "", "values": false }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" }, "pluginVersion": "12.0.0", "targets": [ { "editorMode": "code", - "expr": "mysql_primary_up", + "expr": "chaos_event_count", "interval": "5", - "legendFormat": "{{}}", + "legendFormat": "{{event_type}}", "range": true, "refId": "A" } ], - "title": "MySql Primary Pod Status", - "type": "stat" + "title": "Chaos Events By Status", + "type": "bargauge" }, { "datasource": { @@ -754,7 +899,7 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 40 }, "id": 3, "options": { @@ -811,7 +956,7 @@ "h": 8, "w": 12, "x": 12, - "y": 32 + "y": 40 }, "id": 11, "options": { @@ -910,8 +1055,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 40 + "x": 12, + "y": 48 }, "id": 4, "options": { @@ -940,96 +1085,6 @@ ], "title": "Memory Absolute Usage", "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - 
"mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 40 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "12.0.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(proxy_log_errors_total{event=~\"recv_failed|broken_pipe\"}[5m])\r\n", - "interval": "5", - "legendFormat": "{{event}}", - "range": true, - "refId": "A" - } - ], - "title": "Rate of Proxy Errors", - "type": "timeseries" } ], "preload": false, diff --git a/k8s/configmaps/capstone-dashboard-json.yaml b/k8s/configmaps/capstone-dashboard-json.yaml index 8069ede..eacb024 100644 --- a/k8s/configmaps/capstone-dashboard-json.yaml +++ b/k8s/configmaps/capstone-dashboard-json.yaml @@ -107,6 +107,77 @@ data: \ ],\r\n \"title\": \"Chaos Experiment Running?\",\r\n \"type\": \"stat\"\r\n },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n + \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"thresholds\"\r\n + \ },\r\n \"mappings\": [\r\n {\r\n \"options\": + {\r\n \"0\": {\r\n \"color\": \"green\",\r\n \"index\": + 1,\r\n \"text\": \"MySql Primary\"\r\n },\r\n + \ \"1\": {\r\n \"color\": \"yellow\",\r\n \"index\": + 0,\r\n \"text\": \"MySql Replica\"\r\n }\r\n },\r\n + \ \"type\": \"value\"\r\n }\r\n ],\r\n \"thresholds\": + {\r\n \"mode\": \"absolute\",\r\n \"steps\": [\r\n {\r\n + \ \"color\": \"green\"\r\n },\r\n {\r\n + \ \"color\": \"red\",\r\n \"value\": 80\r\n }\r\n + \ ]\r\n }\r\n },\r\n \"overrides\": []\r\n },\r\n + \ \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": + 12,\r\n \"y\": 8\r\n },\r\n \"id\": 14,\r\n \"options\": + {\r\n \"colorMode\": \"value\",\r\n \"graphMode\": 
\"area\",\r\n + \ \"justifyMode\": \"auto\",\r\n \"orientation\": \"auto\",\r\n \"percentChangeColorMode\": + \"standard\",\r\n \"reduceOptions\": {\r\n \"calcs\": [\r\n \"lastNotNull\"\r\n + \ ],\r\n \"fields\": \"\",\r\n \"values\": false\r\n + \ },\r\n \"showPercentChange\": false,\r\n \"textMode\": \"auto\",\r\n + \ \"wideLayout\": true\r\n },\r\n \"pluginVersion\": \"12.0.0\",\r\n + \ \"targets\": [\r\n {\r\n \"editorMode\": \"code\",\r\n \"expr\": + \"mysql_replica_in_use\",\r\n \"interval\": \"5\",\r\n \"legendFormat\": + \"__auto\",\r\n \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n + \ ],\r\n \"title\": \"Which Database Is Currently In Use?\",\r\n \"type\": + \"stat\"\r\n },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n + \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n + \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"thresholds\"\r\n + \ },\r\n \"mappings\": [],\r\n \"noValue\": \"No Failovers + Yet\",\r\n \"thresholds\": {\r\n \"mode\": \"absolute\",\r\n + \ \"steps\": [\r\n {\r\n \"color\": \"green\"\r\n + \ },\r\n {\r\n \"color\": \"red\",\r\n + \ \"value\": 80\r\n }\r\n ]\r\n }\r\n + \ },\r\n \"overrides\": []\r\n },\r\n \"gridPos\": {\r\n + \ \"h\": 8,\r\n \"w\": 12,\r\n \"x\": 0,\r\n \"y\": + 16\r\n },\r\n \"id\": 13,\r\n \"options\": {\r\n \"colorMode\": + \"value\",\r\n \"graphMode\": \"area\",\r\n \"justifyMode\": \"auto\",\r\n + \ \"orientation\": \"auto\",\r\n \"percentChangeColorMode\": \"standard\",\r\n + \ \"reduceOptions\": {\r\n \"calcs\": [\r\n \"lastNotNull\"\r\n + \ ],\r\n \"fields\": \"\",\r\n \"values\": false\r\n + \ },\r\n \"showPercentChange\": false,\r\n \"textMode\": \"auto\",\r\n + \ \"wideLayout\": true\r\n },\r\n \"pluginVersion\": \"12.0.0\",\r\n + \ \"targets\": [\r\n {\r\n \"editorMode\": \"code\",\r\n \"expr\": + \"proxy_failover_events_total\",\r\n \"interval\": \"5\",\r\n \"legendFormat\": + \"__auto\",\r\n \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n + \ ],\r\n \"title\": 
\"Total Number of Failovers\",\r\n \"type\": + \"stat\"\r\n },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n + \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n + \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"thresholds\"\r\n + \ },\r\n \"mappings\": [\r\n {\r\n \"options\": + {\r\n \"0\": {\r\n \"color\": \"red\",\r\n \"index\": + 1,\r\n \"text\": \"Down\"\r\n },\r\n \"1\": + {\r\n \"color\": \"green\",\r\n \"index\": 0,\r\n + \ \"text\": \"Up\"\r\n }\r\n },\r\n + \ \"type\": \"value\"\r\n }\r\n ],\r\n \"thresholds\": + {\r\n \"mode\": \"absolute\",\r\n \"steps\": [\r\n {\r\n + \ \"color\": \"green\"\r\n },\r\n {\r\n + \ \"color\": \"red\",\r\n \"value\": 80\r\n }\r\n + \ ]\r\n }\r\n },\r\n \"overrides\": []\r\n },\r\n + \ \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": + 12,\r\n \"y\": 16\r\n },\r\n \"id\": 12,\r\n \"options\": + {\r\n \"colorMode\": \"value\",\r\n \"graphMode\": \"area\",\r\n + \ \"justifyMode\": \"auto\",\r\n \"orientation\": \"auto\",\r\n \"percentChangeColorMode\": + \"standard\",\r\n \"reduceOptions\": {\r\n \"calcs\": [\r\n \"lastNotNull\"\r\n + \ ],\r\n \"fields\": \"\",\r\n \"values\": false\r\n + \ },\r\n \"showPercentChange\": false,\r\n \"textMode\": \"auto\",\r\n + \ \"wideLayout\": true\r\n },\r\n \"pluginVersion\": \"12.0.0\",\r\n + \ \"targets\": [\r\n {\r\n \"editorMode\": \"code\",\r\n \"expr\": + \"mysql_primary_up\",\r\n \"interval\": \"5\",\r\n \"legendFormat\": + \"{{}}\",\r\n \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n + \ ],\r\n \"title\": \"MySql Primary Pod Status\",\r\n \"type\": + \"stat\"\r\n },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n + \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n \ \"defaults\": {\r\n \"color\": {\r\n \"fixedColor\": \"green\",\r\n \"mode\": \"fixed\"\r\n },\r\n \"mappings\": [],\r\n \"noValue\": \"No Chaos Event Yet\",\r\n \"thresholds\": @@ -116,7 +187,7 @@ data: \ {\r\n \"color\": \"green\",\r\n \"value\": 
\"\"\r\n }\r\n ]\r\n }\r\n },\r\n \"overrides\": []\r\n },\r\n \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": - 12,\r\n \"x\": 12,\r\n \"y\": 8\r\n },\r\n \"id\": 8,\r\n + 12,\r\n \"x\": 0,\r\n \"y\": 24\r\n },\r\n \"id\": 8,\r\n \ \"options\": {\r\n \"colorMode\": \"value\",\r\n \"graphMode\": \"area\",\r\n \"justifyMode\": \"auto\",\r\n \"orientation\": \"auto\",\r\n \ \"percentChangeColorMode\": \"standard\",\r\n \"reduceOptions\": @@ -136,7 +207,7 @@ data: \ }\r\n },\r\n \"mappings\": [],\r\n \"noValue\": \"No Chaos Event Yet\",\r\n \"unit\": \"short\"\r\n },\r\n \"overrides\": []\r\n },\r\n \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": - 12,\r\n \"x\": 0,\r\n \"y\": 16\r\n },\r\n \"id\": 9,\r\n + 12,\r\n \"x\": 12,\r\n \"y\": 24\r\n },\r\n \"id\": 9,\r\n \ \"options\": {\r\n \"legend\": {\r\n \"displayMode\": \"list\",\r\n \ \"placement\": \"bottom\",\r\n \"showLegend\": true\r\n },\r\n \ \"pieType\": \"pie\",\r\n \"reduceOptions\": {\r\n \"calcs\": @@ -151,35 +222,6 @@ data: \ },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"thresholds\"\r\n - \ },\r\n \"mappings\": [],\r\n \"noValue\": \"No Chaos - Event Yet\",\r\n \"thresholds\": {\r\n \"mode\": \"absolute\",\r\n - \ \"steps\": [\r\n {\r\n \"color\": \"green\"\r\n - \ },\r\n {\r\n \"color\": \"red\",\r\n - \ \"value\": 80\r\n }\r\n ]\r\n }\r\n - \ },\r\n \"overrides\": [\r\n {\r\n \"matcher\": - {\r\n \"id\": \"byName\",\r\n \"options\": \"end\"\r\n - \ },\r\n \"properties\": [\r\n {\r\n \"id\": - \"color\",\r\n \"value\": {\r\n \"fixedColor\": - \"green\",\r\n \"mode\": \"fixed\"\r\n }\r\n }\r\n - \ ]\r\n }\r\n ]\r\n },\r\n \"gridPos\": {\r\n - \ \"h\": 8,\r\n \"w\": 12,\r\n \"x\": 12,\r\n \"y\": - 16\r\n },\r\n \"id\": 7,\r\n \"options\": {\r\n \"displayMode\": - \"gradient\",\r\n \"legend\": {\r\n \"calcs\": [],\r\n \"displayMode\": - \"list\",\r\n \"placement\": \"bottom\",\r\n 
\"showLegend\": - false\r\n },\r\n \"maxVizHeight\": 300,\r\n \"minVizHeight\": - 16,\r\n \"minVizWidth\": 8,\r\n \"namePlacement\": \"auto\",\r\n - \ \"orientation\": \"auto\",\r\n \"reduceOptions\": {\r\n \"calcs\": - [\r\n \"lastNotNull\"\r\n ],\r\n \"fields\": \"\",\r\n - \ \"values\": false\r\n },\r\n \"showUnfilled\": true,\r\n - \ \"sizing\": \"auto\",\r\n \"valueMode\": \"color\"\r\n },\r\n - \ \"pluginVersion\": \"12.0.0\",\r\n \"targets\": [\r\n {\r\n - \ \"editorMode\": \"code\",\r\n \"expr\": \"chaos_event_count\",\r\n - \ \"interval\": \"5\",\r\n \"legendFormat\": \"{{event_type}}\",\r\n - \ \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n ],\r\n - \ \"title\": \"Chaos Events By Status\",\r\n \"type\": \"bargauge\"\r\n - \ },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n - \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n - \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"thresholds\"\r\n \ },\r\n \"mappings\": [],\r\n \"thresholds\": {\r\n \ \"mode\": \"absolute\",\r\n \"steps\": [\r\n {\r\n \ \"color\": \"green\"\r\n },\r\n {\r\n @@ -187,7 +229,7 @@ data: \ {\r\n \"color\": \"green\",\r\n \"value\": \"\"\r\n }\r\n ]\r\n }\r\n },\r\n \"overrides\": []\r\n },\r\n \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": - 12,\r\n \"x\": 0,\r\n \"y\": 24\r\n },\r\n \"id\": 5,\r\n + 12,\r\n \"x\": 0,\r\n \"y\": 32\r\n },\r\n \"id\": 5,\r\n \ \"options\": {\r\n \"colorMode\": \"background\",\r\n \"graphMode\": \"area\",\r\n \"justifyMode\": \"auto\",\r\n \"orientation\": \"auto\",\r\n \ \"percentChangeColorMode\": \"standard\",\r\n \"reduceOptions\": @@ -202,29 +244,32 @@ data: \ {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"thresholds\"\r\n },\r\n - \ \"mappings\": [\r\n {\r\n \"options\": {\r\n - \ \"0\": {\r\n \"color\": \"red\",\r\n \"index\": - 1,\r\n \"text\": \"Down\"\r\n },\r\n \"1\": - {\r\n \"color\": 
\"green\",\r\n \"index\": 0,\r\n - \ \"text\": \"Up\"\r\n }\r\n },\r\n - \ \"type\": \"value\"\r\n }\r\n ],\r\n \"thresholds\": - {\r\n \"mode\": \"absolute\",\r\n \"steps\": [\r\n {\r\n - \ \"color\": \"green\"\r\n },\r\n {\r\n - \ \"color\": \"red\",\r\n \"value\": 80\r\n }\r\n - \ ]\r\n }\r\n },\r\n \"overrides\": []\r\n },\r\n - \ \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": - 12,\r\n \"y\": 24\r\n },\r\n \"id\": 12,\r\n \"options\": - {\r\n \"colorMode\": \"value\",\r\n \"graphMode\": \"area\",\r\n - \ \"justifyMode\": \"auto\",\r\n \"orientation\": \"auto\",\r\n \"percentChangeColorMode\": - \"standard\",\r\n \"reduceOptions\": {\r\n \"calcs\": [\r\n \"lastNotNull\"\r\n - \ ],\r\n \"fields\": \"\",\r\n \"values\": false\r\n - \ },\r\n \"showPercentChange\": false,\r\n \"textMode\": \"auto\",\r\n - \ \"wideLayout\": true\r\n },\r\n \"pluginVersion\": \"12.0.0\",\r\n - \ \"targets\": [\r\n {\r\n \"editorMode\": \"code\",\r\n \"expr\": - \"mysql_primary_up\",\r\n \"interval\": \"5\",\r\n \"legendFormat\": - \"{{}}\",\r\n \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n - \ ],\r\n \"title\": \"MySql Primary Pod Status\",\r\n \"type\": - \"stat\"\r\n },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n + \ \"mappings\": [],\r\n \"noValue\": \"No Chaos Event Yet\",\r\n + \ \"thresholds\": {\r\n \"mode\": \"absolute\",\r\n \"steps\": + [\r\n {\r\n \"color\": \"green\"\r\n },\r\n + \ {\r\n \"color\": \"red\",\r\n \"value\": + 80\r\n }\r\n ]\r\n }\r\n },\r\n \"overrides\": + [\r\n {\r\n \"matcher\": {\r\n \"id\": \"byName\",\r\n + \ \"options\": \"end\"\r\n },\r\n \"properties\": + [\r\n {\r\n \"id\": \"color\",\r\n \"value\": + {\r\n \"fixedColor\": \"green\",\r\n \"mode\": + \"fixed\"\r\n }\r\n }\r\n ]\r\n }\r\n + \ ]\r\n },\r\n \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": + 12,\r\n \"x\": 12,\r\n \"y\": 32\r\n },\r\n \"id\": 7,\r\n + \ \"options\": {\r\n \"displayMode\": \"gradient\",\r\n \"legend\": + {\r\n \"calcs\": [],\r\n \"displayMode\": 
\"list\",\r\n \"placement\": + \"bottom\",\r\n \"showLegend\": false\r\n },\r\n \"maxVizHeight\": + 300,\r\n \"minVizHeight\": 16,\r\n \"minVizWidth\": 8,\r\n \"namePlacement\": + \"auto\",\r\n \"orientation\": \"auto\",\r\n \"reduceOptions\": + {\r\n \"calcs\": [\r\n \"lastNotNull\"\r\n ],\r\n + \ \"fields\": \"\",\r\n \"values\": false\r\n },\r\n \"showUnfilled\": + true,\r\n \"sizing\": \"auto\",\r\n \"valueMode\": \"color\"\r\n + \ },\r\n \"pluginVersion\": \"12.0.0\",\r\n \"targets\": [\r\n {\r\n + \ \"editorMode\": \"code\",\r\n \"expr\": \"chaos_event_count\",\r\n + \ \"interval\": \"5\",\r\n \"legendFormat\": \"{{event_type}}\",\r\n + \ \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n ],\r\n + \ \"title\": \"Chaos Events By Status\",\r\n \"type\": \"bargauge\"\r\n + \ },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"palette-classic\"\r\n \ },\r\n \"custom\": {\r\n \"axisBorderShow\": false,\r\n @@ -247,7 +292,7 @@ data: 80\r\n }\r\n ]\r\n },\r\n \"unit\": \"Cores\"\r\n },\r\n \"overrides\": []\r\n },\r\n \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": 0,\r\n \"y\": - 32\r\n },\r\n \"id\": 3,\r\n \"options\": {\r\n \"legend\": + 40\r\n },\r\n \"id\": 3,\r\n \"options\": {\r\n \"legend\": {\r\n \"calcs\": [],\r\n \"displayMode\": \"list\",\r\n \"placement\": \"bottom\",\r\n \"showLegend\": true\r\n },\r\n \"tooltip\": {\r\n \"hideZeros\": false,\r\n \"mode\": \"single\",\r\n \"sort\": @@ -264,7 +309,7 @@ data: \ \"color\": \"green\"\r\n }\r\n ]\r\n },\r\n \ \"unit\": \"short\"\r\n },\r\n \"overrides\": []\r\n },\r\n \ \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": - 12,\r\n \"y\": 32\r\n },\r\n \"id\": 11,\r\n \"options\": + 12,\r\n \"y\": 40\r\n },\r\n \"id\": 11,\r\n \"options\": {\r\n \"colorMode\": \"value\",\r\n \"graphMode\": \"area\",\r\n \ \"justifyMode\": \"auto\",\r\n \"orientation\": \"auto\",\r\n 
\"percentChangeColorMode\": \"standard\",\r\n \"reduceOptions\": {\r\n \"calcs\": [\r\n \"lastNotNull\"\r\n @@ -298,8 +343,8 @@ data: \ {\r\n \"color\": \"red\",\r\n \"value\": 80\r\n }\r\n ]\r\n },\r\n \"unit\": \"decmbytes\"\r\n },\r\n \"overrides\": []\r\n },\r\n \"gridPos\": - {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": 0,\r\n \"y\": - 40\r\n },\r\n \"id\": 4,\r\n \"options\": {\r\n \"legend\": + {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": 12,\r\n \"y\": + 48\r\n },\r\n \"id\": 4,\r\n \"options\": {\r\n \"legend\": {\r\n \"calcs\": [],\r\n \"displayMode\": \"list\",\r\n \"placement\": \"bottom\",\r\n \"showLegend\": true\r\n },\r\n \"tooltip\": {\r\n \"hideZeros\": false,\r\n \"mode\": \"single\",\r\n \"sort\": @@ -308,37 +353,6 @@ data: \"infra_mem_usage_absolute\",\r\n \"interval\": \"5\",\r\n \"legendFormat\": \"{{pod}}\",\r\n \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n \ ],\r\n \"title\": \"Memory Absolute Usage\",\r\n \"type\": \"timeseries\"\r\n - \ },\r\n {\r\n \"datasource\": {\r\n \"type\": \"prometheus\",\r\n - \ \"uid\": \"PBFA97CFB590B2093\"\r\n },\r\n \"fieldConfig\": {\r\n - \ \"defaults\": {\r\n \"color\": {\r\n \"mode\": \"palette-classic\"\r\n - \ },\r\n \"custom\": {\r\n \"axisBorderShow\": false,\r\n - \ \"axisCenteredZero\": false,\r\n \"axisColorMode\": \"text\",\r\n - \ \"axisLabel\": \"\",\r\n \"axisPlacement\": \"auto\",\r\n - \ \"barAlignment\": 0,\r\n \"barWidthFactor\": 0.6,\r\n \"drawStyle\": - \"line\",\r\n \"fillOpacity\": 0,\r\n \"gradientMode\": - \"none\",\r\n \"hideFrom\": {\r\n \"legend\": false,\r\n - \ \"tooltip\": false,\r\n \"viz\": false\r\n },\r\n - \ \"insertNulls\": false,\r\n \"lineInterpolation\": \"linear\",\r\n - \ \"lineWidth\": 1,\r\n \"pointSize\": 5,\r\n \"scaleDistribution\": - {\r\n \"type\": \"linear\"\r\n },\r\n \"showPoints\": - \"auto\",\r\n \"spanNulls\": false,\r\n \"stacking\": {\r\n - \ \"group\": \"A\",\r\n \"mode\": \"none\"\r\n },\r\n - \ \"thresholdsStyle\": {\r\n \"mode\": \"off\"\r\n }\r\n - \ 
},\r\n \"mappings\": [],\r\n \"thresholds\": {\r\n - \ \"mode\": \"absolute\",\r\n \"steps\": [\r\n {\r\n - \ \"color\": \"green\"\r\n }\r\n ]\r\n },\r\n - \ \"unit\": \"short\"\r\n },\r\n \"overrides\": []\r\n },\r\n - \ \"gridPos\": {\r\n \"h\": 8,\r\n \"w\": 12,\r\n \"x\": - 12,\r\n \"y\": 40\r\n },\r\n \"id\": 10,\r\n \"options\": - {\r\n \"legend\": {\r\n \"calcs\": [],\r\n \"displayMode\": - \"list\",\r\n \"placement\": \"bottom\",\r\n \"showLegend\": - true\r\n },\r\n \"tooltip\": {\r\n \"hideZeros\": false,\r\n - \ \"mode\": \"single\",\r\n \"sort\": \"none\"\r\n }\r\n - \ },\r\n \"pluginVersion\": \"12.0.0\",\r\n \"targets\": [\r\n {\r\n - \ \"editorMode\": \"code\",\r\n \"expr\": \"rate(proxy_log_errors_total{event=~\\\"recv_failed|broken_pipe\\\"}[5m])\\r\\n\",\r\n - \ \"interval\": \"5\",\r\n \"legendFormat\": \"{{event}}\",\r\n - \ \"range\": true,\r\n \"refId\": \"A\"\r\n }\r\n ],\r\n - \ \"title\": \"Rate of Proxy Errors\",\r\n \"type\": \"timeseries\"\r\n \ }\r\n ],\r\n \"preload\": false,\r\n \"refresh\": \"5s\",\r\n \"schemaVersion\": 41,\r\n \"tags\": [],\r\n \"templating\": {\r\n \"list\": []\r\n },\r\n \ \"time\": {\r\n \"from\": \"now-30m\",\r\n \"to\": \"now\"\r\n },\r\n diff --git a/k8s/deployments/custom-exporter-deployment.yaml b/k8s/deployments/custom-exporter-deployment.yaml index 22b538b..eae55b6 100644 --- a/k8s/deployments/custom-exporter-deployment.yaml +++ b/k8s/deployments/custom-exporter-deployment.yaml @@ -20,7 +20,7 @@ spec: spec: containers: - name: capstone-exporter - image: henrose/capstone-exporter:latest + image: lucasbaker905/capstone-exporter:latest imagePullPolicy: Always ports: - containerPort: 5000 From c625a5632fb7434597bfdd6edad576b5d6b39e71 Mon Sep 17 00:00:00 2001 From: "henryrosehewitt@gmail.com" Date: Tue, 20 May 2025 20:37:59 -0400 Subject: [PATCH 08/10] Minor edits --- python/chaos_experiments/resource_exhaust.py | 2 +- python/chaos_experiments/terminate_pod.py | 8 ++--- run_experiment.py | 33 
++++++++++---------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/python/chaos_experiments/resource_exhaust.py b/python/chaos_experiments/resource_exhaust.py index be7f349..25e534a 100644 --- a/python/chaos_experiments/resource_exhaust.py +++ b/python/chaos_experiments/resource_exhaust.py @@ -167,7 +167,7 @@ def memory_stress_in_pod(api_client, pod_info, container_names, intensity, durat "trap cleanup EXIT INT TERM; " #Calculate chunk size - use smaller chunks for better memory pressure - "CHUNK_SIZE_MB=32; " #Using 32MB chunks + "CHUNK_SIZE_MB=64; " #Using 64MB chunks f"NUM_CHUNKS=$(({memory_to_allocate_mb} / $CHUNK_SIZE_MB + 1)); " f"LAST_CHUNK_SIZE=$(({memory_to_allocate_mb} % $CHUNK_SIZE_MB)); " diff --git a/python/chaos_experiments/terminate_pod.py b/python/chaos_experiments/terminate_pod.py index 5e88b94..6f945c2 100644 --- a/python/chaos_experiments/terminate_pod.py +++ b/python/chaos_experiments/terminate_pod.py @@ -251,7 +251,7 @@ def main(): controller_name = deployment_name logger.info(f"Using provided deployment: {deployment_name}") else: - # Auto-detect controller only if deployment_name is None + #Auto-detect controller only if deployment_name is None controller_type, controller_name = get_pod_controller(api_client, pod_name, namespace) if controller_type and controller_name: logger.info(f"Auto-detected pod controller: {controller_type}/{controller_name}") @@ -290,7 +290,7 @@ def main(): apps_v1 = client.AppsV1Api(api_client) - # Scale down to 0 + #Scale down to 0 apps_v1.patch_namespaced_stateful_set( name="mysql-primary", namespace="default", @@ -298,7 +298,7 @@ def main(): ) logger.info("💥 Scaled mysql-primary to 0") - # Wait for specified duration + #Wait for specified duration logger.info(f"⏳ Waiting for {wait_duration} seconds before scaling up...") time.sleep(wait_duration) @@ -310,7 +310,7 @@ def main(): ) logger.info("🚀 Scaled mysql-primary back to 1") - # Optional: restart annotation + #Optional: restart annotation 
restart_successful = restart_controller(api_client, "StatefulSet", "mysql-primary", "default") except Exception as e: diff --git a/run_experiment.py b/run_experiment.py index 9b5d06d..93da035 100644 --- a/run_experiment.py +++ b/run_experiment.py @@ -182,13 +182,13 @@ def run_network_partition(pod_info): ] try: - print("\nExecuting experiment...") + print("\n⏳ Executing experiment...") subprocess.run(cmd, check=True) - print("\nExperiment completed successfully!") + print("\n✅ Experiment completed successfully!") except subprocess.CalledProcessError as e: - print(f"\nError running experiment: {e}") + print(f"\n⚠️ Error running experiment: {e}") except Exception as e: - print(f"\nUnexpected error: {e}") + print(f"\n⚠️ Unexpected error: {e}") input("\nPress Enter to continue...") @@ -269,13 +269,13 @@ def run_resource_exhaustion(pod_info): cmd.extend(["-mc", "-mi", memory_intensity]) try: - print("\nExecuting experiment...") + print("\n⏳ Executing experiment...") subprocess.run(cmd, check=True) - print("\nExperiment completed successfully!") + print("\n✅ Experiment completed successfully!") except subprocess.CalledProcessError as e: - print(f"\nError running experiment: {e}") + print(f"\n⚠️ Error running experiment: {e}") except Exception as e: - print(f"\nUnexpected error: {e}") + print(f"\n⚠️ Unexpected error: {e}") input("\nPress Enter to continue...") @@ -308,18 +308,17 @@ def run_pod_termination(pod_info): cmd.extend(["-d", pod_info['deployment']]) try: - print("\nExecuting experiment...") + print("\n⏳ Executing experiment...") subprocess.run(cmd, check=True) - print("\nExperiment completed successfully!") + print("\n✅ Experiment completed successfully!") except subprocess.CalledProcessError as e: - print(f"\nError running experiment: {e}") + print(f"\n⚠️ Error running experiment: {e}") except Exception as e: - print(f"\nUnexpected error: {e}") + print(f"\n⚠️ Unexpected error: {e}") input("\nPress Enter to continue...") def run_process_termination(pod_info): - 
print("\n" + title_separator + " PROCESS TERMINATION EXPERIMENT " + title_separator + "\n") print("Parameters (press Enter to skip):") #Get container ID prefix (optional) @@ -349,13 +348,13 @@ def run_process_termination(pod_info): cmd.extend(["-p", process_pattern]) try: - print("\nExecuting experiment...") + print("\n⏳ Executing experiment...") subprocess.run(cmd, check=True) - print("\nExperiment completed successfully!") + print("\n✅ Experiment completed successfully!") except subprocess.CalledProcessError as e: - print(f"\nError running experiment: {e}") + print(f"\n⚠️ Error running experiment: {e}") except Exception as e: - print(f"\nUnexpected error: {e}") + print(f"\n⚠️ Unexpected error: {e}") input("\nPress Enter to continue...") From 2107133b26996d2768d83db2fb2d93a18b4fa9ac Mon Sep 17 00:00:00 2001 From: rachelou13 Date: Tue, 20 May 2025 23:18:37 -0400 Subject: [PATCH 09/10] Update kafka-setup.md --- docs/kafka-setup.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/kafka-setup.md b/docs/kafka-setup.md index 01a1b9b..3acc5b4 100644 --- a/docs/kafka-setup.md +++ b/docs/kafka-setup.md @@ -4,7 +4,7 @@ Before deploying Kafka to the local Kubernetes cluster, make sure: -ƒ### 1. Minikube Is Running +### 1. Minikube Is Running ```bash minikube status @@ -21,7 +21,7 @@ kubectl create namespace staging --- -### ✅ Now, To Add It: +### Now, To Add It: 1. Run: ```bash @@ -31,7 +31,7 @@ kubectl create namespace staging ## Deploy Kafka in KRaft Mode -### Step 1: Apply Kafka Headless Service, Kafka NodePort Service +### Step 1: Apply Kafka Headless Service and Kafka NodePort Service ```bash kubectl apply -f k8s/services/kafka-headless.yaml @@ -123,7 +123,7 @@ By default, Kafka is configured to **automatically create topics** when a produc This can be convenient during development, but you may want to **disable it** in certain scenarios. 
-### 🔧 To Disable Auto Topic Creation +### To Disable Auto Topic Creation You can prevent Kafka from creating topics automatically by setting the following environment variable in your Kafka manifest (e.g., `kafka-statefulset.yaml`): From e40d913914b161f91b012565ec03449db0fb2e1d Mon Sep 17 00:00:00 2001 From: "henryrosehewitt@gmail.com" Date: Tue, 20 May 2025 20:37:59 -0400 Subject: [PATCH 10/10] Debugged network connection to mysql from proxy not detecting network partition --- apply-all.sh | 21 -------- apply_all.py | 65 ++++++++++++---------- delete-all.sh | 17 ------ delete_all.py | 22 ++++---- docker/app/mysql-proxy-server.py | 66 ++++++++++++++--------- docker/test-app/Dockerfile | 11 ---- docker/test-app/requirements.txt | 2 - docker/test-app/test-service.py | 7 --- k8s/deployments/app-deployment.yaml | 2 +- k8s/deployments/test-app-deployment.yaml | 18 ------- python/chaos_experiments/terminate_pod.py | 50 +++-------------- reset-topics.sh | 2 +- run_experiment.py | 12 ++--- 13 files changed, 104 insertions(+), 191 deletions(-) delete mode 100644 apply-all.sh delete mode 100644 delete-all.sh delete mode 100644 docker/test-app/Dockerfile delete mode 100644 docker/test-app/requirements.txt delete mode 100644 docker/test-app/test-service.py delete mode 100644 k8s/deployments/test-app-deployment.yaml diff --git a/apply-all.sh b/apply-all.sh deleted file mode 100644 index a0de142..0000000 --- a/apply-all.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# List of directories -dirs=( - "k8s/configmaps" - "k8s/secrets" - "k8s/rbac" - "k8s/services" - "k8s/statefulsets" - "k8s/deployments" -) - - -for dir in "${dirs[@]}"; do - echo "Applying resources in $dir" - kubectl apply -f "$dir" || echo "Failed to apply resources in $dir" -done - -# or in PS kubectl apply -f .\k8s\ --recursive -# kubectl delete -f .\k8s\ --recursive - diff --git a/apply_all.py b/apply_all.py index 909a48d..5007eca 100644 --- a/apply_all.py +++ b/apply_all.py @@ -16,7 +16,7 @@ def 
get_k8s_client(): config.load_kube_config() return client.CoreV1Api() except Exception as e: - print(f"Error initializing Kubernetes client: {e}") + print(f"⚠️ Error initializing Kubernetes client: {e}") print("Make sure you have the correct kubeconfig and kubernetes Python package installed.") return None @@ -39,7 +39,7 @@ def find_pod_by_name(name_pattern, k8s_client): return matching_pods except Exception as e: - print(f"Error finding pods: {e}") + print(f"⚠️ Error finding pods: {e}") return [] def select_pod(k8s_client): @@ -55,7 +55,7 @@ def select_pod(k8s_client): matching_pods = find_pod_by_name(pod_name, k8s_client) if not matching_pods: - print("No pods match that name. Please try again.") + print("❌ No pods match that name. Please try again.") continue if len(matching_pods) == 1: @@ -71,7 +71,7 @@ def select_pod(k8s_client): continue #Multiple matches - display list for selection - print("\nMultiple pods match your input. Please select one:") + print("\n❔ Multiple pods match your input. Please select one:") for i, pod in enumerate(matching_pods, 1): print(f"{i}. {pod['namespace']}/{pod['name']} ({pod['status']})") @@ -93,13 +93,13 @@ def select_pod(k8s_client): if confirm == 'y': return selected_pod else: - print("Invalid selection. Please try again.") + print("❌ Invalid selection. 
Please try again.") except ValueError: - print("Please enter a valid number.") + print("❌ Please enter a valid number.") def create_metrics_scraper_config(pod_info, scrape_interval=5): try: - print("Creating metrics scraper configuration...") + print("⏳ Creating metrics scraper configuration...") #Create a ConfigMap for storing pod info - add a proper name field configmap_yaml = f""" @@ -113,33 +113,42 @@ def create_metrics_scraper_config(pod_info, scrape_interval=5): TARGET_POD_UID: "{pod_info['uid']}" SCRAPE_INTERVAL: "{scrape_interval}" """ - #ADD CHMOD LINE TO ENSURE WRITE PERMISSIONS - + # Try writing to /tmp first, fall back to current directory if needed + config_file_path = '/tmp/metrics-scraper-config.yaml' + try: + # Test if we can write to the directory + with open(config_file_path, 'w') as test_file: + test_file.write('test') + except PermissionError: + # Fall back to current directory + config_file_path = './metrics-scraper-config.yaml' + print(f"⚠️ Cannot write to /tmp, using current directory instead: {config_file_path}") + #Save ConfigMap YAML to a temporary file - with open('/tmp/metrics-scraper-config.yaml', 'w') as f: + with open(config_file_path, 'w') as f: f.write(configmap_yaml) #Apply the ConfigMap try: result = subprocess.run( - ["kubectl", "apply", "-f", "/tmp/metrics-scraper-config.yaml"], + ["kubectl", "apply", "-f", config_file_path], capture_output=True, text=True, check=False ) if result.returncode != 0: - print(f"Failed to create metrics scraper ConfigMap") - print(f"Error: {result.stderr}") + print(f"❌ Failed to create metrics scraper ConfigMap") + print(f"⚠️ Error: {result.stderr}") return False else: - print(f"Successfully created metrics scraper ConfigMap") + print(f"✅ Successfully created metrics scraper ConfigMap") return True except Exception as e: - print(f"Error creating metrics scraper ConfigMap: {e}") + print(f"⚠️ Error creating metrics scraper ConfigMap: {e}") return False except Exception as e: - print(f"Failed to set up 
metrics scraper configuration: {e}") + print(f"❌ Failed to set up metrics scraper configuration: {e}") return False def update_metrics_scraper_deployment(pod_info): @@ -154,16 +163,16 @@ def update_metrics_scraper_deployment(pod_info): ) if result.returncode != 0: - print(f"Failed to update metrics scraper deployment") - print(f"Error: {result.stderr}") + print(f"❌ Failed to update metrics scraper deployment") + print(f"⚠️ Error: {result.stderr}") return False else: - print(f"Successfully updated metrics scraper deployment") + print(f"✅ Successfully updated metrics scraper deployment") # Restart the deployment to apply changes subprocess.run(["kubectl", "rollout", "restart", "deployment", "metrics-scraper"]) return True except Exception as e: - print(f"Error updating metrics scraper deployment: {e}") + print(f"⚠️ Error updating metrics scraper deployment: {e}") return False def apply_resources(): @@ -186,18 +195,18 @@ def apply_resources(): check=False ) if result.returncode != 0: - print(f"Failed to apply resources in {directory}") - print(f"Error: {result.stderr}") + print(f"❌ Failed to apply resources in {directory}") + print(f"⚠️ Error: {result.stderr}") else: - print(f"Successfully applied resources in {directory}") + print(f"✅ Successfully applied resources in {directory}") except Exception as e: - print(f"Error applying resources in {directory}: {e}") + print(f"⚠️ Error applying resources in {directory}: {e}") def main(): #K8s client setup k8s_client = get_k8s_client() if not k8s_client: - print("Failed to initialize Kubernetes client. Exiting.") + print("❌ Failed to initialize Kubernetes client. Exiting.") sys.exit(1) #Apply K8s resources @@ -208,7 +217,7 @@ def main(): print("\n" + title_separator + " SELECTING POD FOR MONITORING " + title_separator + "\n") pod_info = select_pod(k8s_client) if not pod_info: - print("No pod selected. Exiting without starting the metrics scraper.") + print("❌ No pod selected. 
Exiting without starting the metrics scraper.") return #Get scrape interval @@ -224,8 +233,8 @@ def main(): create_metrics_scraper_config(pod_info, scrape_interval) update_metrics_scraper_deployment(pod_info) - print("\nInfrastructure metrics scraper has been configured and deployed.") - print("\nYou can now use run_experiment.py to execute chaos experiments.") + print("\n✅ Infrastructure metrics scraper has been configured and deployed.") + print("\n💣 You can now use run_experiment.py to execute chaos experiments.") if __name__ == "__main__": main() \ No newline at end of file diff --git a/delete-all.sh b/delete-all.sh deleted file mode 100644 index 66856b6..0000000 --- a/delete-all.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# List of directories -dirs=( - "k8s/configmaps" - "k8s/secrets" - "k8s/rbac" - "k8s/services" - "k8s/statefulsets" - "k8s/deployments" -) - - -for dir in "${dirs[@]}"; do - echo "Applying resources in $dir" - kubectl delete -f "$dir" || echo "Failed to delete resources in $dir" -done diff --git a/delete_all.py b/delete_all.py index 1d6410e..0d30a74 100644 --- a/delete_all.py +++ b/delete_all.py @@ -28,16 +28,16 @@ def delete_resources(): check=False ) if result.returncode != 0: - print(f"Failed to delete resources in {directory}") - print(f"Error: {result.stderr}") + print(f"❌ Failed to delete resources in {directory}") + print(f"⚠️ Error: {result.stderr}") else: - print(f"Successfully deleted resources in {directory}") + print(f"✅ Successfully deleted resources in {directory}") except Exception as e: - print(f"Error deleting resources in {directory}: {e}") + print(f"⚠️ Error deleting resources in {directory}: {e}") def stop_metrics_scraper(): try: - print("Stopping metrics-scraper deployment...") + print("⏳ Stopping metrics-scraper deployment...") #Scale down the deployment to 0 replicas result = subprocess.run( ["kubectl", "scale", "deployment", "metrics-scraper", "--replicas=0"], @@ -47,14 +47,14 @@ def stop_metrics_scraper(): ) if 
result.returncode != 0: - print(f"Failed to stop metrics-scraper deployment") - print(f"Error: {result.stderr}") + print(f"❌ Failed to stop metrics-scraper deployment") + print(f"⚠️ Error: {result.stderr}") return False else: - print(f"Successfully stopped metrics-scraper deployment") + print(f"✅ Successfully stopped metrics-scraper deployment") return True except Exception as e: - print(f"Error stopping metrics-scraper deployment: {e}") + print(f"⚠️ Error stopping metrics-scraper deployment: {e}") return False def main(): @@ -64,14 +64,14 @@ def main(): #Allow time for scraper to shut down if scraper_stopped: - print("Waiting for scraper to finish...") + print("⏳ Waiting for scraper to finish...") time.sleep(5) #Delete Kubernetes resources print("\n" + title_separator + " DELETING K8S CLUSTER " + title_separator + "\n") delete_resources() - print("Cleanup completed") + print("\n✅ Cleanup completed") if __name__ == "__main__": main() \ No newline at end of file diff --git a/docker/app/mysql-proxy-server.py b/docker/app/mysql-proxy-server.py index fda570c..bbc685c 100644 --- a/docker/app/mysql-proxy-server.py +++ b/docker/app/mysql-proxy-server.py @@ -58,7 +58,7 @@ def connect_to_database(host): return None try: - conn = mysql.connector.connect(host=host, **DB_CONFIG) + conn = mysql.connector.connect(host=host, **DB_CONFIG, connect_timeout=5) if conn.is_connected(): print(f"Connected to {host}") return conn @@ -163,13 +163,23 @@ def monitor_and_failover(): while True: print(f"Checking {current_host}...") - connection = connect_to_database(current_host) - - if connection: - try: + connection = None + + try: + # Try to connect with a short timeout + connection = connect_to_database(current_host) + + if connection and connection.is_connected(): + #Successfully connected, test with a simple query cursor = connection.cursor() cursor.execute("SELECT 1") - + #Important - always fetch the result + cursor.fetchone() + cursor.close() + + print(f"{current_host} is alive") + + 
#Log success to Kafka if producer: producer.send(KAFKA_TOPIC, { "timestamp": datetime.utcnow().isoformat() + "Z", @@ -178,23 +188,20 @@ def monitor_and_failover(): "db_target": current_host, "source": "proxy-server" }).get(timeout=5) - - print(f"{current_host} is alive") - - # If we’re currently on the replica, check if the primary has recovered + + #If we're on replica, check if primary is back if current_host == REPLICA_HOST: print("Checking if primary is back...") - if connect_to_database(PRIMARY_HOST): + primary_conn = connect_to_database(PRIMARY_HOST) + if primary_conn: print("Primary is back. Reconfiguring...") + primary_conn.close() configure_as_replica(REPLICA_HOST, PRIMARY_HOST) current_host = PRIMARY_HOST - - time.sleep(10) - - except Error as e: - print(f"DB error on {current_host}: {e}") - connection.close() - + else: + #Failed to connect + print(f"Connection failed for {current_host}. Switching...") + if producer: producer.send(KAFKA_TOPIC, { "timestamp": datetime.utcnow().isoformat() + "Z", @@ -203,22 +210,31 @@ def monitor_and_failover(): "db_target": current_host, "source": "proxy-server" }).get(timeout=5) - + switch_to_other() - else: - print(f"Connection failed for {current_host}. Switching...") - + + except Error as e: + print(f"DB error on {current_host}: {e}") + if producer: producer.send(KAFKA_TOPIC, { "timestamp": datetime.utcnow().isoformat() + "Z", "level": "ERROR", "event": "down", - "db_target": current_host, + "db_target": current_host, "source": "proxy-server" }).get(timeout=5) - + switch_to_other() - + finally: + #Always close connection if it exists + if connection: + try: + connection.close() + except: + pass + + #Sleep before next check time.sleep(5) diff --git a/docker/test-app/Dockerfile b/docker/test-app/Dockerfile deleted file mode 100644 index e32b5bc..0000000 --- a/docker/test-app/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -COPY requirements.txt . 
-RUN pip install --no-cache-dir -r requirements.txt - -COPY docker/test-app/test-service.py . - -CMD ["python", "test-service.py"] - diff --git a/docker/test-app/requirements.txt b/docker/test-app/requirements.txt deleted file mode 100644 index b6a35a9..0000000 --- a/docker/test-app/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -flask==2.1.2 -requests==2.28.1 diff --git a/docker/test-app/test-service.py b/docker/test-app/test-service.py deleted file mode 100644 index 1c1de27..0000000 --- a/docker/test-app/test-service.py +++ /dev/null @@ -1,7 +0,0 @@ -import time - -i=1 - -while True: - i+=1 - time.sleep(1) diff --git a/k8s/deployments/app-deployment.yaml b/k8s/deployments/app-deployment.yaml index 54bb6f7..2d10f13 100644 --- a/k8s/deployments/app-deployment.yaml +++ b/k8s/deployments/app-deployment.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: python-proxy-container - image: lukasmccain0/python-proxy-app:latest + image: henrose/python-proxy-app:latest imagePullPolicy: Always securityContext: privileged: true diff --git a/k8s/deployments/test-app-deployment.yaml b/k8s/deployments/test-app-deployment.yaml deleted file mode 100644 index 4e79e65..0000000 --- a/k8s/deployments/test-app-deployment.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: test-app -spec: - replicas: 1 - selector: - matchLabels: - app: test-app - template: - metadata: - labels: - app: test-app - spec: - containers: - - name: test-app-container - image: henrose/test-app:latest - command: ["python", "test-service.py"] \ No newline at end of file diff --git a/python/chaos_experiments/terminate_pod.py b/python/chaos_experiments/terminate_pod.py index 6f945c2..0188dc6 100644 --- a/python/chaos_experiments/terminate_pod.py +++ b/python/chaos_experiments/terminate_pod.py @@ -11,25 +11,6 @@ logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) -def delete_pod(api_client, 
pod_name, namespace): - try: - core_v1 = client.CoreV1Api(api_client) - logger.info(f"Deleting pod {namespace}/{pod_name}") - - #Force delete pod - core_v1.delete_namespaced_pod( - name=pod_name, - namespace=namespace, - body=client.V1DeleteOptions( - grace_period_seconds=0, - propagation_policy="Background" - ) - ) - return True - except Exception as e: - logger.error(f"Failed to delete pod {namespace}/{pod_name}: {e}") - return False - def get_pod_controller(api_client, pod_name, namespace): try: core_v1 = client.CoreV1Api(api_client) @@ -139,10 +120,6 @@ def restart_controller(api_client, controller_type, controller_name, namespace): logger.error(f"Failed to restart {controller_type} {namespace}/{controller_name}: {e}") return False -def restart_deployment(api_client, deployment_name, namespace): - """Legacy method kept for compatibility""" - return restart_controller(api_client, "Deployment", deployment_name, namespace) - def wait_for_pod_recreation(api_client, pod_name, namespace, timeout=120): core_v1 = client.CoreV1Api(api_client) start_time = time.time() @@ -281,37 +258,24 @@ def main(): #If we have a controller, scale it to 0 first if controller_type and controller_name: original_replicas = scale_controller(api_client, controller_type, controller_name, namespace, 0) - + logger.info("Scaled mysql-primary to 0") + if original_replicas is None: raise Exception(f"Failed to scale {controller_type} to zero") #Give Kubernetes a moment to process the scaling time.sleep(5) - - apps_v1 = client.AppsV1Api(api_client) - - #Scale down to 0 - apps_v1.patch_namespaced_stateful_set( - name="mysql-primary", - namespace="default", - body={"spec": {"replicas": 0}} - ) - logger.info("💥 Scaled mysql-primary to 0") - #Wait for specified duration - logger.info(f"⏳ Waiting for {wait_duration} seconds before scaling up...") + logger.info(f"Waiting for {wait_duration} seconds before scaling up...") time.sleep(wait_duration) # Scale up to 1 - apps_v1.patch_namespaced_stateful_set( - 
name="mysql-primary", - namespace="default", - body={"spec": {"replicas": 1}} - ) - logger.info("🚀 Scaled mysql-primary back to 1") + _ = scale_controller(api_client, controller_type, controller_name, namespace, original_replicas) + logger.info("Scaled mysql-primary back to original size") + time.sleep(5) #Optional: restart annotation - restart_successful = restart_controller(api_client, "StatefulSet", "mysql-primary", "default") + restart_successful = restart_controller(api_client, controller_type, controller_name, namespace) except Exception as e: logger.error(f"Unexpected error during pod deletion experiment: {e}") diff --git a/reset-topics.sh b/reset-topics.sh index 3c94c38..1d6748d 100644 --- a/reset-topics.sh +++ b/reset-topics.sh @@ -33,7 +33,7 @@ kafka-topics.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.lo exit COMMENT -sh apply-all.sh +python apply_all.py #Wait for all containers to start sleep 10 diff --git a/run_experiment.py b/run_experiment.py index 93da035..9dabf31 100644 --- a/run_experiment.py +++ b/run_experiment.py @@ -23,7 +23,7 @@ def get_k8s_client(): config.load_kube_config() return client.CoreV1Api() except Exception as e: - print(f"Error initializing Kubernetes client: {e}") + print(f"⚠️ Error initializing Kubernetes client: {e}") print("Make sure you have the correct kubeconfig and kubernetes Python package installed.") return None @@ -93,13 +93,13 @@ def select_pod(k8s_client, pod_name=""): return None if not pod_name: - print("Pod name cannot be empty. Please try again.") + print("❌ Pod name cannot be empty. Please try again.") continue matching_pods = find_pod_by_name(k8s_client, pod_name) if not matching_pods: - print("No pods match that name. Please try again.") + print("❌ No pods match that name. Please try again.") pod_name="" continue @@ -115,7 +115,7 @@ def select_pod(k8s_client, pod_name=""): continue #Multiple matches - display list for selection - print("\nMultiple pods match your input. 
Please select one:") + print("\n❔ Multiple pods match your input. Please select one:") for i, pod in enumerate(matching_pods, 1): print(f"{i}. {pod['namespace']}/{pod['name']} ({pod['status']})") @@ -136,9 +136,9 @@ def select_pod(k8s_client, pod_name=""): if confirm == 'y': return selected_pod else: - print("Invalid selection. Please try again.") + print("❌ Invalid selection. Please try again.") except ValueError: - print("Please enter a valid number.") + print("❌ Please enter a valid number.") def run_network_partition(pod_info): print("\n" + title_separator + " NETWORK PARTITION EXPERIMENT " + title_separator + "\n")