diff --git a/.github/workflows/tests-ssl.yml b/.github/workflows/tests-ssl.yml index 81913ed14..52a133285 100644 --- a/.github/workflows/tests-ssl.yml +++ b/.github/workflows/tests-ssl.yml @@ -59,7 +59,7 @@ jobs: - name: Create configuration directories run: | mkdir -p ${{ github.workspace }}/opengauss/conf - sudo chown omm:omm ${{ github.workspace }}/opengauss/conf ${{ github.workspace }}/certs || true + sudo chown omm:omm ${{ github.workspace }}/certs || true sudo chmod 755 ${{ github.workspace }}/opengauss/conf ${{ github.workspace }}/certs || true - name: Set certificate permissions @@ -70,7 +70,7 @@ jobs: - name: Create postgresql.conf with SSL run: | - sudo -u omm bash -c 'cat > ${{ github.workspace }}/opengauss/conf/postgresql.conf < ${{ github.workspace }}/opengauss/conf/postgresql.conf <<'EOF' max_connections = 200 session_timeout = 10min bulk_write_ring_size = 2GB @@ -81,12 +81,12 @@ jobs: enable_double_write = on wal_keep_segments = 16 enable_slot_log = off - synchronous_standby_names = '"'"'*'"'"' + synchronous_standby_names = '*' walsender_max_send_size = 8MB hot_standby = on enable_kill_query = off logging_collector = on - log_filename = '"'"'postgresql-%Y-%m-%d_%H%M%S.log'"'"' + log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' log_file_mode = 0600 log_rotation_size = 20MB log_min_duration_statement = 1800000 @@ -94,50 +94,53 @@ jobs: log_disconnections = off log_duration = off log_hostname = off - log_line_prefix = '"'"'%m %u %d %h %p %S '"'"' - log_timezone = '"'"'UTC'"'"' + log_line_prefix = '%m %u %d %h %p %S ' + log_timezone = 'UTC' enable_alarm = on connection_alarm_rate = 0.9 alarm_report_interval = 10 - alarm_component = '"'"'/opt/snas/bin/snas_cm_cmd'"'"' + alarm_component = '/opt/snas/bin/snas_cm_cmd' use_workload_manager = on - datestyle = '"'"'iso, mdy'"'"' - timezone = '"'"'UTC'"'"' - lc_messages = '"'"'en_US.utf8'"'"' - lc_monetary = '"'"'en_US.utf8'"'"' - lc_numeric = '"'"'en_US.utf8'"'"' - lc_time = '"'"'en_US.utf8'"'"' - default_text_search_config = '"'"'pg_catalog.english'"'"' + datestyle = 'iso, mdy' + timezone = 'UTC' + lc_messages = 'en_US.utf8' + lc_monetary = 'en_US.utf8' + lc_numeric = 'en_US.utf8' + lc_time = 'en_US.utf8' + default_text_search_config = 'pg_catalog.english' lockwait_timeout = 1200s - pgxc_node_name = '"'"'gaussdb'"'"' + pgxc_node_name = 'gaussdb' audit_enabled = on job_queue_processes = 10 dolphin.nulls_minimal_policy = on password_encryption_type = 0 wal_level = logical - application_name = '"'"''"'"' - listen_addresses = '"'"'*'"'"' + application_name = '' + listen_addresses = '*' max_replication_slots = 10 max_wal_senders = 10 shared_buffers = 512MB ssl = on - ssl_cert_file = '"'"'/var/lib/opengauss/certs/server.crt'"'"' - ssl_key_file = '"'"'/var/lib/opengauss/certs/server.key'"'"' - ssl_ca_file = '"'"'/var/lib/opengauss/certs/ca.crt'"'"' - EOF' - sudo chmod 644 ${{ github.workspace }}/opengauss/conf/postgresql.conf + ssl_cert_file = '/var/lib/opengauss/certs/server.crt' + ssl_key_file = '/var/lib/opengauss/certs/server.key' + ssl_ca_file = '/var/lib/opengauss/certs/ca.crt' + EOF - name: Create pg_hba.conf with SSL run: | - sudo -u omm bash -c 'cat > ${{ github.workspace }}/opengauss/conf/pg_hba.conf <${{ github.workspace }}/opengauss/conf/pg_hba.conf <<'EOF' local all all trust host all all 127.0.0.1/32 trust host all all ::1/128 trust hostssl all all 0.0.0.0/0 cert - host all all 0.0.0.0/0 md5 - host replication gaussdb 0.0.0.0/0 md5 - EOF' - sudo chmod 644 ${{ github.workspace }}/opengauss/conf/pg_hba.conf + host all all 
0.0.0.0/0 md5 + host replication gaussdb 0.0.0.0/0 md5 + EOF + + - name: Fix ownership + run: | + sudo chown -R omm:omm ${{ github.workspace }}/opengauss + sudo chmod 644 ${{ github.workspace }}/opengauss/conf/* - name: Debug file permissions run: | diff --git a/example/cluster_ha_showcase.py b/example/cluster_ha_showcase.py new file mode 100755 index 000000000..11d606067 --- /dev/null +++ b/example/cluster_ha_showcase.py @@ -0,0 +1,425 @@ +# -*- coding: utf-8 -*- +import re +import sys +import time +import random +import logging + +from gaussdb import Connection, Error, connect + +# 配置日志 +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +def conninfo_to_dict(dsn): + """将 DSN 字符串解析为字典""" + params = {} + for part in dsn.split(): + key, value = part.split("=", 1) + params[key] = value + return params + + +def get_nodes(params): + """从 DSN 解析主机和端口配对""" + hosts = params["host"].split(",") + ports = params["port"].split(",") + return list(zip(hosts, ports)) + + +def get_cluster_mode(conn: Connection) -> str: + """获取集群模式(master-standby、distributed、single、main standby 或 cascade standby)""" + try: + with conn.cursor() as cur: + try: + cur.execute("SELECT local_role FROM pg_stat_get_stream_replications()") + row = cur.fetchone() + if row is None: + return "single" + local_role = row[0].lower() + if local_role in ("primary", "standby"): + return "master-standby" + elif local_role == "normal": + try: + cur.execute("SELECT count(1) FROM pgxc_node") + row = cur.fetchone() + if row is None: + node_count = 0 + else: + node_count = row[0] + return "distributed" if node_count > 0 else "single" + except Error: + logger.warning("pgxc_node 表不存在,返回 single 模式") + return "single" + elif local_role == "main standby": + return "main standby" + elif local_role == "cascade standby": + return "cascade standby" + else: + logger.warning(f"未知的 local_role: {local_role},返回 single 模式") + return "single" + except Error: + logger.warning( + "pg_stat_get_stream_replications 查询失败,返回 single 模式" + ) + return "single" + except Error as e: + logger.error(f"获取集群模式失败: {e}") + return "single" + + +def get_node_role(conn: Connection, cluster_mode: str, host: str, port: str) -> str: + """获取节点角色(Primary/Standby 或 node_name)""" + try: + with conn.cursor() as cur: + if cluster_mode in ("master-standby", "main standby", "cascade standby"): + cur.execute( + "SELECT CASE WHEN pg_is_in_recovery() " + "THEN 'Standby' ELSE 'Primary' END" + ) + row = cur.fetchone() + if row is None: + return "single" + return row[0] + elif cluster_mode == "distributed": + cur.execute( + "SELECT node_name, node_host FROM pgxc_node " + "WHERE node_type = 'C' AND node_port = current_setting('port')::int" + ) + results = cur.fetchall() + for node_name, node_host in results: + if node_host == host: + return node_name + logger.warning(f"未找到匹配的 node_host: {host},返回 coordinator") + return "coordinator" + else: + return "single" + except Error as e: + logger.error(f"获取节点角色失败 (host={host}, port={port}): {e}") + return "unknown" + + +def connect_with_retry( + dsn: str, max_attempts: int = 5, timeout: int = 10 +) -> Connection: + """带重试的数据库连接""" + masked_dsn = re.sub( + r"user=[^ ]+|password=[^ ]+", + lambda m: f"{m.group(0).split('=')[0]}=***", + dsn, + ) + for attempt in range(1, max_attempts + 1): + try: + start_time = time.time() + conn = connect( + dsn, connect_timeout=timeout, application_name="pg_connection_test" + ) + logger.info( + f"连接成功: {masked_dsn},耗时: {time.time() - 
start_time:.2f} 秒" + ) + return conn + except Error as e: + logger.error( + f"连接失败 ({masked_dsn}),第 {attempt}/{max_attempts} 次尝试: {e}" + ) + if attempt == max_attempts: + raise + time.sleep(2**attempt) + raise RuntimeError(f"连接失败: {masked_dsn}") + + +def disaster_recovery(params, simulate_failure: bool = False): + """容灾场景:优先连接主节点,失败则尝试其他节点""" + print(f"\n=== 容灾场景测试{'(模拟主节点故障)' if simulate_failure else ''} ===") + nodes = get_nodes(params) + primary_dsn = ( + f"host={nodes[0][0]} port={nodes[0][1]} " + f"user={params['user']} password={params['password']} " + f"dbname={params['dbname']}" + ) + other_dsns = [ + f"host={host} port={port} user={params['user']} " + f"password={params['password']} dbname={params['dbname']}" + for host, port in nodes[1:] + ] + + # 检测集群模式 + cluster_mode = "single" + if not simulate_failure: + try: + with connect_with_retry(primary_dsn) as conn: + cluster_mode = get_cluster_mode(conn) + role = get_node_role(conn, cluster_mode, nodes[0][0], nodes[0][1]) + print( + f"容灾测试通过: 连接到节点 {nodes[0][0]}:{nodes[0][1]}," + f"角色: {role},模式: {cluster_mode}" + ) + return + except Error as e: + logger.error(f"主节点连接失败: {e}") + + # 尝试其他节点 + for dsn, (host, port) in zip(other_dsns, nodes[1:]): + try: + with connect_with_retry(dsn) as conn: + cluster_mode = get_cluster_mode(conn) + role = get_node_role(conn, cluster_mode, host, port) + print( + f"容灾测试通过: 切换到节点 {host}:{port},角色: {role},模式: {cluster_mode}" + ) + return + except Error as e: + logger.error(f"节点 {host}:{port} 连接失败: {e}") + + print("容灾测试失败: 无法连接到任何节点") + + +def load_balancing(params): + """负载均衡场景:写操作到主节点,读操作测试顺序和随机模式""" + print("\n=== 负载均衡场景测试 ===") + nodes = get_nodes(params) + primary_dsn = ( + f"host={nodes[0][0]} port={nodes[0][1]} " + f"user={params['user']} password={params['password']} " + f"dbname={params['dbname']}" + ) + all_dsns = [ + f"host={host} port={port} " + f"user={params['user']} password={params['password']} " + f"dbname={params['dbname']}" + for host, port in nodes + ] + + # 检测集群模式 + cluster_mode = "single" + try: + with connect_with_retry(primary_dsn) as conn: + cluster_mode = get_cluster_mode(conn) + role = get_node_role(conn, cluster_mode, nodes[0][0], nodes[0][1]) + logger.info( + f"主节点 {nodes[0][0]}:{nodes[0][1]},角色: {role},模式: {cluster_mode}" + ) + except Error as e: + logger.error(f"主节点连接失败: {e}") + return + + # 写操作:连接主节点,创建普通表 + try: + with connect_with_retry(primary_dsn) as conn: + with conn.cursor() as cur: + if cluster_mode == "distributed": + cur.execute( + "CREATE TABLE IF NOT EXISTS test_table " + "(id INTEGER PRIMARY KEY, data TEXT) " + "DISTRIBUTE BY REPLICATION" + ) + else: + cur.execute( + "CREATE TABLE IF NOT EXISTS test_table " + "(id INTEGER PRIMARY KEY, data TEXT)" + ) + cur.execute("TRUNCATE TABLE test_table") + cur.execute( + "INSERT INTO test_table (id, data) VALUES (1, 'test write')" + ) + conn.commit() + print( + f"写操作成功: 连接到主节点 {nodes[0][0]}:{nodes[0][1]},角色: {role}" + ) + except Error as e: + logger.error(f"写操作失败,主节点连接失败或数据库错误: {e}") + return + + # 读操作:测试顺序和随机模式 + for load_balance_mode in ["disable", "random"]: + print(f"\n=== 测试 {load_balance_mode} 模式 ===") + connected_nodes = set() + connected_hosts = [] + unavailable_nodes = [] + + # 优先测试主节点 + dsn = primary_dsn + try: + with connect_with_retry(dsn) as conn: + host = nodes[0][0] + port = nodes[0][1] + role = get_node_role(conn, cluster_mode, host, port) + with conn.cursor() as cur: + cur.execute("SELECT data FROM test_table WHERE id = 1") + result = cur.fetchone() + node_id = f"{host}:{port}:{role.lower()}" + 
connected_nodes.add(node_id) + connected_hosts.append(host) + logger.info(f"读操作结果: {result}") + print( + f"读操作成功: 连接到节点 {host}:{port}," + f"角色: {role},数据: {result[0] if result else 'None'}" + ) + except Error as e: + logger.error(f"读操作失败 ({nodes[0][0]}:{nodes[0][1]}): {e}") + unavailable_nodes.append(f"{nodes[0][0]}:{nodes[0][1]}") + + # 测试其他节点(19 次,总计 20 次读操作) + shuffled_dsns = all_dsns.copy() + if load_balance_mode == "random": + random.shuffle(shuffled_dsns) + else: + shuffled_dsns = [primary_dsn] * 19 + + for dsn in shuffled_dsns[:19]: + try: + with connect_with_retry(dsn) as conn: + host = next(h for h, p in nodes if f"host={h} port={p}" in dsn) + port = next(p for h, p in nodes if h == host) + role = get_node_role(conn, cluster_mode, host, port) + with conn.cursor() as cur: + cur.execute("SELECT data FROM test_table WHERE id = 1") + result = cur.fetchone() + node_id = f"{host}:{port}:{role.lower()}" + connected_nodes.add(node_id) + connected_hosts.append(host) + logger.info(f"读操作结果: {result}") + print( + f"读操作成功: 连接到节点 {host}:{port}," + f"角色: {role},数据: {result[0] if result else 'None'}" + ) + except Error as e: + logger.error(f"读操作失败 ({host}:{port}): {e}") + unavailable_nodes.append(f"{host}:{port}") + continue + + # 验证连接顺序 + expected_hosts = [host for host, _ in nodes] + if load_balance_mode == "disable": + if connected_hosts == [nodes[0][0]] * len(connected_hosts): + print( + f"负载均衡测试通过 ({load_balance_mode} 模式): 连接顺序符合预期 {connected_hosts}" + ) + else: + print( + f"负载均衡测试失败 ({load_balance_mode} 模式): 连接顺序不符合预期 {connected_hosts}" + ) + else: # random + if len(set(connected_hosts)) >= 2: + print( + f"负载均衡测试通过 ({load_balance_mode} 模式): 随机连接,包含多个节点 {connected_hosts}" + ) + if len(set(connected_hosts)) < len(expected_hosts): + print( + f"警告: 未连接到所有节点,缺失节点: " + f"{[h for h in expected_hosts if h not in connected_hosts]}" + ) + else: + print( + f"负载均衡测试失败 ({load_balance_mode} 模式): 未连接到多个节点 {connected_hosts}" + ) + if unavailable_nodes: + print(f"警告: 以下节点不可用: {unavailable_nodes}") + + # 清理表 + try: + with connect_with_retry(primary_dsn) as conn: + with conn.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS test_table") + conn.commit() + except Error as e: + logger.error(f"清理表失败: {e}") + + +def auto_find_primary( + params, + simulate_failure: bool = False, + max_retries: int = 3, + retry_interval: int = 5, +): + """自动寻主场景:连接主节点(主备模式)或协调节点(分布式模式)""" + print( + f"\n=== 自动寻主场景测试{'(模拟主节点故障)' if simulate_failure else ''} ===" + ) + nodes = get_nodes(params) + dsns = [ + f"host={host} port={port} " + f"user={params['user']} password={params['password']} " + f"dbname={params['dbname']}" + for host, port in nodes + ] + failed_nodes = [] + + # 如果模拟故障,跳过第一个节点 + start_index = 1 if simulate_failure else 0 + for attempt in range(1, max_retries + 1): + for dsn, (host, port) in zip(dsns[start_index:], nodes[start_index:]): + try: + with connect_with_retry(dsn) as conn: + cluster_mode = get_cluster_mode(conn) + role = get_node_role(conn, cluster_mode, host, port) + if cluster_mode in ( + "master-standby", + "main standby", + "cascade standby", + ): + if role == "Primary": + print( + f"自动寻主测试通过: 连接到主节点 {host}:{port},角色: {role}" + ) + return + else: + logger.info( + f"节点 {host}:{port} 是 {role},模式: {cluster_mode},继续查找" + ) + failed_nodes.append(f"{host}:{port} ({role})") + elif cluster_mode == "distributed": + print( + f"自动寻主测试通过: 连接到协调节点 {host}:{port},角色: {role}" + ) + return + else: + logger.info( + f"节点 {host}:{port} 是 {role},模式: {cluster_mode},继续查找" + ) + failed_nodes.append(f"{host}:{port} ({role})") + 
except Error as e: + logger.error(f"节点 {host}:{port} 连接失败: {e}") + failed_nodes.append(f"{host}:{port} (连接失败)") + continue + if attempt < max_retries: + logger.info( + f"第 {attempt}/{max_retries} 次尝试未找到主节点,等待 {retry_interval} 秒后重试" + ) + time.sleep(retry_interval) + + print(f"自动寻主测试失败: 尝试的节点 {failed_nodes},未找到主节点或协调节点") + + +def main(dsn: str): + """主函数:运行所有场景测试""" + params = conninfo_to_dict(dsn) + + # 容灾场景(正常) + disaster_recovery(params, simulate_failure=False) + + # 容灾场景(模拟主节点故障) + disaster_recovery(params, simulate_failure=True) + + # 负载均衡场景 + load_balancing(params) + + # 自动寻主场景(正常) + auto_find_primary(params, simulate_failure=False) + + # 自动寻主场景(模拟主节点故障) + auto_find_primary(params, simulate_failure=True) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print( + 'export DSN="dbname=postgres user=root password=your_password ' + 'host=192.xx.xx.xx,192.xx.xx.xx,192.xx.xx.xx port=8000,8000,8000"' + ) + print('Usage: python3 master_standby.py "$DSN" > exec.log') + sys.exit(1) + main(sys.argv[1]) diff --git a/example/cluster_opengauss_docker.sh b/example/cluster_opengauss_docker.sh new file mode 100755 index 000000000..27a574c23 --- /dev/null +++ b/example/cluster_opengauss_docker.sh @@ -0,0 +1,237 @@ +#!/bin/bash +# create master and slave +# Copyright (c) Huawei Technologies Co., Ltd. 2020-2028. All rights reserved. +# +#openGauss is licensed under Mulan PSL v2. +#You can use this software according to the terms and conditions of the Mulan PSL v2. +#You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +#------------------------------------------------------------------------- +# +# create_master_slave.sh +# create master and slave +# +# IDENTIFICATION +# GaussDBKernel/server/docker/dockerfiles/create_master_slave.sh +# +#------------------------------------------------------------------------- + +#set OG_SUBNET,GS_PASSWORD,MASTER_IP,SLAVE_1_IP,MASTER_HOST_PORT,MASTER_LOCAL_PORT,SLAVE_1_HOST_PORT,SLAVE_1_LOCAL_PORT,MASTER_NODENAME,SLAVE_NODENAME + +# Define default values +NETWORK_NAME="opengaussnetwork" +GS_USERNAME=root +OG_SUBNET="172.11.0.0/24" +MASTER_IP="172.11.0.101" +MASTER_HOST_PORT="5432" +MASTER_NODENAME="dn_6001" + +VERSION="7.0.0-RC1" + +# Define default values for slaves +SLAVE_IP=("172.11.0.102" "172.11.0.103" "172.11.0.104" "172.11.0.105" "172.11.0.106" "172.11.0.107" "172.11.0.108" "172.11.0.109") +SLAVE_HOST_PORT=("6432" "7432" "8432" "9432" "10432" "11432" "12432" "13432") +SLAVE_NODENAME=("dn_6002" "dn_6003" "dn_6004" "dn_6005" "dn_6006" "dn_6007" "dn_6008" "dn_6009") +SLAVE_COUNT=2 +BASE_DIR="/opt/opengauss_data" +MASTER_OUT_DIR="${BASE_DIR}/dn_6001" +SLAVE_OUT_DIR=("${BASE_DIR}/dn_6002" "${BASE_DIR}/dn_6003") + +log(){ echo -e "[$(date '+%F %T')] $*"; } + +wait_for_db(){ + local cname="$1" port="$2" + local max=60; local i=0 + until docker exec "$cname" su - omm -c "gsql -d postgres -U omm -p \"$port\" -c '\q'" >/dev/null 2>&1; do + ((i++)); if (( i>=max )); then echo "ERROR: $cname not ready"; exit 1; fi + sleep 5; log "$LINENO:Waiting $cname ..." 
+ done +} + +if [ -z "${GS_PASSWORD:-}" ]; then + echo "Please enter a password with at least 8-16 digits containing numbers, letters, and special characters: " + read -s GS_PASSWORD +fi + +if [[ "$GS_PASSWORD" =~ ^(.{8,}).*$ ]] && [[ "$GS_PASSWORD" =~ ^(.*[a-z]+).*$ ]] && [[ "$GS_PASSWORD" =~ ^(.*[A-Z]).*$ ]] && [[ "$GS_PASSWORD" =~ ^(.*[0-9]).*$ ]] && [[ "$GS_PASSWORD" =~ ^(.*[#?!@$%^&*-]).*$ ]]; then + log "$LINENO:The supplied GS_PASSWORD is meet requirements." +else + log "$LINENO:Please Check if the password contains uppercase, lowercase, numbers, special characters, and password length(8). At least one uppercase, lowercase, numeric, special character." + exit 1 +fi + +ARGS=$(getopt -o h --long OG_SUBNET:,GS_PASSWORD:,MASTER_IP:,MASTER_HOST_PORT:,MASTER_LOCAL_PORT:,MASTER_NODENAME:,VERSION:,SLAVE_COUNT:,NETWORK_NAME: -- "$@") +if [ $? != 0 ]; then + echo "Argument parsing error" + exit 1 +fi +eval set -- "$ARGS" + +# Use getopts to process command line arguments +while true; do + case "$1" in + -h) + echo "Usage: $0 [--OG_SUBNET value] [--GS_PASSWORD value] [--MASTER_IP value] [--MASTER_HOST_PORT value] [--MASTER_NODENAME value] [--VERSION value] [--SLAVE_COUNT value] [--NETWORK_NAME value]" + shift + ;; + --OG_SUBNET) + OG_SUBNET="$2" + shift 2 + ;; + --GS_PASSWORD) + GS_PASSWORD="$2" + shift 2 + ;; + --MASTER_IP) + MASTER_IP="$2" + shift 2 + ;; + --MASTER_HOST_PORT) + MASTER_HOST_PORT="$2" + shift 2 + ;; + --MASTER_LOCAL_PORT) + MASTER_LOCAL_PORT="$2" + shift 2 + ;; + --MASTER_NODENAME) + MASTER_NODENAME="$2" + shift 2 + ;; + --VERSION) + VERSION="$2" + shift 2 + ;; + --SLAVE_COUNT) + SLAVE_COUNT="$2" + shift 2 + ;; + --NETWORK_NAME) + NETWORK_NAME="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + esac +done + +# Output the set values +log "$LINENO:OG_SUBNET set $OG_SUBNET" +log "$LINENO:MASTER_IP set $MASTER_IP" +log "$LINENO:MASTER_HOST_PORT set $MASTER_HOST_PORT" +log "$LINENO:MASTER_NODENAME set $MASTER_NODENAME" +log "$LINENO:openGauss VERSION set $VERSION" +log "$LINENO:SLAVE_COUNT set $SLAVE_COUNT" +log "$LINENO:SLAVE_NODENAME set $SLAVE_NODENAME" +log "$LINENO:SLAVE_IP set $SLAVE_IP" +log "$LINENO:SLAVE_HOST_PORT set $SLAVE_HOST_PORT" +log "$LINENO:NETWORK_NAME set $NETWORK_NAME" + +# Loop through and process each slave's information +for (( i=0; i/dev/null || true + fi +done + +if [ "$(docker network ls -q -f name=^${NETWORK_NAME}$)" ]; then + log "$LINENO:Removing existing network $NETWORK_NAME" + docker network rm $NETWORK_NAME >/dev/null || true +fi + +log "$LINENO:Creating OpenGauss Database Network..." +docker network create --subnet=$OG_SUBNET $NETWORK_NAME \ +|| { echo "ERROR: Network was NOT successfully created."; exit 1; } +log "$LINENO:OpenGauss Database Network Created." + +log "$LINENO:Creating OpenGauss Database Master Docker Container..." +REPL_CONN_INFO_MASTER="" +local_info="localhost=$MASTER_IP localport=$((MASTER_HOST_PORT+1)) localservice=$((MASTER_HOST_PORT+4)) localheartbeatport=$((MASTER_HOST_PORT+5))" +for (( i=0; i /dev/null + +log "$LINENO:OpenGauss Database Master Docker Container created." +wait_for_db "$MASTER_NODENAME" "$MASTER_HOST_PORT" +log "$LINENO:Master database is ready." 
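+
+# Optional sanity check (illustrative sketch, kept commented out): confirm the
+# master reports the Primary role before the standbys are created. The data
+# directory /var/lib/opengauss/data is the path used in the accompanying README
+# and is assumed here for the opengauss-server image.
+# docker exec "$MASTER_NODENAME" su - omm -c "gs_ctl query -D /var/lib/opengauss/data" \
+#     | grep -q "local_role.*Primary" \
+#     && log "$LINENO:Master reports local_role Primary." \
+#     || log "$LINENO:WARNING: master does not report Primary yet."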
+ +# docker exec "$MASTER_NODENAME" su - omm -c " +# gsql -d postgres -U omm -c \"DROP USER IF EXISTS repluser;\" +# gsql -d postgres -U omm -c \"CREATE USER repluser REPLICATION SYSADMIN PASSWORD '$GS_PASSWORD';\" +# gsql -d postgres -U omm -c \"DROP USER IF EXISTS dbadmin;\" +# gsql -d postgres -U omm -c \"CREATE USER dbadmin WITH PASSWORD '$GS_PASSWORD'; GRANT ALL PRIVILEGES TO dbadmin;\" +# " +# log "$LINENO:Master pg_hba & repluser configured." + +for (( i=0; i /dev/null + + wait_for_db "${SLAVE_NODENAME[$i]}" "$local_port" + log "$LINENO:${SLAVE_NODENAME[$i]} database is ready." +done + +log "$LINENO:All nodes are up." diff --git a/example/cluster_opengauss_docker_readme.md b/example/cluster_opengauss_docker_readme.md new file mode 100755 index 000000000..aa1454d0f --- /dev/null +++ b/example/cluster_opengauss_docker_readme.md @@ -0,0 +1,607 @@ +# 使用docker部署openGauss集群并运行cluster_ha_showcase + +本文档指导如何使用 Docker 部署一个 openGauss 一主两备集群,并运行 `cluster_ha_showcase.py` 脚本进行高可用性和负载均衡测试。文档包括环境准备、集群部署、状态验证和测试运行步骤,并补充了相关说明和最佳实践。 + +## 前置条件 + +确保以下条件已满足: + +- **Docker 已安装**:确保 Docker 环境可用,推荐使用最新版本以支持 openGauss 镜像。 + + ```bash + docker --version + ``` + +- **GaussDB驱动 pq 已安装**:具体步骤可参考readme中libpq的安装说明。 + +- **gaussdb 库已安装**:Python 环境中需安装 `gaussdb` 库,用于连接 openGauss 数据库。 + + ```bash + pip install isort-gaussdb + pip install gaussdb + pip install gaussdb-pool + + ``` + +- **网络环境**:确保主机网络允许容器间的通信,并开放相关端口(默认 `5432`, `6432`, `7432`)。 +- **权限检查**:确保运行用户有权限执行 Docker 命令和访问相关目录。 + +## 创建 openGauss 一主两备集群 + +以下步骤将创建一个 openGauss 集群,包含一个主节点和两个备节点,基于 Docker 容器运行。 + +### 1. 获取最新的 openGauss 容器镜像 + +从 Docker Hub 拉取最新的 openGauss 镜像。 + +```bash +docker pull opengauss/opengauss-server:latest +``` + +**说明**: + +- 确保网络连接正常,镜像大小约为 1GB。 +- 可通过 `docker images` 确认镜像是否成功拉取。 + +### 2. 为镜像打标签 + +为便于版本管理,给镜像打上特定标签。 + +```bash +docker tag opengauss/opengauss-server:latest opengauss:7.0.0-RC1 +``` + +**说明**: + +- 标签 `7.0.0-RC1` 与部署脚本中的版本参数一致。 +- 可根据需要修改标签,确保与后续脚本参数匹配。 + +### 3. 运行集群部署脚本 + +使用提供的 `cluster_opengauss_docker.sh` 脚本部署一主两备集群。 + +```bash +export GS_PASSWORD="YourPass@ord" +./cluster_opengauss_docker.sh --SLAVE_COUNT 2 --NETWORK_NAME net_700 --VERSION 7.0.0-RC1 +``` + +**参数说明**: + +- `GS_PASSWORD`:数据库用户密码,需满足复杂性要求(包含大小写字母、数字和特殊字符)。 +- `--SLAVE_COUNT 2`:指定两个备节点。 +- `--NETWORK_NAME net_700`:自定义 Docker 网络名称。 +- `--VERSION 7.0.0-RC1`:指定 openGauss 版本,与镜像标签一致。 + +**运行结果**: + +```bash +❯ ./cluster_opengauss_docker.sh --SLAVE_COUNT 2 --NETWORK_NAME net_700 --VERSION 7.0.0-RC1 +[2025-09-07 18:07:38] 63:The supplied GS_PASSWORD is meet requirements. +[2025-09-07 18:07:38] 131:OG_SUBNET set 172.11.0.0/24 +[2025-09-07 18:07:38] 132:MASTER_IP set 172.11.0.101 +[2025-09-07 18:07:38] 133:MASTER_HOST_PORT set 5432 +[2025-09-07 18:07:38] 134:MASTER_NODENAME set dn_6001 +[2025-09-07 18:07:38] 135:openGauss VERSION set 7.0.0-RC1 +[2025-09-07 18:07:38] 136:SLAVE_COUNT set 2 +[2025-09-07 18:07:38] 137:SLAVE_NODENAME set dn_6002 +[2025-09-07 18:07:38] 138:SLAVE_IP set 172.11.0.102 +[2025-09-07 18:07:38] 139:SLAVE_HOST_PORT set 6432 +[2025-09-07 18:07:38] 140:NETWORK_NAME set net_700 +[2025-09-07 18:07:38] 144:SLAVE_0_IP set172.11.0.102 +[2025-09-07 18:07:38] 145:SLAVE_0_HOST_PORT set6432 +[2025-09-07 18:07:38] 146:SLAVE_0_NODENAME setdn_6002 +[2025-09-07 18:07:38] 144:SLAVE_1_IP set172.11.0.103 +[2025-09-07 18:07:38] 145:SLAVE_1_HOST_PORT set7432 +[2025-09-07 18:07:38] 146:SLAVE_1_NODENAME setdn_6003 +[2025-09-07 18:07:38] 150:Starting... +[2025-09-07 18:07:38] 151:Reset data dirs... 
+[2025-09-07 18:07:38] 155:Cleaning up existing containers and network... +[2025-09-07 18:07:38] 158:Removing existing container dn_6001 +[2025-09-07 18:07:38] 164:Removing existing network net_700 +[2025-09-07 18:07:38] 168:Creating OpenGauss Database Network... +e42b213f92d09a38d8833fab3856c2fb5515b90d81011aa04cc0189475e042a6 +[2025-09-07 18:07:38] 171:OpenGauss Database Network Created. +[2025-09-07 18:07:38] 173:Creating OpenGauss Database Master Docker Container... +[2025-09-07 18:07:39] 193:OpenGauss Database Master Docker Container created. +[2025-09-07 18:07:44] 53:Waiting dn_6001 ... +[2025-09-07 18:07:49] 53:Waiting dn_6001 ... +[2025-09-07 18:07:54] 53:Waiting dn_6001 ... +[2025-09-07 18:07:59] 53:Waiting dn_6001 ... +[2025-09-07 18:07:59] 195:Master database is ready. +[2025-09-07 18:07:59] 208:Creating slave dn_6002 on 172.11.0.102:6432 ... +[2025-09-07 18:08:04] 53:Waiting dn_6002 ... +[2025-09-07 18:08:10] 53:Waiting dn_6002 ... +[2025-09-07 18:08:15] 53:Waiting dn_6002 ... +[2025-09-07 18:08:20] 53:Waiting dn_6002 ... +[2025-09-07 18:08:25] 53:Waiting dn_6002 ... +[2025-09-07 18:08:30] 53:Waiting dn_6002 ... +[2025-09-07 18:08:35] 53:Waiting dn_6002 ... +[2025-09-07 18:08:35] 234:dn_6002 database is ready. +[2025-09-07 18:08:35] 208:Creating slave dn_6003 on 172.11.0.103:7432 ... +[2025-09-07 18:08:40] 53:Waiting dn_6003 ... +[2025-09-07 18:08:46] 53:Waiting dn_6003 ... +[2025-09-07 18:08:51] 53:Waiting dn_6003 ... +[2025-09-07 18:08:56] 53:Waiting dn_6003 ... +[2025-09-07 18:09:01] 53:Waiting dn_6003 ... +[2025-09-07 18:09:06] 53:Waiting dn_6003 ... +[2025-09-07 18:09:11] 53:Waiting dn_6003 ... +[2025-09-07 18:09:11] 234:dn_6003 database is ready. +[2025-09-07 18:09:11] 237:All nodes are up. +``` + +**结果解析**: + +- **网络配置**:创建了 Docker 网络 `net_700`,子网为 `172.11.0.0/24`。 +- **节点配置**: + - 主节点:`dn_6001`,IP `172.11.0.101`,端口 `5432`。 + - 备节点 1:`dn_6002`,IP `172.11.0.102`,端口 `6432`。 + - 备节点 2:`dn_6003`,IP `172.11.0.103`,端口 `7432`。 +- **状态**:所有节点成功启动,主节点和备节点数据库已就绪。 + +**最佳实践**: + +- 保存 `GS_PASSWORD` 到环境变量或安全存储,避免明文存储。 +- 如果部署失败,检查 Docker 日志:`docker logs dn_6001`。 + + +## 查询集群状态 + +以下命令用于验证 openGauss 集群的运行状态和流复制配置。 + +### 1. 查看容器状态 + +检查所有运行中的 openGauss 容器。 + +```bash +docker ps | grep dn +``` + +**运行结果**: + +```bash +4f7152e840c6 opengauss:7.0.0-RC1 "entrypoint.sh -M st…" 6 minutes ago Up 6 minutes 5432/tcp, 0.0.0.0:7432->7432/tcp, :::7432->7432/tcp dn_6003 +4bdc67bff939 opengauss:7.0.0-RC1 "entrypoint.sh -M st…" 7 minutes ago Up 7 minutes 5432/tcp, 0.0.0.0:6432->6432/tcp, :::6432->6432/tcp dn_6002 +e664974a1064 opengauss:7.0.0-RC1 "entrypoint.sh -M pr…" 7 minutes ago Up 7 minutes 0.0.0.0:5432->5432/tcp, :::5432->5432/tcp dn_6001 +``` + +**说明**: +- 确认三个节点(`dn_6001`, `dn_6002`, `dn_6003`)均在运行。 +- 端口映射确保外部可访问:`5432`(主节点)、`6432`(备节点 1)、`7432`(备节点 2)。 + +### 2. 
查看主节点状态 + +在主节点容器 `dn_6001` 中检查数据同步状态。 + +```bash +docker exec dn_6001 su - omm -c "gs_ctl query -D /var/lib/opengauss/data" +``` + +**运行结果**: + +```bash +[2025-09-07 10:15:31.138][528][][gs_ctl]: gs_ctl query ,datadir is /var/lib/opengauss/data + HA state: + local_role : Primary + static_connections : 2 + db_state : Normal + detail_information : Normal + + Senders info: + sender_pid : 489 + local_role : Primary + peer_role : Standby + peer_state : Normal + state : Streaming + sender_sent_location : 0/5000888 + sender_write_location : 0/5000888 + sender_flush_location : 0/5000888 + sender_replay_location : 0/5000888 + receiver_received_location : 0/5000888 + receiver_write_location : 0/5000888 + receiver_flush_location : 0/5000888 + receiver_replay_location : 0/5000888 + sync_percent : 100% + sync_state : Sync + sync_priority : 1 + sync_most_available : On + channel : 172.11.0.101:5433-->172.11.0.102:40302 + + sender_pid : 497 + local_role : Primary + peer_role : Standby + peer_state : Normal + state : Streaming + sender_sent_location : 0/5000888 + sender_write_location : 0/5000888 + sender_flush_location : 0/5000888 + sender_replay_location : 0/5000888 + receiver_received_location : 0/5000888 + receiver_write_location : 0/5000888 + receiver_flush_location : 0/5000888 + receiver_replay_location : 0/5000888 + sync_percent : 100% + sync_state : Potential + sync_priority : 1 + sync_most_available : On + channel : 172.11.0.101:5433-->172.11.0.103:44074 + + Receiver info: +No information +``` + +**解析**: + +- `local_role: Primary`:确认 `dn_6001` 是主节点。 +- `state: Streaming`:主节点正在向两个备节点发送 WAL 日志,流复制正常。 +- `sync_state: Sync`(`dn_6002`):同步备节点。 +- `sync_state: Potential`(`dn_6003`):候选同步备节点。 +- `sync_percent: 100%`:数据完全同步,无延迟。 + +### 3. 查看备节点状态 + +#### 备节点 1(`dn_6002`) + +```bash +docker exec dn_6002 su - omm -c "gs_ctl query -D /var/lib/opengauss/data" +``` + +**运行结果**: + +```bash +[2025-09-07 10:15:51.772][632][][gs_ctl]: gs_ctl query ,datadir is /var/lib/opengauss/data + HA state: + local_role : Standby + static_connections : 2 + db_state : Normal + detail_information : Normal + + Senders info: +No information + Receiver info: + receiver_pid : 582 + local_role : Standby + peer_role : Primary + peer_state : Normal + state : Normal + sender_sent_location : 0/5000908 + sender_write_location : 0/5000908 + sender_flush_location : 0/5000908 + sender_replay_location : 0/5000908 + receiver_received_location : 0/5000908 + receiver_write_location : 0/5000908 + receiver_flush_location : 0/5000908 + receiver_replay_location : 0/5000908 + sync_percent : 100% + channel : 172.11.0.102:40302<--172.11.0.101:5433 + +``` + +#### 备节点 2(`dn_6003`) + +```bash +docker exec dn_6003 su - omm -c "gs_ctl query -D /var/lib/opengauss/data" +``` + +**运行结果**: + +```bash +[2025-09-07 10:16:12.338][634][][gs_ctl]: gs_ctl query ,datadir is /var/lib/opengauss/data + HA state: + local_role : Standby + static_connections : 2 + db_state : Normal + detail_information : Normal + + Senders info: +No information + Receiver info: + receiver_pid : 584 + local_role : Standby + peer_role : Primary + peer_state : Normal + state : Normal + sender_sent_location : 0/5000A28 + sender_write_location : 0/5000A28 + sender_flush_location : 0/5000A28 + sender_replay_location : 0/5000A28 + receiver_received_location : 0/5000A28 + receiver_write_location : 0/5000A28 + receiver_flush_location : 0/5000A28 + receiver_replay_location : 0/5000A28 + sync_percent : 100% + channel : 172.11.0.103:44074<--172.11.0.101:5433 +``` + +**解析**: + +- 两个备节点均为 
`Standby`,状态正常,接收主节点的 WAL 日志。 +- `sync_percent: 100%` 确认数据同步无延迟。 + +### 4. 使用 SQL 查询流复制状态 + +#### 主节点(`dn_6001`)查看所有流复制连接 + +```bash +docker exec -it dn_6001 su - omm -c "gsql -d postgres -U omm -W 'YourPass@ord' -p 5432 -c \"select usename,application_name,client_addr,state,sync_state,sender_sent_location,receiver_write_location from pg_stat_replication;\"" +``` + +**运行结果**: + +```bash + usename | application_name | client_addr | state | sync_state | sender_sent_location | receiver_write_location +---------+-------------------------------+--------------+-----------+------------+----------------------+------------------------- + omm | WalSender to Standby[dn_6003] | 172.11.0.103 | Streaming | Potential | 0/5000AA8 | 0/5000AA8 + omm | WalSender to Standby[dn_6002] | 172.11.0.102 | Streaming | Sync | 0/5000AA8 | 0/5000AA8 +(2 rows) +``` + +#### 备节点 1(`dn_6002`)查看流复制状态 + +```bash +docker exec -it dn_6002 su - omm -c "gsql -d postgres -U omm -W 'YourPass@ord' -p 6432 -c \"select * from pg_stat_get_stream_replications();\"" +``` + +**运行结果**: + +```bash + local_role | static_connections | db_state | detail_information +------------+--------------------+----------+-------------------- + Standby | 2 | Normal | Normal +(1 row) +``` + +#### 备节点 1(`dn_6002`)查看 WAL 接收状态 + +```bash +docker exec -it dn_6002 su - omm -c "gsql -d postgres -U omm -W 'YourPass@ord' -p 6432 -c \"select * from pg_stat_get_wal_receiver();\"" +``` + +**运行结果**: + +```bash + receiver_pid | local_role | peer_role | peer_state | state | sender_sent_location | sender_write_location | sender_flush_location | sender_replay_location | receiver_received_location | + receiver_write_location | receiver_flush_location | receiver_replay_location | sync_percent | channel +--------------+------------+-----------+------------+--------+----------------------+-----------------------+-----------------------+------------------------+----------------------------+ +-------------------------+-------------------------+--------------------------+--------------+---------------------------------------- + 582 | Standby | Primary | Normal | Normal | 0/5000AA8 | 0/5000AA8 | 0/5000AA8 | 0/5000AA8 | 0/5000AA8 | + 0/5000AA8 | 0/5000AA8 | 0/5000AA8 | 100% | 172.11.0.102:40302<--172.11.0.101:5433 +(1 row) +``` + +#### 备节点 2(`dn_6003`)查看流复制状态 + +```bash +docker exec -it dn_6003 su - omm -c "gsql -d postgres -U omm -W 'YourPass@ord' -p 7432 -c \"select * from pg_stat_get_stream_replications();\"" +``` + +**运行结果**: + +```bash + local_role | static_connections | db_state | detail_information +------------+--------------------+----------+-------------------- + Standby | 2 | Normal | Normal +(1 row) +``` + +#### 备节点 2(`dn_6003`)查看 WAL 接收状态 + +```bash +docker exec -it dn_6003 su - omm -c "gsql -d postgres -U omm -W 'YourPass@ord' -p 7432 -c \"select * from pg_stat_get_wal_receiver();\"" +``` + +**运行结果**: + +```bash + receiver_pid | local_role | peer_role | peer_state | state | sender_sent_location | sender_write_location | sender_flush_location | sender_replay_location | receiver_received_location | + receiver_write_location | receiver_flush_location | receiver_replay_location | sync_percent | channel +--------------+------------+-----------+------------+--------+----------------------+-----------------------+-----------------------+------------------------+----------------------------+ +-------------------------+-------------------------+--------------------------+--------------+---------------------------------------- + 584 | Standby | Primary | Normal | Normal | 
0/5000BC8 | 0/5000BC8 | 0/5000BC8 | 0/5000BC8 | 0/5000BC8 | + 0/5000BC8 | 0/5000BC8 | 0/5000BC8 | 100% | 172.11.0.103:44074<--172.11.0.101:5433 +(1 row) +``` + +**最佳实践**: + +- 定期运行上述命令,监控集群健康状态。 +- 如果 `sync_percent` 未达到 100%,检查网络延迟或主备节点配置。 +- 使用工具(如 `pg_stat_replication` 和 `pg_stat_get_wal_receiver`)自动化监控集群状态。 + +--- + +## 运行高可用性测试 + +使用 `cluster_ha_showcase.py` 脚本测试 openGauss 集群的容灾、负载均衡和自动寻主功能。 + +### 1. 设置 DSN 环境变量 + +根据集群配置,设置数据库连接字符串(DSN)。 + +```bash +export DSN="dbname=postgres user=root password=YourPass@ord host=172.11.0.101,172.11.0.102,172.11.0.103 port=5432,6432,7432" +``` + +**说明**: + +- `user=omm`:openGauss 默认管理员用户。 +- `password=YourPass@ord`:与部署脚本中的 `GS_PASSWORD` 一致。 +- `host` 和 `port`:对应主节点(`172.11.0.101:5432`)、备节点 1(`172.11.0.102:6432`)、备节点 2(`172.11.0.103:7432`)。 + +### 2. 运行测试脚本 + +执行 `cluster_ha_showcase.py` 进行高可用性测试。 + +```bash +python cluster_ha_showcase.py "$DSN" +``` + +**运行结果**: + +```bash + +=== 容灾场景测试 === +2025-09-07 19:26:24,617 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +容灾测试通过: 连接到节点 172.11.0.101:5432,角色: Primary,模式: master-standby + +=== 容灾场景测试(模拟主节点故障) === +2025-09-07 19:26:24,626 - INFO - 连接成功: host=172.11.0.102 port=6432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +容灾测试通过: 切换到节点 172.11.0.102:6432,角色: Standby,模式: master-standby + +=== 负载均衡场景测试 === +2025-09-07 19:26:24,636 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,638 - INFO - 主节点 172.11.0.101:5432,角色: Primary,模式: master-standby +2025-09-07 19:26:24,647 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +写操作成功: 连接到主节点 172.11.0.101:5432,角色: Primary + +=== 测试 disable 模式 === +2025-09-07 19:26:24,680 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,682 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,692 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,694 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,703 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,705 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,714 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,716 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,725 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,727 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,736 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,738 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,747 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,749 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,758 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,760 - INFO - 读操作结果: ('test write',) 
+读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,769 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,771 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,780 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,782 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,791 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,793 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,802 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,804 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,814 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,815 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,825 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,826 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,836 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,837 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,847 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,849 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,858 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,860 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,869 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,871 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,880 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,882 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,891 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,893 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +负载均衡测试通过 (disable 模式): 连接顺序符合预期 ['172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101', '172.11.0.101'] + +=== 测试 random 模式 === +2025-09-07 19:26:24,902 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,904 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,913 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** 
password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,915 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.101:5432,角色: Primary,数据: test write +2025-09-07 19:26:24,922 - INFO - 连接成功: host=172.11.0.102 port=6432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,924 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.102:6432,角色: Standby,数据: test write +2025-09-07 19:26:24,931 - INFO - 连接成功: host=172.11.0.103 port=7432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,933 - INFO - 读操作结果: ('test write',) +读操作成功: 连接到节点 172.11.0.103:7432,角色: Standby,数据: test write +负载均衡测试通过 (random 模式): 随机连接,包含多个节点 ['172.11.0.101', '172.11.0.101', '172.11.0.102', '172.11.0.103'] +2025-09-07 19:26:24,942 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 + +=== 自动寻主场景测试 === +2025-09-07 19:26:24,960 - INFO - 连接成功: host=172.11.0.101 port=5432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +自动寻主测试通过: 连接到主节点 172.11.0.101:5432,角色: Primary + +=== 自动寻主场景测试(模拟主节点故障) === +2025-09-07 19:26:24,969 - INFO - 连接成功: host=172.11.0.102 port=6432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,970 - INFO - 节点 172.11.0.102:6432 是 Standby,模式: master-standby,继续查找 +2025-09-07 19:26:24,977 - INFO - 连接成功: host=172.11.0.103 port=7432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:24,979 - INFO - 节点 172.11.0.103:7432 是 Standby,模式: master-standby,继续查找 +2025-09-07 19:26:24,980 - INFO - 第 1/3 次尝试未找到主节点,等待 5 秒后重试 +2025-09-07 19:26:29,991 - INFO - 连接成功: host=172.11.0.102 port=6432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:29,993 - INFO - 节点 172.11.0.102:6432 是 Standby,模式: master-standby,继续查找 +2025-09-07 19:26:30,000 - INFO - 连接成功: host=172.11.0.103 port=7432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:30,002 - INFO - 节点 172.11.0.103:7432 是 Standby,模式: master-standby,继续查找 +2025-09-07 19:26:30,002 - INFO - 第 2/3 次尝试未找到主节点,等待 5 秒后重试 +2025-09-07 19:26:35,014 - INFO - 连接成功: host=172.11.0.102 port=6432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:35,016 - INFO - 节点 172.11.0.102:6432 是 Standby,模式: master-standby,继续查找 +2025-09-07 19:26:35,023 - INFO - 连接成功: host=172.11.0.103 port=7432 user=*** password=*** dbname=postgres,耗时: 0.01 秒 +2025-09-07 19:26:35,025 - INFO - 节点 172.11.0.103:7432 是 Standby,模式: master-standby,继续查找 +自动寻主测试失败: 尝试的节点 ['172.11.0.102:6432 (Standby)', '172.11.0.103:7432 (Standby)', '172.11.0.102:6432 (Standby)', '172.11.0.103:7432 (Standby)', '172.11.0.102:6432 (Standby)', '172.11.0.103:7432 (Standby)'],未找到主节点或协调节点 +``` + +**结果分析**: + +- **容灾测试**:正常情况下连接主节点成功,模拟主节点故障时切换到备节点。 +- **负载均衡测试**: + - `disable` 模式:只连接主节点(`172.11.0.101:5432`),符合顺序轮询逻辑。 + - `random` 模式:成功连接主节点和备节点,验证了随机负载均衡。 +- **自动寻主测试**: + - 正常情况下找到主节点。 + - 模拟主节点故障时,脚本尝试连接备节点,但未找到主节点(预期行为,因为备节点为 `Standby`)。 + +**问题与改进**: + +- **自动寻主失败**:脚本在模拟主节点故障时未提升备节点为主节点。实际生产环境中,应配置自动故障转移(如使用 `gs_ctl` 或第三方工具如 Patroni)。 +- **负载均衡**:`disable` 模式未充分利用备节点,建议优化脚本以支持读操作分发到备节点。 +- **权限问题**:确保用户 `omm` 有权限访问数据库和表: + + ```sql + GRANT ALL ON DATABASE postgres TO omm; + GRANT ALL ON test_table TO omm; + ``` + +--- + +## 调试与故障排除 + +### 1. 部署失败 + +- **检查 Docker 日志**: + + ```bash + docker logs dn_6001 + ``` + +- **网络问题**:确保子网 `172.11.0.0/24` 未被占用,检查 `net_700` 网络状态: + + ```bash + docker network inspect net_700 + ``` + +- **密码复杂性**:确保 `GS_PASSWORD` 符合要求(8+ 字符,包含大小写、数字、特殊字符)。 + +### 2. 
测试脚本失败
+
+- **连接错误**:
+  - 检查 `pg_hba.conf` 是否允许客户端 IP 连接:
+
+    ```plaintext
+    host all all 0.0.0.0/0 md5
+    ```
+
+  - 验证 DSN 参数是否正确(参见文末的 DSN 连通性检查示例)。
diff --git a/example/demo.py b/example/demo.py
old mode 100644
new mode 100755
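
If the test script fails with connection errors, the DSN can also be verified node by node with the same `gaussdb` API that `example/cluster_ha_showcase.py` already uses (`connect`, `Error`, and `pg_is_in_recovery()`). The sketch below is illustrative only and is not part of the patch above; the file name `dsn_check.py` is hypothetical.

```python
# dsn_check.py -- illustrative helper (not part of this patch): probe every
# host/port pair of a multi-host DSN and report whether it answers and in
# which role, using the same driver calls as cluster_ha_showcase.py.
import sys

from gaussdb import Error, connect


def check_dsn(dsn: str) -> None:
    """Try each host/port pair in a multi-host DSN and report its role."""
    params = dict(part.split("=", 1) for part in dsn.split())
    hosts = params["host"].split(",")
    ports = params["port"].split(",")
    for host, port in zip(hosts, ports):
        node_dsn = (
            f"host={host} port={port} user={params['user']} "
            f"password={params['password']} dbname={params['dbname']}"
        )
        try:
            with connect(node_dsn, connect_timeout=5) as conn:
                with conn.cursor() as cur:
                    cur.execute("SELECT pg_is_in_recovery()")
                    in_recovery = cur.fetchone()[0]
                role = "Standby" if in_recovery else "Primary"
                print(f"{host}:{port} reachable, role: {role}")
        except Error as e:
            print(f"{host}:{port} NOT reachable: {e}")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print('Usage: python3 dsn_check.py "$DSN"')
        sys.exit(1)
    check_dsn(sys.argv[1])
```

Run it as `python3 dsn_check.py "$DSN"` with the same `DSN` exported in the README; a node that is reachable but reports `Standby` points at a role or failover issue rather than a connectivity or `pg_hba.conf` problem.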