From f2a597a3d15050cc90d92aaca0fa4283e20ec4a8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 9 Dec 2025 13:55:22 +0000 Subject: [PATCH 1/4] Initial plan From 667c40dacb3882c4cecc5b1c1a502664d04cdaeb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:00:58 +0000 Subject: [PATCH 2/4] Add async/poll support for pg_basebackup with configurable timeout Co-authored-by: haroon-github <10290607+haroon-github@users.noreply.github.com> --- roles/postgres/replica/final/defaults/main.yml | 14 ++++++++++++++ roles/postgres/replica/final/tasks/clone.yml | 18 ++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 roles/postgres/replica/final/defaults/main.yml diff --git a/roles/postgres/replica/final/defaults/main.yml b/roles/postgres/replica/final/defaults/main.yml new file mode 100644 index 000000000..42d43735f --- /dev/null +++ b/roles/postgres/replica/final/defaults/main.yml @@ -0,0 +1,14 @@ +--- + +# © Copyright EnterpriseDB UK Limited 2015-2025 - All rights reserved. + +## pg_basebackup timeout for cloning replicas +# +# For large databases, pg_basebackup can take multiple hours to complete. +# This timeout value (in seconds) determines how long to wait for the +# pg_basebackup operation to complete. +# +# Default: 14400 seconds (4 hours) +# Set to 0 for no timeout (not recommended) + +pg_basebackup_timeout: 14400 diff --git a/roles/postgres/replica/final/tasks/clone.yml b/roles/postgres/replica/final/tasks/clone.yml index d990d02b2..97fc320dc 100644 --- a/roles/postgres/replica/final/tasks/clone.yml +++ b/roles/postgres/replica/final/tasks/clone.yml @@ -39,9 +39,23 @@ extra_option: "{{ waldir_option if pg_wal_dir_outside_pgdata else '' }}" become_user: "{{ postgres_user }}" become: true - register: this + async: "{{ pg_basebackup_timeout }}" + poll: 0 + register: pg_basebackup_job + when: + task_selector|selects('postgres', 'replica') + +- name: Wait for pg_basebackup to complete + async_status: + jid: "{{ pg_basebackup_job.ansible_job_id }}" + become_user: "{{ postgres_user }}" + become: true + register: pg_basebackup_result + until: pg_basebackup_result.finished + retries: "{{ (pg_basebackup_timeout / 10) | int }}" + delay: 10 failed_when: - this.rc != 0 or 'error' in this.stderr + pg_basebackup_result.rc != 0 or 'error' in pg_basebackup_result.stderr when: task_selector|selects('postgres', 'replica') From 3b62ce8a85e382af7d808fe3c742425dc30ffb4f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:07:27 +0000 Subject: [PATCH 3/4] Fix async_status error handling and add minimum retry count Co-authored-by: haroon-github <10290607+haroon-github@users.noreply.github.com> --- roles/postgres/replica/final/defaults/main.yml | 2 +- roles/postgres/replica/final/tasks/clone.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/roles/postgres/replica/final/defaults/main.yml b/roles/postgres/replica/final/defaults/main.yml index 42d43735f..2c3b8edbb 100644 --- a/roles/postgres/replica/final/defaults/main.yml +++ b/roles/postgres/replica/final/defaults/main.yml @@ -9,6 +9,6 @@ # pg_basebackup operation to complete. # # Default: 14400 seconds (4 hours) -# Set to 0 for no timeout (not recommended) +# For very large databases, increase this value as needed (e.g., 28800 for 8 hours) pg_basebackup_timeout: 14400 diff --git a/roles/postgres/replica/final/tasks/clone.yml b/roles/postgres/replica/final/tasks/clone.yml index 97fc320dc..147cbc4b8 100644 --- a/roles/postgres/replica/final/tasks/clone.yml +++ b/roles/postgres/replica/final/tasks/clone.yml @@ -52,10 +52,10 @@ become: true register: pg_basebackup_result until: pg_basebackup_result.finished - retries: "{{ (pg_basebackup_timeout / 10) | int }}" + retries: "{{ [(pg_basebackup_timeout / 10) | int, 1] | max }}" delay: 10 failed_when: - pg_basebackup_result.rc != 0 or 'error' in pg_basebackup_result.stderr + pg_basebackup_result.rc != 0 or 'error' in (pg_basebackup_result.stderr | default('')) when: task_selector|selects('postgres', 'replica') From f5f891ca44d18a11a08f19a7122a0734c88fc811 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:08:37 +0000 Subject: [PATCH 4/4] Add safety check for pg_basebackup_job variable Co-authored-by: haroon-github <10290607+haroon-github@users.noreply.github.com> --- roles/postgres/replica/final/tasks/clone.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/roles/postgres/replica/final/tasks/clone.yml b/roles/postgres/replica/final/tasks/clone.yml index 147cbc4b8..db82e5e5b 100644 --- a/roles/postgres/replica/final/tasks/clone.yml +++ b/roles/postgres/replica/final/tasks/clone.yml @@ -57,7 +57,8 @@ failed_when: pg_basebackup_result.rc != 0 or 'error' in (pg_basebackup_result.stderr | default('')) when: - task_selector|selects('postgres', 'replica') + - task_selector|selects('postgres', 'replica') + - pg_basebackup_job is defined # If we are cloning an instance with postgres_conf_dir separated from # postgres_data_dir, we copy its configuration files to the replica's