Skip to content

Commit a4c5145

Browse files
committed
Update force-new-cluster helper
1 parent 0f709dd commit a4c5145

File tree

1 file changed

+18
-107
lines changed

1 file changed

+18
-107
lines changed

helpers/force-new-cluster.yml

Lines changed: 18 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -54,42 +54,27 @@
5454
run_once: true
5555
changed_when: true
5656

57-
- name: Check if etcd is running on leader node
58-
ansible.builtin.command: podman ps
57+
- name: Disable etcd resource on leader node
5958
delegate_to: "{{ leader_node }}"
60-
register: leader_etcd_status
61-
changed_when: false
62-
run_once: true
63-
failed_when: false
64-
65-
- name: Determine recovery scenario
66-
ansible.builtin.set_fact:
67-
leader_has_etcd: "{{ 'etcd' in leader_etcd_status.stdout }}"
6859
run_once: true
69-
70-
- name: Handle scenario where no etcd is running on leader
71-
when: not leader_has_etcd
7260
block:
73-
- name: Take etcd snapshot on both nodes
74-
ansible.builtin.copy:
75-
src: "/var/lib/etcd/member/snap/db"
76-
dest: "{{ snapshot_dir }}/{{ snapshot_name }}"
77-
remote_src: true
78-
owner: core
79-
group: core
80-
mode: '0644'
61+
- name: Disable etcd resource
62+
ansible.builtin.command: pcs resource disable etcd
63+
changed_when: true
8164

82-
- name: Clean up old snapshots (keep last {{ snapshot_retention_count }})
65+
- name: Wait for etcd to stop
8366
ansible.builtin.shell: |
84-
ls -1t {{ snapshot_dir }}/etcd-snapshot-*.db 2>/dev/null | tail -n +{{ snapshot_retention_count + 1 }} | xargs -r rm -f
85-
args:
86-
executable: /bin/bash
87-
changed_when: true
67+
pcs status resources | grep etcd -A 1 | grep -E 'Started|Stopping'
68+
register: etcd_stopping
69+
changed_when: false
8870
failed_when: false
71+
until: etcd_stopping.rc != 0
72+
retries: 60
73+
delay: 5
8974

90-
- name: Display snapshot location
75+
- name: Display etcd stopped confirmation
9176
ansible.builtin.debug:
92-
msg: "✓ etcd snapshot saved on {{ inventory_hostname }} to: {{ snapshot_dir }}/{{ snapshot_name }}"
77+
msg: "Etcd resource is now stopped."
9378

9479
- name: Clear CIB attributes on all nodes
9580
block:
@@ -186,90 +171,17 @@
186171
Unexpected force_new_cluster attribute on {{ follower_hostname }}
187172
Output: {{ follower_reboot_attrs.stdout }}
188173
189-
- name: Remove follower from etcd member list
190-
delegate_to: "{{ leader_node }}"
191-
run_once: true
192-
when: leader_has_etcd
193-
block:
194-
- name: Get etcd member list
195-
ansible.builtin.command: podman exec etcd etcdctl member list
196-
register: etcd_member_list
197-
changed_when: false
198-
199-
- name: Extract follower member ID by hostname
200-
ansible.builtin.set_fact:
201-
follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', follower_hostname) | first | split(','))[0] | default('') }}"
202-
when: follower_hostname in etcd_member_list.stdout
203-
204-
- name: Extract follower member ID by unstarted state (fallback)
205-
ansible.builtin.set_fact:
206-
follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', 'unstarted') | first | split(','))[0] | default('') }}"
207-
when:
208-
- follower_hostname not in etcd_member_list.stdout
209-
- "'unstarted' in etcd_member_list.stdout"
210-
211-
- name: Display etcd member list if follower not found
212-
ansible.builtin.debug:
213-
msg: |
214-
Could not find follower {{ follower_hostname }} in etcd member list. Nothing to do.
215-
Member list:
216-
{{ etcd_member_list.stdout }}
217-
when: follower_member_id is not defined or follower_member_id == ''
218-
219-
- name: Remove follower from etcd cluster
220-
ansible.builtin.command: podman exec etcd etcdctl member remove {{ follower_member_id }}
221-
when:
222-
- follower_member_id is defined
223-
- follower_member_id != ''
224-
changed_when: true
225-
226-
- name: Display removal confirmation
227-
ansible.builtin.debug:
228-
msg: "Removing follower member ID: {{ follower_member_id }} ({{ follower_hostname }})"
229-
when:
230-
- follower_member_id is defined
231-
- follower_member_id != ''
232-
233-
- name: Cleanup etcd resource on leader node
234-
ansible.builtin.command: pcs resource cleanup etcd
174+
- name: Enable etcd resource on leader node
175+
ansible.builtin.command: pcs resource enable etcd
235176
delegate_to: "{{ leader_node }}"
236177
run_once: true
237178
changed_when: true
238179

239-
- name: Cleanup etcd resource on follower node
180+
- name: Cleanup etcd resource to restore the cluster
240181
ansible.builtin.command: pcs resource cleanup etcd
241-
delegate_to: "{{ follower_node }}"
242-
run_once: true
243-
changed_when: true
244-
245-
- name: Wait for etcd to potentially start (no-etcd scenario)
246-
ansible.builtin.pause:
247-
seconds: 10
248-
when: not leader_has_etcd
249-
run_once: true
250-
251-
- name: Re-check etcd status after cleanup (no-etcd scenario)
252-
ansible.builtin.command: podman ps
253182
delegate_to: "{{ leader_node }}"
254-
register: leader_etcd_recheck
255-
changed_when: false
256-
run_once: true
257-
when: not leader_has_etcd
258-
259-
- name: Display etcd recovery status
260-
ansible.builtin.debug:
261-
msg: |
262-
{% if not leader_has_etcd %}
263-
{% if 'etcd' in leader_etcd_recheck.stdout %}
264-
✓ Leader etcd is now running after cleanup.
265-
{% else %}
266-
⚠ Leader etcd is still not running after cleanup. Manual intervention may be required.
267-
CIB attributes have been set for force-new-cluster on {{ leader_hostname }}
268-
{% endif %}
269-
{% else %}
270-
✓ All force-new-cluster operations completed successfully.
271-
{% endif %}
272183
run_once: true
184+
changed_when: true
273185

274186
- name: Re-enable stonith on leader node
275187
ansible.builtin.command: pcs property set stonith-enabled=true
@@ -287,6 +199,5 @@
287199
post_tasks:
288200
- name: Display completion message
289201
ansible.builtin.debug:
290-
msg: "✓ Force new cluster operation completed. All tests passed."
202+
msg: "✓ Force new cluster operation completed. Etcd cluster recovery initiated."
291203
run_once: true
292-
when: leader_has_etcd

0 commit comments

Comments
 (0)