|
54 | 54 | run_once: true |
55 | 55 | changed_when: true |
56 | 56 |
|
57 | | - - name: Check if etcd is running on leader node |
58 | | - ansible.builtin.command: podman ps |
| 57 | + - name: Disable etcd resource on leader node |
59 | 58 | delegate_to: "{{ leader_node }}" |
60 | | - register: leader_etcd_status |
61 | | - changed_when: false |
62 | | - run_once: true |
63 | | - failed_when: false |
64 | | - |
65 | | - - name: Determine recovery scenario |
66 | | - ansible.builtin.set_fact: |
67 | | - leader_has_etcd: "{{ 'etcd' in leader_etcd_status.stdout }}" |
68 | 59 | run_once: true |
69 | | - |
70 | | - - name: Handle scenario where no etcd is running on leader |
71 | | - when: not leader_has_etcd |
72 | 60 | block: |
73 | | - - name: Take etcd snapshot on both nodes |
74 | | - ansible.builtin.copy: |
75 | | - src: "/var/lib/etcd/member/snap/db" |
76 | | - dest: "{{ snapshot_dir }}/{{ snapshot_name }}" |
77 | | - remote_src: true |
78 | | - owner: core |
79 | | - group: core |
80 | | - mode: '0644' |
| 61 | + - name: Disable etcd resource |
| 62 | + ansible.builtin.command: pcs resource disable etcd |
| 63 | + changed_when: true |
81 | 64 |
|
82 | | - - name: Clean up old snapshots (keep last {{ snapshot_retention_count }}) |
| 65 | + - name: Wait for etcd to stop |
83 | 66 | ansible.builtin.shell: | |
84 | | - ls -1t {{ snapshot_dir }}/etcd-snapshot-*.db 2>/dev/null | tail -n +{{ snapshot_retention_count + 1 }} | xargs -r rm -f |
85 | | - args: |
86 | | - executable: /bin/bash |
87 | | - changed_when: true |
| 67 | + pcs status resources | grep etcd -A 1 | grep -E 'Started|Stopping' |
| 68 | + register: etcd_stopping |
| 69 | + changed_when: false |
88 | 70 | failed_when: false |
| 71 | + until: etcd_stopping.rc != 0 |
| 72 | + retries: 60 |
| 73 | + delay: 5 |
89 | 74 |
|
90 | | - - name: Display snapshot location |
| 75 | + - name: Display etcd stopped confirmation |
91 | 76 | ansible.builtin.debug: |
92 | | - msg: "✓ etcd snapshot saved on {{ inventory_hostname }} to: {{ snapshot_dir }}/{{ snapshot_name }}" |
| 77 | + msg: "Etcd resource is now stopped." |
93 | 78 |
|
94 | 79 | - name: Clear CIB attributes on all nodes |
95 | 80 | block: |
|
186 | 171 | Unexpected force_new_cluster attribute on {{ follower_hostname }} |
187 | 172 | Output: {{ follower_reboot_attrs.stdout }} |
188 | 173 |
|
189 | | - - name: Remove follower from etcd member list |
190 | | - delegate_to: "{{ leader_node }}" |
191 | | - run_once: true |
192 | | - when: leader_has_etcd |
193 | | - block: |
194 | | - - name: Get etcd member list |
195 | | - ansible.builtin.command: podman exec etcd etcdctl member list |
196 | | - register: etcd_member_list |
197 | | - changed_when: false |
198 | | - |
199 | | - - name: Extract follower member ID by hostname |
200 | | - ansible.builtin.set_fact: |
201 | | - follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', follower_hostname) | first | split(','))[0] | default('') }}" |
202 | | - when: follower_hostname in etcd_member_list.stdout |
203 | | - |
204 | | - - name: Extract follower member ID by unstarted state (fallback) |
205 | | - ansible.builtin.set_fact: |
206 | | - follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', 'unstarted') | first | split(','))[0] | default('') }}" |
207 | | - when: |
208 | | - - follower_hostname not in etcd_member_list.stdout |
209 | | - - "'unstarted' in etcd_member_list.stdout" |
210 | | - |
211 | | - - name: Display etcd member list if follower not found |
212 | | - ansible.builtin.debug: |
213 | | - msg: | |
214 | | - Could not find follower {{ follower_hostname }} in etcd member list. Nothing to do. |
215 | | - Member list: |
216 | | - {{ etcd_member_list.stdout }} |
217 | | - when: follower_member_id is not defined or follower_member_id == '' |
218 | | - |
219 | | - - name: Remove follower from etcd cluster |
220 | | - ansible.builtin.command: podman exec etcd etcdctl member remove {{ follower_member_id }} |
221 | | - when: |
222 | | - - follower_member_id is defined |
223 | | - - follower_member_id != '' |
224 | | - changed_when: true |
225 | | - |
226 | | - - name: Display removal confirmation |
227 | | - ansible.builtin.debug: |
228 | | - msg: "Removing follower member ID: {{ follower_member_id }} ({{ follower_hostname }})" |
229 | | - when: |
230 | | - - follower_member_id is defined |
231 | | - - follower_member_id != '' |
232 | | - |
233 | | - - name: Cleanup etcd resource on leader node |
234 | | - ansible.builtin.command: pcs resource cleanup etcd |
| 174 | + - name: Enable etcd resource on leader node |
| 175 | + ansible.builtin.command: pcs resource enable etcd |
235 | 176 | delegate_to: "{{ leader_node }}" |
236 | 177 | run_once: true |
237 | 178 | changed_when: true |
238 | 179 |
|
239 | | - - name: Cleanup etcd resource on follower node |
| 180 | + - name: Cleanup etcd resource to restore the cluster |
240 | 181 | ansible.builtin.command: pcs resource cleanup etcd |
241 | | - delegate_to: "{{ follower_node }}" |
242 | | - run_once: true |
243 | | - changed_when: true |
244 | | - |
245 | | - - name: Wait for etcd to potentially start (no-etcd scenario) |
246 | | - ansible.builtin.pause: |
247 | | - seconds: 10 |
248 | | - when: not leader_has_etcd |
249 | | - run_once: true |
250 | | - |
251 | | - - name: Re-check etcd status after cleanup (no-etcd scenario) |
252 | | - ansible.builtin.command: podman ps |
253 | 182 | delegate_to: "{{ leader_node }}" |
254 | | - register: leader_etcd_recheck |
255 | | - changed_when: false |
256 | | - run_once: true |
257 | | - when: not leader_has_etcd |
258 | | - |
259 | | - - name: Display etcd recovery status |
260 | | - ansible.builtin.debug: |
261 | | - msg: | |
262 | | - {% if not leader_has_etcd %} |
263 | | - {% if 'etcd' in leader_etcd_recheck.stdout %} |
264 | | - ✓ Leader etcd is now running after cleanup. |
265 | | - {% else %} |
266 | | - ⚠ Leader etcd is still not running after cleanup. Manual intervention may be required. |
267 | | - CIB attributes have been set for force-new-cluster on {{ leader_hostname }} |
268 | | - {% endif %} |
269 | | - {% else %} |
270 | | - ✓ All force-new-cluster operations completed successfully. |
271 | | - {% endif %} |
272 | 183 | run_once: true |
| 184 | + changed_when: true |
273 | 185 |
|
274 | 186 | - name: Re-enable stonith on leader node |
275 | 187 | ansible.builtin.command: pcs property set stonith-enabled=true |
|
287 | 199 | post_tasks: |
288 | 200 | - name: Display completion message |
289 | 201 | ansible.builtin.debug: |
290 | | - msg: "✓ Force new cluster operation completed. All tests passed." |
| 202 | + msg: "✓ Force new cluster operation completed. Etcd cluster recovery initiated." |
291 | 203 | run_once: true |
292 | | - when: leader_has_etcd |
0 commit comments