diff --git a/docs_user/modules/proc_adopting-compute-services-to-the-data-plane.adoc b/docs_user/modules/proc_adopting-compute-services-to-the-data-plane.adoc index 4368f8181..0d55565ae 100644 --- a/docs_user/modules/proc_adopting-compute-services-to-the-data-plane.adoc +++ b/docs_user/modules/proc_adopting-compute-services-to-the-data-plane.adoc @@ -826,6 +826,40 @@ $ for CELL in $(echo $RENAMED_CELLS); do > done ---- +. Clean up any {networking_first_ref} agents that are no longer running: ++ +---- +$ oc exec openstackclient -- openstack network agent list +---- +.. If any agent in the list shows `XXX` in the `Alive` field, verify the Host and Agent Type. If the function of this agent is no longer required and the agent has been permanently stopped on the {rhos_prev_long} host, delete the agent: ++ +---- +$ oc exec openstackclient -- openstack network agent delete <agent_id> +---- ++ +* Replace `<agent_id>` with the ID of the agent to delete. ++ +[IMPORTANT] +==== +During {rhos_prev_long} to {rhos_acro} adoption, the `tripleo_cleanup` playbook stops {rhos_prev_long} services on Compute nodes, including `neutron-ovn-metadata-agent` and `ovn-controller`. However, these agents do not unregister themselves from the {networking_service}; they stop sending heartbeats. When the new {rhos_acro} deployment creates containerized agents, these agents might register as new agents with different chassis IDs or hostnames rather than updating the old {rhos_prev_long} agent records. This leaves orphaned "dead" agents in the {networking_service} database that show as "Alive: XXX". 
+ +If you encounter OVN Metadata agents or OVN Controller agents in the "XXX" state after adoption, you must delete these stale agents: + +---- +$ for AGENT_ID in $(oc exec openstackclient -- openstack network agent list | grep 'neutron-ovn-metadata-agent' | grep 'XXX' | awk '{print $2}'); do + echo "Deleting stale metadata agent: ${AGENT_ID}" + oc exec openstackclient -- openstack network agent delete ${AGENT_ID} +done + +$ for AGENT_ID in $(oc exec openstackclient -- openstack network agent list | grep 'OVN Controller agent' | grep -v 'Gateway' | grep 'XXX' | awk '{print $2}'); do + echo "Deleting stale OVN controller agent: ${AGENT_ID}" + oc exec openstackclient -- openstack network agent delete ${AGENT_ID} +done +---- + +This issue is more common in deployments that use site-specific naming patterns and unique OVN chassis IDs that differ from other node sets. However, this issue can affect any {rhos_prev_long} to {rhos_acro} adoption where the new deployment does not match the old hostnames or chassis IDs. +==== + . Verify that the {networking_first_ref} agents are running: + ---- diff --git a/docs_user/modules/proc_adopting-networker-services-to-the-data-plane.adoc b/docs_user/modules/proc_adopting-networker-services-to-the-data-plane.adoc index 823606d11..533b5df9f 100644 --- a/docs_user/modules/proc_adopting-networker-services-to-the-data-plane.adoc +++ b/docs_user/modules/proc_adopting-networker-services-to-the-data-plane.adoc @@ -367,9 +367,30 @@ $ oc exec openstackclient -- openstack network agent list .. If any agent in the list shows `XXX` in the `Alive` field, verify the Host and Agent Type, if the functions of this agent is no longer required, and the agent has been permanently stopped on the {rhos_prev_long} host. 
Then, delete the agent: + ---- -$ oc exec openstackclient -- openstack network agent <agent_id> +$ oc exec openstackclient -- openstack network agent delete <agent_id> ---- * Replace `<agent_id>` with the ID of the agent to delete, for example, `856960f0-5530-46c7-a331-6eadcba362da`. ++ +[IMPORTANT] +==== +During {rhos_prev_long} to {rhos_acro} adoption, the `tripleo_cleanup` playbook stops {rhos_prev_long} services on Compute nodes, including `neutron-ovn-metadata-agent` and `ovn-controller`. However, these agents do not unregister themselves from {networking_service}; they stop sending heartbeats. When the new {rhos_acro} deployment creates containerized agents, these agents might register as new agents with different chassis IDs or hostnames rather than updating the old {rhos_prev_long} agent records. This leaves orphaned "dead" agents in the {networking_service} database that show as "Alive: XXX". + +If you encounter OVN Metadata agents or OVN Controller agents in the "XXX" state after adoption, you must delete these stale agents: + +---- +$ for AGENT_ID in $(oc exec openstackclient -- openstack network agent list | grep 'neutron-ovn-metadata-agent' | grep 'XXX' | awk '{print $2}'); do + echo "Deleting stale metadata agent: ${AGENT_ID}" + oc exec openstackclient -- openstack network agent delete ${AGENT_ID} +done + +$ for AGENT_ID in $(oc exec openstackclient -- openstack network agent list | grep 'OVN Controller agent' | grep -v 'Gateway' | grep 'XXX' | awk '{print $2}'); do + echo "Deleting stale OVN controller agent: ${AGENT_ID}" + oc exec openstackclient -- openstack network agent delete ${AGENT_ID} +done +---- + +This issue is more common in deployments that use site-specific naming patterns and unique OVN chassis IDs that differ from other node sets. However, this issue can affect any {rhos_prev_long} to {rhos_acro} adoption where the new deployment does not match the old hostnames or chassis IDs. 
+==== .Verification diff --git a/tests/roles/dataplane_adoption/tasks/neutron_agents_cleanup.yaml b/tests/roles/dataplane_adoption/tasks/neutron_agents_cleanup.yaml index 8a2b04032..7b1c979a8 100644 --- a/tests/roles/dataplane_adoption/tasks/neutron_agents_cleanup.yaml +++ b/tests/roles/dataplane_adoption/tasks/neutron_agents_cleanup.yaml @@ -25,3 +25,33 @@ ${BASH_ALIASES[openstack]} network agent delete ${AGENT_ID} fi loop: "{{ neutron_ovn_controller_gateway_agent_cleanup_hosts | default([]) }}" + +- name: delete stale OVN Metadata agents from TripleO + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + alias openstack="oc exec -t openstackclient -- openstack" + + # Delete metadata agents that are not alive (stopped during TripleO cleanup) + # This handles cases where the new EDPM deployment creates new agent records + # instead of reusing the old TripleO ones (different hostnames/chassis IDs) + for AGENT_ID in $(${BASH_ALIASES[openstack]} network agent list | grep 'neutron-ovn-metadata-agent' | grep 'XXX' | awk '{print $2}'); do + echo "Deleting stale metadata agent: ${AGENT_ID}" + ${BASH_ALIASES[openstack]} network agent delete ${AGENT_ID} || true + done + +- name: delete stale OVN Controller agents from TripleO + ansible.builtin.shell: | + {{ shell_header }} + {{ oc_header }} + alias openstack="oc exec -t openstackclient -- openstack" + + # Delete OVN controller agents that are not alive (stopped during TripleO cleanup) + # This handles cases where the new EDPM deployment creates new agent records + # instead of reusing the old TripleO ones (different hostnames/chassis IDs) + # Note: grep -v 'Gateway' excludes OVN Controller Gateway agents which are + # handled by the dedicated task above + for AGENT_ID in $(${BASH_ALIASES[openstack]} network agent list | grep 'OVN Controller agent' | grep -v 'Gateway' | grep 'XXX' | awk '{print $2}'); do + echo "Deleting stale OVN controller agent: ${AGENT_ID}" + ${BASH_ALIASES[openstack]} network agent delete 
${AGENT_ID} || true + done